{"library":"curated-tokenizers","type":"library","category":null,"description":"Curated Tokenizers is a lightweight Python library by Explosion (the creators of spaCy) that provides efficient, production-ready implementations of several subword (piece) tokenization algorithms, including Byte-Pair Encoding (BPE), WordPiece, and SentencePiece. It focuses on fast, reliable tokenization suitable for integration into larger NLP pipelines. The library is currently at version 2.0.0 and is actively maintained, with infrequent releases focused on performance and stability.","language":"python","status":"active","version":"2.0.0","tags":["tokenization","nlp","byte-pair-encoding","wordpiece","sentencepiece","explosion","spacy"],"last_verified":"Fri May 22","install":[{"cmd":"pip install curated-tokenizers","imports":["from curated_tokenizers import ByteBPEProcessor","from curated_tokenizers import WordPieceProcessor","from curated_tokenizers import SentencePieceProcessor"]},{"cmd":"pip install curated-tokenizers[sentencepiece]","imports":[]}],"homepage":null,"github":"https://github.com/explosion/curated-tokenizers","docs":null,"changelog":null,"pypi":"https://pypi.org/project/curated-tokenizers/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.9–3.10","success_rate":40,"avg_install_s":2.2,"avg_import_s":0.12,"wheel_type":"wheel"},"url":"https://checklist.day/v1/registry/curated-tokenizers/compatibility"}}