{"id":7344,"library":"konoha","title":"Konoha: Japanese Tokenizer Wrapper","description":"Konoha is a Python library (v5.7.0) that provides a unified, easy-to-use interface for various Japanese tokenizers, including MeCab, Sudachi, and Sentencepiece. It allows developers to seamlessly switch between different tokenizers and also offers rule-based tokenizers (whitespace, character) and a sentence splitter. The library is actively maintained with its latest release in March 2026.","status":"active","version":"5.7.0","language":"en","source_language":"en","source_url":"https://github.com/himkt/konoha","tags":["japanese","nlp","tokenization","tokenizer","text-processing"],"install":[{"cmd":"pip install 'konoha[all]'","lang":"bash","label":"Recommended: Install with all supported tokenizers"},{"cmd":"pip install 'konoha[mecab]'","lang":"bash","label":"Install with a specific tokenizer (e.g., MeCab)"},{"cmd":"pip install konoha","lang":"bash","label":"Minimal install (only sentence splitter)"}],"dependencies":[{"reason":"Required for MeCab tokenizer functionality.","package":"mecab-python3","optional":true},{"reason":"Required for Sudachi tokenizer functionality.","package":"sudachipy","optional":true},{"reason":"Required for Sudachi tokenizer functionality.","package":"sudachidict_core","optional":true},{"reason":"Required for Janome tokenizer functionality.","package":"janome","optional":true},{"reason":"Required for Sentencepiece tokenizer functionality.","package":"sentencepiece","optional":true},{"reason":"Required for Nagisa tokenizer functionality.","package":"nagisa","optional":true},{"reason":"Required for AllenNLP integration via 'all_with_integrations' extra.","package":"allennlp","optional":true}],"imports":[{"symbol":"WordTokenizer","correct":"from konoha import WordTokenizer"},{"symbol":"SentenceTokenizer","correct":"from konoha import SentenceTokenizer"}],"quickstart":{"code":"from konoha import WordTokenizer\n\nsentence = '自然言語処理を勉強しています'\n\n# Initialize with a supported tokenizer (e.g., MeCab)\n# Ensure 'konoha[mecab]' or 'konoha[all]' is installed\ntokenizer = WordTokenizer('MeCab')\n\ntokens = tokenizer.tokenize(sentence)\nprint([token.surface for token in tokens])","lang":"python","description":"Demonstrates basic word-level tokenization of a Japanese sentence using the `WordTokenizer` with MeCab. Ensure the necessary tokenizer is installed as an extra."},"warnings":[{"fix":"Use `pip install 'konoha[all]'` or `pip install 'konoha[<tokenizer_name>]'` to include desired tokenizer dependencies.","message":"Installing `konoha` without specifying extras (e.g., `pip install konoha`) will only install the sentence splitter, not any word tokenizers. To use tokenizers like MeCab or Sudachi, you must install `konoha` with the corresponding extra (e.g., `konoha[mecab]`) or `konoha[all]` for all supported tokenizers.","severity":"gotcha","affected_versions":"All versions"},{"fix":"Refer to the latest documentation or GitHub README for updated API paths when using the Konoha Docker image. For example, check release notes at `https://github.com/himkt/konoha/releases/tag/v4.6.4`.","message":"The API endpoint paths for the Docker quickstart (e.g., `/api/v1/tokenize`) changed in v4.6.4. Older `curl` commands or Docker configurations might fail.","severity":"breaking","affected_versions":">=4.6.4"},{"fix":"Always pass the `model_path` argument when initializing `WordTokenizer('Sentencepiece', model_path=\"your/model.spm\")`.","message":"When using the `Sentencepiece` tokenizer, you must provide a valid `model_path` argument to `WordTokenizer`. Omitting it will result in an error or unexpected behavior.","severity":"gotcha","affected_versions":"All versions"}],"env_vars":null,"last_verified":"2026-04-16T00:00:00.000Z","next_check":"2026-07-15T00:00:00.000Z","problems":[{"fix":"Install konoha with the mecab extra: `pip install 'konoha[mecab]'` or `pip install 'konoha[all]'`.","cause":"The MeCab tokenizer dependency (mecab-python3) was not installed along with konoha.","error":"ModuleNotFoundError: No module named 'MeCab'"},{"fix":"Install konoha with the sudachi extra: `pip install 'konoha[sudachi]'` or `pip install 'konoha[all]'`.","cause":"The Sudachi tokenizer dependency (sudachipy and sudachidict_core) was not installed.","error":"RuntimeError: 'sudachipy' is not installed. Please install it with 'pip install sudachipy'"},{"fix":"Provide the path to your Sentencepiece model file: `WordTokenizer('Sentencepiece', model_path=\"path/to/your/model.spm\")`.","cause":"Attempted to initialize `WordTokenizer('Sentencepiece')` without providing the `model_path` argument.","error":"TypeError: __init__() missing 1 required positional argument: 'model_path'"}]}