{"id":24787,"library":"uniseg","title":"uniseg","description":"The uniseg library determines Unicode text segmentation boundaries, such as grapheme clusters, words, sentences, and line break opportunities, following Unicode Standard Annex #29 (UAX #29) and UAX #14. The current version is 0.10.1; it requires Python >=3.9 and is released with no fixed cadence.","status":"active","version":"0.10.1","language":"python","source_language":"en","source_url":"https://github.com/knslbrn/uniseg","tags":["unicode","segmentation","grapheme","word-boundary","line-break","text-processing"],"install":[{"cmd":"pip install uniseg","lang":"bash","label":"Install from PyPI"}],"dependencies":[],"imports":[{"note":"","wrong":"","symbol":"grapheme_clusters","correct":"from uniseg import grapheme_clusters"},{"note":"","wrong":"","symbol":"word_segment","correct":"from uniseg import word_segment"},{"note":"","wrong":"","symbol":"sentences","correct":"from uniseg import sentences"},{"note":"","wrong":"","symbol":"line_break","correct":"from uniseg import line_break"},{"note":"","wrong":"","symbol":"GraphemeCluster","correct":"from uniseg import GraphemeCluster"}],"quickstart":{"code":"from uniseg import grapheme_clusters, word_segment, sentences, line_break\n\ntext = \"Hello World! 🌍\"\nprint(\"Grapheme clusters:\", list(grapheme_clusters(text)))\nprint(\"Words:\", list(word_segment(text)))\nprint(\"Sentences:\", list(sentences(text)))\nprint(\"Line breaks:\", list(line_break(text)))","lang":"python","description":"Basic usage: iterate over Unicode segment boundaries."},"warnings":[{"fix":"Wrap calls in list() if you need to index or reuse results.","message":"Functions return iterators, not lists. Call list() to inspect or store.","severity":"gotcha","affected_versions":"all"},{"fix":"Check uniseg.UNICODE_VERSION for the Unicode version used.","message":"Grapheme cluster and word segmentation depend on the Unicode version bundled with the library. Ensure system Unicode data is not mixed.","severity":"gotcha","affected_versions":"all"},{"fix":"Filter results if only alphanumeric words are needed.","message":"The word_segment function returns segments as strings including punctuation and spaces. Do not assume it returns only words.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Use list() to convert: list(grapheme_clusters(text))[0]","cause":"Calling index or slice on the iterator returned by segmentation functions.","error":"TypeError: 'generator' object is not subscriptable"},{"fix":"Upgrade to 0.10.1 via pip install --upgrade uniseg. Check import is exact: from uniseg import grapheme_clusters","cause":"Using an older version of uniseg that might have different function names or import paths.","error":"AttributeError: module 'uniseg' has no attribute 'grapheme_clusters'"},{"fix":"Set environment variable PYTHONIOENCODING=utf-8, or encode output manually.","cause":"Printing Unicode characters to a terminal that does not support UTF-8.","error":"UnicodeEncodeError: 'charmap' codec can't encode character"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}