{"library":"unstructured","type":"library","category":null,"description":"Unstructured is an open-source Python library designed to simplify the ingestion and preprocessing of diverse unstructured data formats, including PDFs, HTML, Word documents, and images. It provides modular functions for partitioning, cleaning, and staging data, primarily optimizing data workflows for Large Language Models (LLMs). The library is actively maintained with frequent releases, currently at version 0.22.18.","language":"python","status":"active","version":"0.22.18","tags":["NLP","document processing","information extraction","OCR","LLM tooling","data preprocessing"],"last_verified":"Wed May 20","install":[{"cmd":"pip install unstructured","imports":["from unstructured.partition.auto import partition","from unstructured.partition.pdf import partition_pdf","from unstructured.staging.base import elements_to_json"]},{"cmd":"pip install \"unstructured[all-docs]\"","imports":[]},{"cmd":"pip install \"unstructured[pdf,docx]\"","imports":[]}],"homepage":"https://unstructured.io","github":"https://github.com/Unstructured-IO/unstructured","docs":null,"changelog":null,"pypi":"https://pypi.org/project/unstructured/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":22,"avg_install_s":46.6,"avg_import_s":2.39,"wheel_type":"sdist"},"url":"https://checklist.day/v1/registry/unstructured/compatibility"}}