{"library":"textract","type":"library","category":null,"description":"textract is a Python library that extracts text from a wide variety of document formats, including PDFs, Word documents, images (via OCR), and audio files, through a single unified interface. The current stable version is 1.6.5, released in March 2022. Releases do not follow a strict schedule, but the project is actively maintained with bug fixes and feature additions.","language":"python","status":"active","version":"1.6.5","tags":["text extraction","document processing","OCR","PDF","DOCX","TXT","email","audio","unstructured data"],"last_verified":"Sun May 24","install":[{"cmd":"pip install textract","imports":["import textract\ntext = textract.process('path/to/file.extension')"]}],"homepage":null,"github":"https://github.com/deanmalmgren/textract","docs":null,"changelog":null,"pypi":"https://pypi.org/project/textract/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":100,"avg_install_s":7.8,"avg_import_s":0.02,"wheel_type":"sdist"},"url":"https://checklist.day/v1/registry/textract/compatibility"}}