{"library":"pymupdf4llm","type":"library","category":null,"description":"PyMuPDF4LLM (also aliased as `pdf4llm`) is a Python library built on PyMuPDF that converts PDF documents into clean, structured formats such as Markdown, JSON, and plain text, optimized for Large Language Model (LLM) and Retrieval-Augmented Generation (RAG) workflows. It includes layout analysis and automatic OCR for scanned pages, and supports multi-column layouts and image extraction. The library is actively maintained and frequently updated, with the current stable version being 1.27.2.2.","language":"python","status":"active","version":"1.27.2.2","tags":["pdf","llm","rag","markdown","json","text-extraction","ocr","document-processing"],"install":[{"cmd":"pip install -U pymupdf4llm","imports":["import pymupdf4llm\nmd_text = pymupdf4llm.to_markdown(\"input.pdf\")","import pymupdf4llm\njson_text = pymupdf4llm.to_json(\"input.pdf\")","import pymupdf4llm\nplain_text = pymupdf4llm.to_text(\"input.pdf\")"]},{"cmd":"pip install -U 'pymupdf4llm[ocr,layout]'","imports":[]}],"homepage":null,"github":"https://github.com/pymupdf/pymupdf4llm","docs":"https://pymupdf.readthedocs.io/","changelog":"https://pymupdf.readthedocs.io/en/latest/changes.html","pypi":"https://pypi.org/project/pymupdf4llm/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":75,"avg_install_s":7.5,"avg_import_s":2.2,"wheel_type":"wheel"},"url":"https://checklist.day/v1/registry/pymupdf4llm/compatibility"},"provenance":{"verified_status":"install_fail","verified_at":"Sun Jun 28","last_verified":"Sun Jun 28","next_check":"Sun Jul 05","install_tag":"draft"}}