{"id":28013,"library":"opendataloader-pdf","title":"OpenDataLoader PDF","description":"A Python wrapper for the opendataloader-pdf Java CLI that extracts structured content and metadata from PDFs, supporting accessibility tags, tables, headings, and strikethrough text. The current version is 2.4.3; it requires Python >=3.10, and new releases ship every few months.","status":"active","version":"2.4.3","language":"python","source_language":"en","source_url":"https://github.com/opendataloader-project/opendataloader-pdf","tags":["pdf","extraction","accessibility","pdf-ua","java-wrapper"],"install":[{"cmd":"pip install opendataloader-pdf","lang":"bash","label":"Standard pip install"}],"dependencies":[{"reason":"Requires a Java runtime to invoke the embedded CLI (Java 17+ as of v2.4.0; Java 11+ for earlier releases).","package":"java","optional":false}],"imports":[{"note":"Module is opendataloader_pdf, not opendataloader.","wrong":"from opendataloader import PDF","symbol":"OpenDataLoaderPDF","correct":"from opendataloader_pdf import OpenDataLoaderPDF"}],"quickstart":{"code":"import os\n\nfrom opendataloader_pdf import OpenDataLoaderPDF\n\nloader = OpenDataLoaderPDF(api_key=os.environ.get('OPENDATALOADER_API_KEY', ''))\nwith open('document.pdf', 'rb') as f:\n    result = loader.extract(f)\nprint(result['content'][:200])","lang":"python","description":"Initialize the extractor with an API key and extract content from a PDF file."},"warnings":[{"fix":"Set `hybrid_fallback=True` explicitly if you want the old fallback behavior.","message":"The `--hybrid-fallback` default changed to `false` in v2.0.1, causing hybrid extraction to fail fast instead of falling back to rule-based extraction.","severity":"breaking","affected_versions":">=2.0.1"},{"fix":"Install Java 17+ (Java 11+ suffices for releases before v2.4.0) and ensure `java` is on PATH.","message":"The library requires a Java runtime: Java 17+ as of v2.4.0, Java 11+ for earlier releases. If Java is missing or too old, extraction fails with a subprocess error.","severity":"gotcha","affected_versions":"all"},{"fix":"Use `output_format='text'` instead of `extract_text=True`.","message":"The old `--extract-text` flag is deprecated as of v2.3.0 in favor of `--output-format text`.","severity":"deprecated","affected_versions":">=2.3.0"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Update Java to JDK 17 or later.","cause":"Java version too old (< 17). The CLI requires Java 17+ as of v2.4.0.","error":"java.lang.UnsupportedClassVersionError: org/opendataloader/pdf/CLI has been compiled by a more recent version of the Java Runtime (class file version 61.0), this version of the Java Runtime only recognizes class file versions up to 55.0"},{"fix":"Set the `OPENDATALOADER_API_KEY` environment variable or pass `api_key` to the constructor.","cause":"Missing API key or environment variable.","error":"opendataloader_pdf.exceptions.LicenseError: No valid license found. Please set OPENDATALOADER_API_KEY or pass api_key."}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}