structured-data-extraction-from-pdf

tool_calling/api · verified · python3.12/linux · json · download .py
Extract structured JSON fields from PDF documents using pydantic validation
uses: pymupdf, pydantic, openai
mast: FM-1.1, FM-1.5, FM-2.2, FM-2.4, FM-2.6, FM-3.2, FM-3.3
tags: pdf, extraction, pydantic, structured-output, pymupdf, openai, invoice, document-parsing
solves: fitz import error (pymupdf naming), no validation on extracted fields, hallucinated fields not caught, multi-page PDF loses page context, None fields not handled
last verified: 2026-04-15
import sys
import json
import subprocess
import requests as _requests
from typing import Optional

# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch ground truth for all registry_refs
# ----------------------------------------

REGISTRY_REFS = ["pymupdf", "pydantic", "openai"]
MAX_RETRIES = 2
registries = {}

# Fetch the registry entry for every referenced library, allowing each one
# up to MAX_RETRIES attempts. A non-200 response or a request error simply
# moves on to the next attempt.
for lib in REGISTRY_REFS:
    registry_url = f"https://checklist.day/api/registry/{lib}"
    for attempt in range(MAX_RETRIES):
        try:
            response = _requests.get(registry_url, timeout=10)
            if response.status_code != 200:
                continue
            registries[lib] = response.json()
        except _requests.exceptions.RequestException:
            continue
        break

# Abort on the first library (in REGISTRY_REFS order) that could not be fetched.
unfetched = [name for name in REGISTRY_REFS if name not in registries]
assert not unfetched, \
    f"ABORT: registry fetch failed for {unfetched[0]} after {MAX_RETRIES} attempts"

# FM-2.4: surface breaking warnings
for lib, registry in registries.items():
    breaking = []
    for warning in registry.get("warnings", []):
        if warning.get("severity") == "breaking":
            breaking.append(warning)
    if not breaking:
        continue
    print(f"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):")
    for w in breaking:
        affected = w.get('affected_versions', 'all')
        # Message and fix are truncated to keep the report to one line each.
        print(f"  [!] [{affected}] {w['message'][:120]}")
        print(f"      fix: {w['fix'][:100]}")

# Summary: one line per library with the first install command it advertises.
print()
print("PRE_EXECUTION: all registry refs verified ✓")
for lib, registry in registries.items():
    install_entries = registry.get("install", [{}])
    install = install_entries[0].get("cmd", "unknown")
    print(f"  {lib:20s} : {install}")

# ----------------------------------------
# KNOWN FAILURE MODES
#
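# 1. Wrong import name — pymupdf package installs as `fitz` module:
#    CORRECT:   import fitz              (after: pip install pymupdf)
#    WRONG:     import pymupdf           (ModuleNotFoundError on older releases)
#    This is the #1 pymupdf footgun — package name != import name.
#    NOTE(review): newer PyMuPDF releases also accept `import pymupdf`;
#    confirm against the installed version.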
#
# 2. No pydantic validation — extracting fields without validation means
#    hallucinated or malformed data flows silently into downstream code.
#    Always validate extracted fields through a Pydantic model.
#
# 3. Optional fields not declared — if a field doesn't exist in the PDF,
#    LLM may hallucinate it. Declare as Optional[str] = None and check after.
#
# 4. Multi-page context lost — extracting page-by-page loses cross-page context.
#    Concatenate all page text before extraction for documents < 10 pages.
#    For larger documents, use chunked extraction with overlap.
#
# 5. Reaching for an LLM when raw text extraction is enough — for structured
#    docs (invoices, forms, tables), PyMuPDF text extraction is often
#    sufficient on its own. Use an LLM only when the layout is unpredictable.
# ----------------------------------------

# ----------------------------------------
# PYDANTIC SCHEMA
# Define the structure you want to extract.
# All uncertain fields must be Optional — never assume presence.
# ----------------------------------------

try:
    from pydantic import BaseModel, field_validator
except ImportError:
    # pydantic is missing: take the pip target from the registry's first
    # install command, install it with the running interpreter, then retry.
    install_cmd = registries["pydantic"]["install"][0]["cmd"]
    pkg = install_cmd.replace("pip install ", "").strip()
    print(f"\nPRE_EXECUTION: pydantic not found — installing {pkg}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    from pydantic import BaseModel, field_validator


class InvoiceData(BaseModel):
    """Invoice fields extracted from a PDF, validated by pydantic.

    Every field is optional and defaults to ``None`` so that a value absent
    from the document shows up as ``None`` instead of being required.
    """

    invoice_number: Optional[str] = None
    invoice_date: Optional[str] = None
    vendor_name: Optional[str] = None
    vendor_email: Optional[str] = None
    total_amount: Optional[float] = None
    currency: Optional[str] = None
    line_items: Optional[list[str]] = None

    @field_validator("total_amount", mode="before")
    @classmethod
    def parse_amount(cls, v):
        """FM-2.6: handle string amounts like '$1,234.56' or '1234.56 USD'.

        Returns the amount as a float. ``None``, strings that are not a
        number once '$', ',' and 'USD' are removed, and values of any other
        type all become ``None``.
        """
        if v is None:
            return None
        if isinstance(v, (int, float)):
            return float(v)
        if isinstance(v, str):
            # Strip currency symbols and commas
            cleaned = v.replace("$", "").replace(",", "").replace("USD", "").strip()
            try:
                return float(cleaned)
            except ValueError:
                return None
        return None

    @field_validator("currency", mode="before")
    @classmethod
    def normalize_currency(cls, v):
        """Upper-case and trim a currency code given as a string.

        ``None`` stays ``None``. Non-string values are returned unchanged so
        that pydantic's own type validation of the field decides their fate;
        calling ``.upper()`` on them here would raise ``AttributeError`` from
        inside the validator instead.
        """
        if not isinstance(v, str):
            return v
        return v.upper().strip()


# ----------------------------------------
# PDF TEXT EXTRACTION
# FM-2.6: use `import fitz` not `import pymupdf`
# ----------------------------------------

def extract_text_from_pdf(pdf_bytes: bytes) -> tuple[str, int]:
    """
    Extract all text from PDF bytes.
    Returns (full_text, page_count).
    FM-2.6: PyMuPDF is imported here under its `fitz` module name.

    The text of each page is joined with a "--- PAGE BREAK ---" separator so
    page boundaries stay visible in the combined string. If `fitz` cannot be
    imported, the package named by the registry's install command is
    installed with pip and the import is retried.
    """
    try:
        import fitz  # FM-2.6: pymupdf installs as fitz
    except ImportError:
        pkg = registries["pymupdf"]["install"][0]["cmd"].replace("pip install ", "").strip()
        print(f"\nEXECUTION: pymupdf not found — installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        import fitz

    # The context manager closes the document even when a page fails to
    # extract; the previous explicit close() was skipped on any exception.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        pages_text = [page.get_text() for page in doc]

    # All pages are concatenated regardless of page count; chunking large
    # documents (see KNOWN FAILURE MODES #4) is left to the caller.
    full_text = "\n\n--- PAGE BREAK ---\n\n".join(pages_text)
    return full_text, len(pages_text)


# ----------------------------------------
# MOCK LLM EXTRACTOR
# For production, replace with:
#
# from openai import OpenAI
# from instructor import from_openai
#
# client = from_openai(OpenAI())
#
# def extract_fields(text: str) -> InvoiceData:
#     return client.chat.completions.create(
#         model="gpt-4o-mini",
#         response_model=InvoiceData,
#         messages=[
#             {"role": "system", "content": "Extract invoice data from the text. Return null for missing fields."},
#             {"role": "user", "content": text}
#         ]
#     )
# ----------------------------------------

def mock_extract_fields(text: str) -> dict:
    """
    Mock extraction: parses known test invoice text.
    Replace with LLM call for production use.

    Returns a dict with the keys invoice_number, invoice_date, vendor_name,
    vendor_email, total_amount, currency and line_items. The first five are
    the matched substring, or None when the pattern is not found
    (total_amount is returned as text, not as a number). currency and
    line_items are hard-coded mock values.
    """
    import re

    flags = re.IGNORECASE
    result = {}

    # Invoice number: the token must contain at least one digit. Without the
    # lookahead, a bare "INVOICE" title followed by "Invoice #: ..." matched
    # the word "Invoice" itself as the number (the label's \s* spans the
    # newline and the character class is case-insensitive).
    m = re.search(r'Invoice\s*#?\s*:?\s*((?=[A-Z0-9\-]*\d)[A-Z0-9\-]+)', text, flags)
    result["invoice_number"] = m.group(1) if m else None

    # Date: first "Date" label followed by YYYY-MM-DD or DD/MM/YYYY-shaped
    # digits. A "Due Date" line appearing earlier in the text would win.
    m = re.search(r'Date\s*:?\s*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})', text, flags)
    result["invoice_date"] = m.group(1) if m else None

    # Vendor: letters, spaces and tabs only, so the name stops at the end of
    # its line. The previous class included \s and therefore ran on into the
    # next line's label (e.g. "Acme Software Inc\nEmail").
    m = re.search(r'\bFrom\s*:?\s*([A-Za-z][A-Za-z \t]*)', text, flags)
    result["vendor_name"] = m.group(1).strip() if m else None

    # Email: first address-shaped token anywhere in the text.
    m = re.search(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', text, flags)
    result["vendor_email"] = m.group(0) if m else None

    # Amount: \b keeps "Total" from matching inside "Subtotal", which
    # previously returned the subtotal whenever it came first.
    m = re.search(r'\bTotal\s*:?\s*\$?([\d,]+\.?\d{0,2})', text, flags)
    result["total_amount"] = m.group(1) if m else None

    # Mock values: not derived from the text.
    result["currency"] = "USD"
    result["line_items"] = ["API Services - $500.00", "Support - $250.00"]

    return result


# ----------------------------------------
# EXECUTION
# FM-1.1: create test PDF in-memory — idempotent, no side effects
# ----------------------------------------

try:
    import fitz
except ImportError:
    # PyMuPDF is missing: take the pip target from the registry's first
    # install command, install it with the running interpreter, then retry.
    install_cmd = registries["pymupdf"]["install"][0]["cmd"]
    pkg = install_cmd.replace("pip install ", "").strip()
    print(f"\nEXECUTION: pymupdf not found — installing {pkg}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    import fitz

# Create test invoice PDF in-memory
print()
print("EXECUTION: creating test invoice PDF...")

# Known fixture: the POST_EXECUTION assertions compare against these values.
TEST_INVOICE_TEXT = """
INVOICE

Invoice #: INV-2026-0042
Date: 2026-04-15
Due Date: 2026-05-15

From: Acme Software Inc
Email: billing@acme-software.com
Address: 123 Tech Street, San Francisco, CA 94105

Bill To:
checklist.day
Bengaluru, India

Line Items:
- API Services (Monthly)          $500.00
- Priority Support                $250.00

Subtotal: $750.00
Tax (0%): $0.00
Total: $750.00

Payment Terms: Net 30
"""

# Create PDF from text; the scratch document is closed even if rendering fails.
doc = fitz.open()
try:
    page = doc.new_page()
    page.insert_text((50, 50), TEST_INVOICE_TEXT, fontsize=11)
    pdf_bytes = doc.tobytes()
finally:
    doc.close()

print("EXECUTION: test PDF created ✓")

# Extract text from PDF
print("EXECUTION: extracting text from PDF...")
extracted_text, pages_processed = extract_text_from_pdf(pdf_bytes)
print(f"EXECUTION: extracted {len(extracted_text)} chars from {pages_processed} page(s) ✓")

# Extract structured fields
print("EXECUTION: extracting structured fields...")
raw_fields = mock_extract_fields(extracted_text)
print(f"  raw fields: {json.dumps(raw_fields, indent=2)}")

# Validate through Pydantic — FM-3.2: validate before using
print("EXECUTION: validating extracted fields through Pydantic...")
try:
    invoice = InvoiceData(**raw_fields)
    validation_passed = True
    print("EXECUTION: validation passed ✓")
except Exception as e:
    # Bind `invoice` on failure too: otherwise the missing-field scan below
    # raises NameError and hides the real validation error.
    invoice = None
    validation_passed = False
    print(f"EXECUTION: validation failed — {e}")

# Identify missing fields (nothing to inspect when validation failed)
if invoice is not None:
    missing_fields = [
        field for field, value in invoice.model_dump().items()
        if value is None
    ]
else:
    missing_fields = []
if missing_fields:
    print(f"  [!] missing fields: {missing_fields}")

# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify extraction produced results
# FM-3.3: exact assertions on known test invoice values
# ----------------------------------------

# Sanity checks on the extraction run itself.
assert pages_processed == 1, f"FAIL: expected 1 page, got {pages_processed}"
assert len(extracted_text) > 0, "FAIL: no text extracted from PDF"
assert validation_passed, "FAIL: pydantic validation failed on extracted fields"

# Exact-value checks against the known test invoice.
assert invoice.invoice_number == "INV-2026-0042", (
    f"FAIL: expected invoice number 'INV-2026-0042', got '{invoice.invoice_number}'"
)
assert invoice.total_amount == 750.0, (
    f"FAIL: expected total $750.00, got {invoice.total_amount}"
)
assert invoice.currency == "USD", (
    f"FAIL: expected currency 'USD', got '{invoice.currency}'"
)
assert invoice.vendor_email is not None, "FAIL: vendor email not extracted"

# Human-readable summary, one line per verified item.
print()
for summary_line in (
    f"POST_EXECUTION: {pages_processed} page(s) processed ✓",
    "POST_EXECUTION: pydantic validation passed ✓",
    f"POST_EXECUTION: invoice number verified ✓  ({invoice.invoice_number})",
    f"POST_EXECUTION: total amount verified ✓  (${invoice.total_amount})",
    f"POST_EXECUTION: currency verified ✓  ({invoice.currency})",
    f"POST_EXECUTION: missing fields = {missing_fields}",
):
    print(summary_line)

# Machine-readable result, printed as a dict repr and followed by the PASS marker.
result = dict(
    status="success",
    pages_processed=pages_processed,
    validation_passed=validation_passed,
    extracted_fields=invoice.model_dump(),
    missing_fields=missing_fields,
    text_chars_extracted=len(extracted_text),
)
print()
print(result)
print()
print("PASS")