structured-data-extraction-from-pdf
Extract structured JSON fields from PDF documents using pydantic validation
import sys
import json
import subprocess
import requests as _requests
from typing import Optional
# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch ground truth for all registry_refs
# ----------------------------------------
REGISTRY_REFS = ["pymupdf", "pydantic", "openai"]
MAX_RETRIES = 2
# Maps library name -> decoded registry JSON for every fetch that succeeded.
registries = {}
for lib in REGISTRY_REFS:
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = _requests.get(
                f"https://checklist.day/api/registry/{lib}",
                timeout=10,
            )
            if response.status_code == 200:
                registries[lib] = response.json()
                break
            failure = f"HTTP {response.status_code}"
        except (_requests.exceptions.RequestException, ValueError) as exc:
            # ValueError covers a 200 response whose body is not valid JSON.
            failure = f"{type(exc).__name__}: {exc}"
        # Only reached when the attempt did not succeed: report why instead of
        # retrying silently, so an eventual abort is diagnosable.
        print(
            f"PRE_EXECUTION: {lib} registry fetch attempt "
            f"{attempt}/{MAX_RETRIES} failed — {failure}"
        )

# Abort before doing any work if a registry is missing. Raised explicitly
# (not via `assert`) so the check still runs under `python -O`.
for lib in REGISTRY_REFS:
    if lib not in registries:
        raise AssertionError(
            f"ABORT: registry fetch failed for {lib} after {MAX_RETRIES} attempts"
        )

# FM-2.4: surface breaking warnings
for lib, registry in registries.items():
    breaking = [
        w for w in (registry.get("warnings") or [])
        if w.get("severity") == "breaking"
    ]
    if breaking:
        print(f"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):")
        for w in breaking:
            # Tolerate warnings that lack a message or fix rather than
            # crashing the whole pre-flight with a KeyError.
            message = str(w.get("message", "(no message provided)"))
            fix = str(w.get("fix", "(no fix provided)"))
            print(f" [!] [{w.get('affected_versions', 'all')}] {message[:120]}")
            print(f" fix: {fix[:100]}")
        print()

print("PRE_EXECUTION: all registry refs verified ✓")
for lib, registry in registries.items():
    # A missing or empty "install" list falls back to "unknown".
    install_entries = registry.get("install") or [{}]
    install = install_entries[0].get("cmd", "unknown")
    print(f" {lib:20s} : {install}")
# ----------------------------------------
# KNOWN FAILURE MODES
#
# 1. Wrong import name — the pymupdf package has historically installed as the `fitz` module:
# CORRECT: import fitz (after: pip install pymupdf)
# WRONG: import pymupdf (ModuleNotFoundError on older PyMuPDF releases; newer releases accept both names)
# This is the #1 pymupdf footgun — package name != import name.
#
# 2. No pydantic validation — extracting fields without validation means
# hallucinated or malformed data flows silently into downstream code.
# Always validate extracted fields through a Pydantic model.
#
# 3. Optional fields not declared — if a field doesn't exist in the PDF,
# LLM may hallucinate it. Declare as Optional[str] = None and check after.
#
# 4. Multi-page context lost — extracting page-by-page loses cross-page context.
# Concatenate all page text before extraction for documents < 10 pages.
# For larger documents, use chunked extraction with overlap.
#
# 5. Using an LLM where raw text extraction is enough — for structured docs
# (invoices, forms, tables), PyMuPDF text extraction is often sufficient
# without an LLM. Use an LLM only when the layout is unpredictable.
# ----------------------------------------
# ----------------------------------------
# PYDANTIC SCHEMA
# Define the structure you want to extract.
# All uncertain fields must be Optional — never assume presence.
# ----------------------------------------
try:
    from pydantic import BaseModel, field_validator
except ImportError:
    # The registry stores a full "pip install <requirement>" command; keep
    # only the requirement part and hand it to pip for this interpreter.
    install_cmd = registries["pydantic"]["install"][0]["cmd"]
    pkg = install_cmd.replace("pip install ", "").strip()
    print(f"\nPRE_EXECUTION: pydantic not found — installing {pkg}...")
    pip_argv = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_argv)
    # Retry the import now that the package has been installed.
    from pydantic import BaseModel, field_validator
class InvoiceData(BaseModel):
    """Schema for the invoice fields extracted from a PDF.

    Every field is optional and defaults to ``None`` so that a value absent
    from the document shows up as missing instead of being required.
    """

    invoice_number: Optional[str] = None
    invoice_date: Optional[str] = None
    vendor_name: Optional[str] = None
    vendor_email: Optional[str] = None
    total_amount: Optional[float] = None
    currency: Optional[str] = None
    line_items: Optional[list[str]] = None

    @field_validator("total_amount", mode="before")
    @classmethod
    def parse_amount(cls, v):
        """FM-2.6: handle string amounts like '$1,234.56' or '1234.56 USD'.

        Runs before pydantic's own float validation.

        Returns:
            ``None`` for ``None``, for unparseable strings and for unsupported
            types; otherwise the amount as a ``float``.
        """
        if v is None:
            return None
        if isinstance(v, (int, float)):
            return float(v)
        if isinstance(v, str):
            import re

            # Strip the dollar sign and thousands separators, then the "USD"
            # code in any letter case ("usd", "Usd") so a lower-case code does
            # not silently turn a valid amount into None.
            without_symbols = v.replace("$", "").replace(",", "")
            cleaned = re.sub(r"usd", "", without_symbols, flags=re.IGNORECASE).strip()
            try:
                return float(cleaned)
            except ValueError:
                return None
        return None

    @field_validator("currency", mode="before")
    @classmethod
    def normalize_currency(cls, v):
        """Upper-case and trim a currency string before validation.

        Non-string values are returned unchanged so that pydantic's own type
        check on the field rejects them, instead of this validator failing
        with an AttributeError on ``.upper()``.
        """
        if v is None:
            return None
        if not isinstance(v, str):
            return v
        return v.upper().strip()
# ----------------------------------------
# PDF TEXT EXTRACTION
# FM-2.6: use `import fitz` not `import pymupdf`
# ----------------------------------------
def extract_text_from_pdf(pdf_bytes: bytes) -> tuple[str, int]:
    """
    Extract all text from PDF bytes.

    Args:
        pdf_bytes: The raw content of a PDF file.

    Returns:
        (full_text, page_count), where full_text is the text of every page
        joined with a "--- PAGE BREAK ---" separator.

    FM-2.6: the module is imported as `fitz`, not `pymupdf`. If the import
    fails, the package named by the pymupdf registry entry is installed with
    pip and the import is retried.
    """
    try:
        import fitz  # FM-2.6: pymupdf installs as fitz
    except ImportError:
        pkg = registries["pymupdf"]["install"][0]["cmd"].replace("pip install ", "").strip()
        print(f"\nEXECUTION: pymupdf not found — installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        import fitz

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        pages_text = [page.get_text() for page in doc]
    finally:
        # Release the document even when text extraction raises.
        doc.close()

    # All pages are concatenated regardless of page count; chunking large
    # documents (see KNOWN FAILURE MODES #4) is left to the caller.
    full_text = "\n\n--- PAGE BREAK ---\n\n".join(pages_text)
    return full_text, len(pages_text)
# ----------------------------------------
# MOCK LLM EXTRACTOR
# For production, replace with:
#
# from openai import OpenAI
# from instructor import from_openai
#
# client = from_openai(OpenAI())
#
# def extract_fields(text: str) -> InvoiceData:
# return client.chat.completions.create(
# model="gpt-4o-mini",
# response_model=InvoiceData,
# messages=[
# {"role": "system", "content": "Extract invoice data from the text. Return null for missing fields."},
# {"role": "user", "content": text}
# ]
# )
# ----------------------------------------
def mock_extract_fields(text: str) -> dict:
    """
    Mock extraction: parses known test invoice text with regular expressions.
    Replace with LLM call for production use.

    Args:
        text: Plain text extracted from an invoice PDF.

    Returns:
        A dict with the keys ``invoice_number``, ``invoice_date``,
        ``vendor_name``, ``vendor_email``, ``total_amount``, ``currency`` and
        ``line_items``. The first five are strings taken from ``text`` or
        ``None`` when no match is found. ``currency`` and ``line_items`` are
        fixed placeholder values and are NOT derived from ``text``.
    """
    import re

    def first_match(pattern: str, group: int = 1):
        """Return the requested group of the first case-insensitive match, or None."""
        found = re.search(pattern, text, re.IGNORECASE)
        return found.group(group) if found else None

    result = {}

    # Invoice number: the token must contain a digit. Otherwise a bare
    # "INVOICE" heading followed by "Invoice #: ..." captures the word
    # "Invoice" itself, because IGNORECASE lets [A-Z] match lower-case letters.
    result["invoice_number"] = first_match(
        r"Invoice\s*#?\s*:?\s*([A-Z0-9\-]*\d[A-Z0-9\-]*)"
    )

    # Date: skip "Due Date" so the issue date wins regardless of line order.
    result["invoice_date"] = first_match(
        r"(?<!Due\s)Date\s*:?\s*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})"
    )

    # Vendor: letters and horizontal whitespace only, so the capture stops at
    # the end of the line instead of swallowing the label on the next line.
    vendor = first_match(r"From\s*:?\s*([A-Za-z][A-Za-z \t]*)")
    result["vendor_name"] = vendor.strip() if vendor is not None else None

    # Email: allow multi-label domains such as "mail.example.co.uk".
    result["vendor_email"] = first_match(
        r"[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[a-z]{2,}", group=0
    )

    # Amount: \b keeps "Subtotal" from being mistaken for "Total".
    result["total_amount"] = first_match(
        r"\bTotal\s*:?\s*\$?([\d,]+\.?\d{0,2})"
    )

    # Placeholders for the mock; a real extractor must derive these.
    result["currency"] = "USD"
    result["line_items"] = ["API Services - $500.00", "Support - $250.00"]
    return result
# ----------------------------------------
# EXECUTION
# FM-1.1: create test PDF in-memory — idempotent, no side effects
# ----------------------------------------
try:
    import fitz
except ImportError:
    # The registry stores a full "pip install <requirement>" command; keep
    # only the requirement part and hand it to pip for this interpreter.
    install_cmd = registries["pymupdf"]["install"][0]["cmd"]
    pkg = install_cmd.replace("pip install ", "").strip()
    print(f"\nEXECUTION: pymupdf not found — installing {pkg}...")
    pip_argv = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_argv)
    # Retry the import now that the package has been installed.
    import fitz
# Build the one-page test invoice PDF entirely in memory (nothing is written to disk)
print()
print("EXECUTION: creating test invoice PDF...")
# Fixture text; the POST_EXECUTION checks compare against the values in it.
TEST_INVOICE_TEXT = """
INVOICE
Invoice #: INV-2026-0042
Date: 2026-04-15
Due Date: 2026-05-15
From: Acme Software Inc
Email: billing@acme-software.com
Address: 123 Tech Street, San Francisco, CA 94105
Bill To:
checklist.day
Bengaluru, India
Line Items:
- API Services (Monthly) $500.00
- Priority Support $250.00
Subtotal: $750.00
Tax (0%): $0.00
Total: $750.00
Payment Terms: Net 30
"""
# Render the fixture onto a new page of an empty document, serialize the
# document to bytes, then close it.
doc = fitz.open()
page = doc.new_page()
page.insert_text(
    (50, 50),
    TEST_INVOICE_TEXT,
    fontsize=11,
)
pdf_bytes = doc.tobytes()
doc.close()
print("EXECUTION: test PDF created ✓")
# Extract text from PDF
print("EXECUTION: extracting text from PDF...")
extracted_text, pages_processed = extract_text_from_pdf(pdf_bytes)
print(f"EXECUTION: extracted {len(extracted_text)} chars from {pages_processed} page(s) ✓")
# Extract structured fields
print("EXECUTION: extracting structured fields...")
raw_fields = mock_extract_fields(extracted_text)
print(f" raw fields: {json.dumps(raw_fields, indent=2)}")
# Validate through Pydantic — FM-3.2: validate before using
print("EXECUTION: validating extracted fields through Pydantic...")
try:
    invoice = InvoiceData(**raw_fields)
except Exception as e:
    # Broad on purpose: any failure is reported here and turned into a hard
    # failure by the POST_EXECUTION validation check. `invoice` is bound to
    # None so the code below does not die with a NameError before that check.
    invoice = None
    validation_passed = False
    print(f"EXECUTION: validation failed — {e}")
else:
    validation_passed = True
    print("EXECUTION: validation passed ✓")
# Identify missing fields (only possible when a model was actually built)
if invoice is not None:
    missing_fields = [
        field for field, value in invoice.model_dump().items()
        if value is None
    ]
else:
    missing_fields = []
if missing_fields:
    print(f" [!] missing fields: {missing_fields}")
# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify extraction produced results
# FM-3.3: exact assertions on known test invoice values
# ----------------------------------------
def _require(condition, message):
    """Raise AssertionError(message) when condition is falsy.

    Used instead of bare `assert` statements so the checks still run when the
    script is executed with `python -O`, which strips asserts and would
    otherwise let the script print PASS without verifying anything.
    """
    if not condition:
        raise AssertionError(message)


# The checks run in order, so the validation check fires before any
# attribute of `invoice` is read.
_require(
    pages_processed == 1,
    f"FAIL: expected 1 page, got {pages_processed}",
)
_require(
    len(extracted_text) > 0,
    "FAIL: no text extracted from PDF",
)
_require(
    validation_passed,
    "FAIL: pydantic validation failed on extracted fields",
)
_require(
    invoice.invoice_number == "INV-2026-0042",
    f"FAIL: expected invoice number 'INV-2026-0042', got '{invoice.invoice_number}'",
)
_require(
    invoice.total_amount == 750.0,
    f"FAIL: expected total $750.00, got {invoice.total_amount}",
)
_require(
    invoice.currency == "USD",
    f"FAIL: expected currency 'USD', got '{invoice.currency}'",
)
_require(
    invoice.vendor_email is not None,
    "FAIL: vendor email not extracted",
)
print()
print(f"POST_EXECUTION: {pages_processed} page(s) processed ✓")
print("POST_EXECUTION: pydantic validation passed ✓")
print(f"POST_EXECUTION: invoice number verified ✓ ({invoice.invoice_number})")
print(f"POST_EXECUTION: total amount verified ✓ (${invoice.total_amount})")
print(f"POST_EXECUTION: currency verified ✓ ({invoice.currency})")
print(f"POST_EXECUTION: missing fields = {missing_fields}")
# Summary of the run, printed as a plain dict.
result = {
    "status": "success",
    "pages_processed": pages_processed,
    "validation_passed": validation_passed,
    "extracted_fields": invoice.model_dump(),
    "missing_fields": missing_fields,
    "text_chars_extracted": len(extracted_text),
}
print()
print(result)
print()
print("PASS")