document-summarization

tool_calling/api · verified · python3.12/linux · json · download .py

Summarize long documents with an LLM using map-reduce chunking.

import sys
import re
import subprocess
import requests as _requests

# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch ground truth for all registry_refs
# ----------------------------------------

REGISTRY_REFS = ["langchain", "openai", "tiktoken"]  # libraries whose registry metadata is required
MAX_RETRIES = 2                                      # fetch attempts per library
registries = {}                                      # lib name -> parsed registry JSON

# Fetch each library's registry entry, retrying on transport errors.
for lib in REGISTRY_REFS:
    for attempt in range(MAX_RETRIES):
        try:
            response = _requests.get(
                f"https://checklist.day/api/registry/{lib}",
                timeout=10
            )
            if response.status_code == 200:
                registries[lib] = response.json()
                break
        except _requests.exceptions.RequestException as exc:
            # Surface the failure instead of swallowing it silently;
            # the loop then retries (up to MAX_RETRIES attempts total).
            print(f"PRE_EXECUTION: attempt {attempt + 1}/{MAX_RETRIES} for "
                  f"{lib} failed: {exc}", file=sys.stderr)

# Abort if any registry is missing. An explicit raise is used instead of
# `assert` because asserts are stripped under `python -O`, which would let
# a failed fetch pass silently.
for lib in REGISTRY_REFS:
    if lib not in registries:
        raise RuntimeError(
            f"ABORT: registry fetch failed for {lib} after {MAX_RETRIES} attempts"
        )

# FM-2.4: surface breaking warnings for each fetched registry entry.
for lib, registry in registries.items():
    breaking = [
        w for w in registry.get("warnings", [])
        if w.get("severity") == "breaking"
    ]
    if breaking:
        print(f"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):")
        for w in breaking:
            print(f"  [!] [{w.get('affected_versions', 'all')}] {w['message'][:120]}")
            print(f"      fix: {w['fix'][:100]}")

# Confirm verification and echo each library's install command.
print()
print("PRE_EXECUTION: all registry refs verified ✓")
for lib, registry in registries.items():
    install = registry.get("install", [{}])[0].get("cmd", "unknown")
    print(f"  {lib:20s} : {install}")
# ----------------------------------------
# KNOWN FAILURE MODES
#
# 1. Sending full document to LLM — most documents exceed context windows.
#    Always chunk first, summarize each chunk, then reduce.
#    Never call llm(entire_document) on documents > 2K tokens.
#
# 2. No chunk overlap — splitting without overlap loses context at boundaries.
#    A sentence split across chunks loses meaning in both halves.
#    Always use overlap of 10-15% of chunk size.
#
# 3. Missing reduce step — map-only summarization produces N summaries.
#    Must reduce to a single coherent summary. Without reduce, agent gets
#    a list, not an answer.
#
# 4. Wrong token counting — splitting by characters/words != splitting by tokens.
#    tiktoken gives accurate token counts per model. Always use it.
#    Rule of thumb: 1 token ≈ 4 chars, but varies significantly.
#
# 5. Summary of summary loses fidelity — over-reducing produces vague output.
#    Keep chunk summaries focused. Reduce prompt must instruct coherence.
# ----------------------------------------

CHUNK_SIZE = 800      # max estimated tokens per chunk (map stage input size)
CHUNK_OVERLAP = 100   # token overlap between chunks — 12.5% of CHUNK_SIZE, within the 10-15% rule from FM #2
MODEL_NAME = "gpt-4o-mini"  # model for the real OpenAI calls (mock path ignores it)

# Sample long document for testing
SAMPLE_DOCUMENT = """
Artificial intelligence has undergone remarkable transformation over the past decade.
The introduction of transformer architectures in 2017 fundamentally changed how machines
process sequential data, enabling unprecedented performance on language tasks.

Large language models like GPT, Claude, and Gemini have demonstrated emergent capabilities
that were not explicitly programmed, including reasoning, code generation, and creative writing.
These models are trained on vast corpora of text data using self-supervised learning objectives.

The scaling hypothesis suggests that increasing model size, data, and compute leads to
predictable improvements in capability. This has driven a race among AI laboratories to
train ever-larger models. However, recent research indicates that architectural improvements
and better training data quality can achieve similar results with smaller models.

Retrieval-augmented generation (RAG) has emerged as a practical technique to ground LLM
outputs in factual information. By combining dense retrieval with generation, RAG systems
reduce hallucination and enable models to access up-to-date information beyond their
training cutoff dates.

Agent frameworks built on top of LLMs allow models to use tools, execute code, browse the web,
and interact with external APIs. These agentic systems can decompose complex tasks into
subtasks, maintain state across multiple steps, and recover from errors through retry logic.

The alignment problem remains one of the central challenges in AI safety research. Ensuring
that powerful AI systems behave in accordance with human values and intentions requires
advances in interpretability, reward modeling, and constitutional AI techniques.

Multimodal models that process text, images, audio, and video are becoming increasingly
capable. Systems like GPT-4V and Gemini Ultra can answer questions about images, generate
code from screenshots, and understand complex visual reasoning tasks.

The economic impact of AI is becoming tangible across industries. Software development,
customer service, healthcare diagnostics, and legal document review are among the sectors
seeing the most significant productivity gains from AI adoption.
""".strip()


# ----------------------------------------
# MOCK LLM SUMMARIZER
# Replace with real OpenAI call:
#
# from openai import OpenAI
# client = OpenAI()  # reads OPENAI_API_KEY from env
#
# def summarize_chunk(chunk: str) -> str:
#     response = client.chat.completions.create(
#         model=MODEL_NAME,
#         messages=[
#             {"role": "system", "content": "Summarize the following text concisely in 2-3 sentences."},
#             {"role": "user", "content": chunk}
#         ],
#         max_tokens=200,
#         temperature=0
#     )
#     return response.choices[0].message.content.strip()
#
# def reduce_summaries(summaries: list[str]) -> str:
#     combined = "\n\n".join(f"Section {i+1}: {s}" for i, s in enumerate(summaries))
#     response = client.chat.completions.create(
#         model=MODEL_NAME,
#         messages=[
#             {"role": "system", "content": "Combine these section summaries into one coherent summary."},
#             {"role": "user", "content": combined}
#         ],
#         max_tokens=400,
#         temperature=0
#     )
#     return response.choices[0].message.content.strip()
# ----------------------------------------

def mock_summarize_chunk(chunk: str) -> str:
    """Mock stand-in for the LLM map step: the chunk's first sentence is its summary."""
    for piece in chunk.split('.'):
        candidate = piece.strip()
        if candidate:
            return candidate + "."
    # No sentence content at all — fall back to a raw prefix of the chunk.
    return chunk[:100]

def mock_reduce_summaries(summaries: list[str]) -> str:
    """Mock stand-in for the LLM reduce step: concatenate section summaries into one paragraph."""
    combined = " ".join(summaries)
    return combined


def estimate_tokens(text: str) -> int:
    """
    FM-2.6: cheap token-count estimate without a tiktoken dependency.

    Applies the English rule of thumb that one token is roughly four
    characters. For production accuracy use tiktoken instead:
    len(enc.encode(text)).
    """
    char_count = len(text)
    return char_count // 4


def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
    """
    Split text into overlapping token-based chunks.

    Sentences (split on ./!/? followed by whitespace) are packed greedily
    into chunks of at most ``chunk_size`` estimated tokens; when a chunk is
    flushed, the trailing sentences worth up to ``overlap`` tokens are
    carried into the next chunk so boundary context is preserved (FM #2).

    FM-1.1: deterministic — same input always produces same chunks.
    FM-2.6: use token estimates, not character counts.

    Args:
        text: document to split.
        chunk_size: maximum estimated tokens per chunk.
        overlap: estimated tokens of trailing context to carry forward.

    Returns:
        List of chunk strings. Empty/whitespace-only input yields an empty
        list. A single sentence longer than ``chunk_size`` still becomes its
        own (oversized) chunk rather than being dropped.
    """
    # Robustness fix: previously an empty/whitespace-only document produced
    # one empty-string chunk ([""]); return no chunks instead.
    if not text.strip():
        return []

    # Split into sentences for cleaner boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        sentence_tokens = estimate_tokens(sentence)

        if current_tokens + sentence_tokens > chunk_size and current_chunk:
            # Flush the full chunk before starting the next one.
            chunks.append(" ".join(current_chunk))

            # Keep overlap — retain the trailing sentences whose combined
            # token estimate fits within `overlap`.
            overlap_sentences = []
            overlap_tokens = 0
            for s in reversed(current_chunk):
                t = estimate_tokens(s)
                if overlap_tokens + t <= overlap:
                    overlap_sentences.insert(0, s)
                    overlap_tokens += t
                else:
                    break

            current_chunk = overlap_sentences
            current_tokens = overlap_tokens

        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


# ----------------------------------------
# EXECUTION
# FM-1.5: process finite number of chunks — no unbounded loops
# FM-1.1: chunking is deterministic and idempotent
# ----------------------------------------

# Report input size in characters and estimated tokens before processing.
print()
print(f"EXECUTION: document length = {len(SAMPLE_DOCUMENT)} chars "
      f"(~{estimate_tokens(SAMPLE_DOCUMENT)} tokens)")

# Step 1: chunk document into overlapping token-bounded pieces
chunks = chunk_text(SAMPLE_DOCUMENT, CHUNK_SIZE, CHUNK_OVERLAP)
print(f"EXECUTION: split into {len(chunks)} chunks ✓")
for i, chunk in enumerate(chunks):
    print(f"  chunk[{i}]: ~{estimate_tokens(chunk)} tokens")

# Step 2: MAP — summarize each chunk independently
# (mock summarizer here; the real OpenAI call is documented earlier in the file)
print(f"\nEXECUTION: MAP — summarizing {len(chunks)} chunks...")
chunk_summaries = []
for i, chunk in enumerate(chunks):
    summary = mock_summarize_chunk(chunk)
    chunk_summaries.append(summary)
    print(f"  chunk[{i}] summary: {summary[:80]}...")

# Step 3: REDUCE — combine chunk summaries into final summary
print(f"\nEXECUTION: REDUCE — combining {len(chunk_summaries)} summaries...")
final_summary = mock_reduce_summaries(chunk_summaries)
print(f"  final summary: {final_summary[:120]}...")

# Whole-document token estimate, checked by POST_EXECUTION below.
total_tokens_est = estimate_tokens(SAMPLE_DOCUMENT)

# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify all chunks were summarized
# FM-3.3: exact assertions on structure
# ----------------------------------------

# Structural checks: at least one chunk, one summary per chunk, nothing empty.
assert len(chunks) >= 1, \
    "FAIL: document produced no chunks"

assert len(chunk_summaries) == len(chunks), \
    f"FAIL: expected {len(chunks)} summaries, got {len(chunk_summaries)}"

assert all(len(s) > 0 for s in chunk_summaries), \
    "FAIL: some chunk summaries are empty"

assert len(final_summary) > 0, \
    "FAIL: final summary is empty"

assert len(final_summary) < len(SAMPLE_DOCUMENT), \
    "FAIL: final summary is longer than original document — summarization failed"

assert total_tokens_est > 0, \
    "FAIL: token estimation failed"

# Verify chunking produced distinct chunks. An exact boundary-overlap check
# would be brittle against whitespace re-joining, so distinctness is the
# approximate proxy. (Dead code that computed-but-never-used the last
# sentence of chunk[0] was removed.)
if len(chunks) > 1:
    assert chunks[0] != chunks[1], \
        "FAIL: first two chunks are identical — chunking broken"

print()
print(f"POST_EXECUTION: {len(chunks)} chunks created ✓")
print(f"POST_EXECUTION: {len(chunk_summaries)} chunk summaries generated ✓")
print(f"POST_EXECUTION: final summary produced ✓  ({len(final_summary)} chars)")
print(f"POST_EXECUTION: compression ratio = {len(final_summary)/len(SAMPLE_DOCUMENT):.2%} ✓")
print(f"POST_EXECUTION: summary shorter than original ✓")

# Machine-readable run report echoed as the script's final output.
result = {
    "status": "success",
    "model": MODEL_NAME,
    "document_chars": len(SAMPLE_DOCUMENT),
    "total_tokens_est": total_tokens_est,
    "chunks_created": len(chunks),
    "chunk_size_tokens": CHUNK_SIZE,
    "chunk_overlap_tokens": CHUNK_OVERLAP,
    "chunk_summaries": chunk_summaries,
    "final_summary": final_summary,
    "compression_ratio": round(len(final_summary) / len(SAMPLE_DOCUMENT), 4),
}
print()
print(result)
print()
print("PASS")