local-nlp-preprocessing
Preprocess text locally using spaCy — tokenization, NER, POS tagging, lemmatization
import subprocess
import sys
from collections import Counter

import requests as _requests
# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch ground truth for registry_refs
# ----------------------------------------
REGISTRY_REFS = ["spacy"]
MAX_RETRIES = 2
registries = {}
for lib in REGISTRY_REFS:
    for attempt in range(MAX_RETRIES):
        try:
            response = _requests.get(
                f"https://checklist.day/api/registry/{lib}",
                timeout=10
            )
            # Only a 200 with a parseable body counts as success; any other
            # status falls through to the next retry attempt.
            if response.status_code == 200:
                registries[lib] = response.json()
                break
        except _requests.exceptions.RequestException as exc:
            # Log instead of silently swallowing, so the eventual ABORT
            # below carries a visible cause in the output.
            print(f"PRE_EXECUTION: attempt {attempt + 1}/{MAX_RETRIES} for {lib} failed: {exc}")
for lib in REGISTRY_REFS:
    assert lib in registries, \
        f"ABORT: registry fetch failed for {lib} after {MAX_RETRIES} attempts"
# FM-2.4: surface breaking warnings
for lib, registry in registries.items():
    breaking = [
        w for w in registry.get("warnings", [])
        if w.get("severity") == "breaking"
    ]
    if breaking:
        print(f"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):")
        for w in breaking:
            # Warning entries are external data — use .get() for every field
            # (the original indexed w['message']/w['fix'] directly, which
            # raises KeyError on a partial record).
            print(f" [!] [{w.get('affected_versions', 'all')}] {w.get('message', '')[:120]}")
            print(f" fix: {w.get('fix', 'n/a')[:100]}")
        print()
print("PRE_EXECUTION: registry refs verified ✓")
for lib, registry in registries.items():
    # Guard against "install" being present but an empty list (the default
    # in .get() only covers the missing-key case).
    install_entries = registry.get("install") or [{}]
    install = install_entries[0].get("cmd", "unknown")
    print(f" {lib:20s} : {install}")
# ----------------------------------------
# KNOWN FAILURE MODES
#
# 1. Model not downloaded — spaCy separates library install from model download.
# `pip install spacy` does NOT download any model.
# Must run: python -m spacy download en_core_web_sm
# Or install directly: pip install en-core-web-sm (from spaCy releases)
# OSError: [E050] Can't find model 'en_core_web_sm' = model not downloaded.
#
# 2. Wrong model name — spaCy v3 uses underscores in model names:
# CORRECT: en_core_web_sm
# WRONG: en_core_web_sm-3.x.x (don't include version suffix)
# WRONG: en-core-web-sm (hyphens only for pip install, not spacy.load)
#
# 3. nlp(text) on large corpus — calling nlp() in a loop is slow.
# FM-1.5: use nlp.pipe() for batched processing — 2-5x faster.
# nlp.pipe() is a generator — consume it, don't store raw.
#
# 4. Mixing v2 and v3 APIs — spaCy v3 removed several v2 patterns:
# CHANGED: doc.ents — still works, but now returns a tuple instead of a list
# REMOVED: nlp.add_pipe('ner') without specifying last/first/before/after
# REMOVED: Token.is_oov always False for vectors models
#
# 5. Unnecessary pipeline components — loading full pipeline when you only need
# tokenization is wasteful. Disable unused components:
# nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# ----------------------------------------
# Run configuration.
MODEL_NAME = "en_core_web_sm"  # v3 naming: underscores, no version suffix (see FM #2)
BATCH_SIZE = 32  # batch size handed to nlp.pipe() (FM #3)
# Small fixed test corpus; the POST_EXECUTION asserts below check for
# specific entities from these sentences (e.g. "Steve Jobs", "Cupertino").
TEXTS = [
    "Apple Inc. was founded by Steve Jobs in Cupertino, California in 1976.",
    "The transformer architecture introduced in 'Attention Is All You Need' revolutionized NLP.",
    "Anthropic released Claude 3.5 Sonnet in June 2024, achieving strong benchmark results.",
]
# ----------------------------------------
# EXECUTION
# FM-2.6: use correct spaCy v3 API patterns
# FM-1.1: nlp.pipe() is stateless — safe to run multiple times
# philosophy: auto-install and auto-download model — no asking permission
# ----------------------------------------
# Step 1: install spaCy if missing
try:
    import spacy
except ImportError:
    # Derive the package spec from the registry's install command. The last
    # whitespace token is robust to "pip install X", "pip install -U X",
    # "python -m pip install X", etc. — the original
    # .replace("pip install ", "") mangled any non-canonical command.
    install_entries = registries["spacy"].get("install") or [{}]
    tokens = install_entries[0].get("cmd", "").split()
    pkg = tokens[-1] if tokens else "spacy"
    print(f"\nEXECUTION: spacy not found — installing {pkg}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    print(f"EXECUTION: {pkg} installed ✓")
    import spacy
print(f"\nEXECUTION: spaCy version {spacy.__version__} ✓")
# Step 2: make sure the model package is present before loading.
# FM-2.6: spacy.util.is_package() is the sanctioned check — avoids the
# try/except-around-nlp.load() anti-pattern.
if spacy.util.is_package(MODEL_NAME):
    print(f"EXECUTION: model '{MODEL_NAME}' already installed ✓")
else:
    print(f"EXECUTION: model '{MODEL_NAME}' not found — downloading...")
    subprocess.check_call([sys.executable, "-m", "spacy", "download", MODEL_NAME])
    print(f"EXECUTION: model '{MODEL_NAME}' downloaded ✓")
# Step 3: load by model-name string (FM-2.6), never by package import.
nlp = spacy.load(MODEL_NAME)
print(f"EXECUTION: model loaded ✓ (pipeline: {nlp.pipe_names})")
# Step 4: batch the whole corpus through nlp.pipe() (FM-1.5) rather than
# calling nlp(text) per document; materialize the generator immediately.
print(f"\nEXECUTION: processing {len(TEXTS)} documents via nlp.pipe()...")
docs = list(nlp.pipe(TEXTS, batch_size=BATCH_SIZE))
print(f"EXECUTION: {len(docs)} documents processed ✓")
# Step 5: extract features per document.
tokens_per_doc = []
entities_per_doc = []
lemmas_per_doc = []
# Counter replaces the manual dict.get(k, 0) + 1 frequency pattern;
# it is still a dict subclass, so membership tests / .keys() downstream work.
pos_counts = Counter()
for i, doc in enumerate(docs):
    # Tokens — exclude punctuation and whitespace for cleaner output
    tokens = [t.text for t in doc if not t.is_punct and not t.is_space]
    tokens_per_doc.append(len(tokens))
    # Named entities with character offsets
    entities = [
        {"text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char}
        for ent in doc.ents
    ]
    entities_per_doc.append(entities)
    # Lemmas — exclude stopwords and punctuation
    lemmas = [t.lemma_.lower() for t in doc if not t.is_stop and not t.is_punct and not t.is_space]
    lemmas_per_doc.append(lemmas)
    # POS tag frequency (whitespace tokens excluded)
    pos_counts.update(t.pos_ for t in doc if not t.is_space)
    print(f" doc[{i}]: {len(tokens)} tokens, {len(entities)} entities, {len(lemmas)} lemmas")
    if entities:
        for ent in entities:
            print(f" [{ent['label']}] {ent['text']}")
# Five most frequent POS tags; most_common() is equivalent to a stable
# reverse sort on counts, so tie ordering matches the original code.
top_pos_tags = dict(pos_counts.most_common(5))
# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify counts before asserting
# FM-3.3: exact assertions on known entities in test corpus
# ----------------------------------------
# Structural checks: one result per input, no empty token lists.
assert len(docs) == len(TEXTS), f"FAIL: expected {len(TEXTS)} docs, got {len(docs)}"
assert len(tokens_per_doc) == len(TEXTS), f"FAIL: token counts missing for some documents"
assert all(t > 0 for t in tokens_per_doc), f"FAIL: some documents have 0 tokens — {tokens_per_doc}"
# Verify known entities from test corpus
all_entities = [e["text"] for entities in entities_per_doc for e in entities]
found = set(all_entities)
assert "Apple Inc." in found or "Apple" in found, f"FAIL: expected 'Apple Inc.' in entities, got {all_entities}"
assert "Steve Jobs" in found, f"FAIL: expected 'Steve Jobs' in entities, got {all_entities}"
assert "Cupertino" in found or "California" in found, f"FAIL: expected location entity, got {all_entities}"
# Verify lemmatization produced output
assert all(lemmas for lemmas in lemmas_per_doc), "FAIL: some documents have empty lemma lists"
# Verify POS tags populated
assert top_pos_tags, "FAIL: no POS tags extracted"
assert "NOUN" in pos_counts or "PROPN" in pos_counts, f"FAIL: expected NOUN or PROPN in POS tags, got {list(pos_counts.keys())}"
# Final report: echo the verified metrics, then emit a structured result
# dict and a final PASS marker.
print()
print(f"POST_EXECUTION: {len(docs)} docs processed ✓")
print(f"POST_EXECUTION: tokens per doc ✓ {tokens_per_doc}")
print(f"POST_EXECUTION: entity extraction verified ✓ ({len(all_entities)} total entities)")
print(f"POST_EXECUTION: lemmatization verified ✓")
print(f"POST_EXECUTION: top POS tags ✓ {top_pos_tags}")
# Machine-readable summary mirroring what the assertions above verified.
result = {
    "status": "pass",
    "model": MODEL_NAME,
    "docs_processed": len(docs),
    "tokens_per_doc": tokens_per_doc,
    "entities_per_doc": entities_per_doc,
    "lemmas_per_doc": lemmas_per_doc,
    "top_pos_tags": top_pos_tags,
    "all_entities": all_entities,
}
print()
print(result)
print()
print("PASS")