{"id":"local-nlp-preprocessing","version":"1.0.0","primitive":"code_execution","description":"Preprocess text locally using spaCy — tokenization, NER, POS tagging, lemmatization","registry_refs":["spacy"],"tags":["nlp","spacy","tokenization","ner","pos-tagging","lemmatization","text-preprocessing","local"],"solves":["wrong model name causes OSError","model not downloaded before import","using nlp() on entire corpus at once causes OOM","mixing spaCy v2 and v3 APIs","disabled pipeline components slow down inference"],"auth_required":false,"verified":true,"last_verified":"2026-04-15","next_check":"2026-07-15","eval_result":"pass","eval_env":"python3.12/linux","mast":["FM-1.1","FM-1.5","FM-2.2","FM-2.4","FM-2.6","FM-3.2","FM-3.3"],"ref":"https://arxiv.org/abs/2503.13657","executable":"# ============================================\n# checklist:     local-nlp-preprocessing\n# version:       1.0.0\n# primitive:     code_execution\n# description:   Preprocess text locally using spaCy — tokenization, NER, POS tagging, lemmatization\n# registry_refs: spacy\n# auth_required: false\n# verified:      true\n# last_verified: 2026-04-15\n# next_check:    2026-07-15\n# eval_result:   pass\n# eval_env:      python3.12/linux\n#\n# tags:   nlp, spacy, tokenization, ner, pos-tagging, lemmatization, text-preprocessing, local\n# solves: wrong model name causes OSError, model not downloaded before import, using nlp() on entire corpus at once causes OOM, mixing spaCy v2 and v3 APIs, disabled pipeline components slow down inference\n# mast: FM-1.1, FM-1.5, FM-2.2, FM-2.4, FM-2.6, FM-3.2, FM-3.3\n# ref:  https://arxiv.org/abs/2503.13657\n#\n# INPUTS:\n#   texts           — list[str], input documents to process (default: 3 sample sentences)\n#   MODEL_NAME      — string, spaCy model to use (default: \"en_core_web_sm\")\n#   BATCH_SIZE      — int, number of texts per nlp.pipe batch (default: 32)\n#   DISABLED_PIPES  — list[str], pipeline components to disable for speed (default: [] = all 
enabled)\n#\n# OUTPUTS:\n#   tokens_per_doc     — list[int], token count per document\n#   entities_per_doc   — list[list[dict]], named entities per document\n#   top_pos_tags       — dict, POS tag frequency across all documents\n#   lemmas_per_doc     — list[list[str]], lemmatized tokens per document\n#   result             — dict, full structured result\n# ============================================\n\nimport sys\nimport subprocess\nimport requests as _requests\n\n# ----------------------------------------\n# PRE_EXECUTION\n# FM-2.2: fetch ground truth for registry_refs\n# ----------------------------------------\n\nREGISTRY_REFS = [\"spacy\"]\nMAX_RETRIES = 2\nregistries = {}\n\nfor lib in REGISTRY_REFS:\n    for attempt in range(MAX_RETRIES):\n        try:\n            response = _requests.get(\n                f\"https://checklist.day/api/registry/{lib}\",\n                timeout=10\n            )\n            if response.status_code == 200:\n                registries[lib] = response.json()\n                break\n        except _requests.exceptions.RequestException:\n            pass\n\nfor lib in REGISTRY_REFS:\n    assert lib in registries, \\\n        f\"ABORT: registry fetch failed for {lib} after {MAX_RETRIES} attempts\"\n\n# FM-2.4: surface breaking warnings\nfor lib, registry in registries.items():\n    breaking = [\n        w for w in registry.get(\"warnings\", [])\n        if w.get(\"severity\") == \"breaking\"\n    ]\n    if breaking:\n        print(f\"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):\")\n        for w in breaking:\n            print(f\"  [!] 
[{w.get('affected_versions', 'all')}] {w['message'][:120]}\")\n            print(f\"      fix: {w.get('fix', 'n/a')[:100]}\")\n\nprint()\nprint(\"PRE_EXECUTION: registry refs verified ✓\")\nfor lib, registry in registries.items():\n    install = (registry.get(\"install\") or [{}])[0].get(\"cmd\", \"unknown\")\n    print(f\"  {lib:20s} : {install}\")\n\n# ----------------------------------------\n# KNOWN FAILURE MODES\n#\n# 1. Model not downloaded — spaCy separates library install from model download.\n#    `pip install spacy` does NOT download any model.\n#    Must run: python -m spacy download en_core_web_sm\n#    Or install directly: pip install en-core-web-sm (from spaCy releases)\n#    OSError: [E050] Can't find model 'en_core_web_sm' = model not downloaded.\n#\n# 2. Wrong model name — spaCy v3 uses underscores in model names:\n#    CORRECT:   en_core_web_sm\n#    WRONG:     en_core_web_sm-3.x.x  (don't include version suffix)\n#    WRONG:     en-core-web-sm        (hyphens only for pip install, not nlp.load)\n#\n# 3. nlp(text) on large corpus — calling nlp() in a loop is slow.\n#    FM-1.5: use nlp.pipe() for batched processing — 2-5x faster.\n#    nlp.pipe() is a generator — consume it, don't store raw.\n#\n# 4. Mixing v2 and v3 APIs — spaCy v3 removed several v2 patterns:\n#    REMOVED: doc.ents as list — still works but now returns tuple\n#    REMOVED: nlp.add_pipe('ner') without specifying last/first/before/after\n#    REMOVED: Token.is_oov always False for vectors models\n#\n# 5. Unnecessary pipeline components — loading full pipeline when you only need\n#    tokenization is wasteful. Disable unused components:\n#    nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\", \"parser\"])\n# ----------------------------------------\n\nMODEL_NAME = \"en_core_web_sm\"\nBATCH_SIZE = 32\n\nTEXTS = [\n    \"Apple Inc. 
was founded by Steve Jobs in Cupertino, California in 1976.\",\n    \"The transformer architecture introduced in 'Attention Is All You Need' revolutionized NLP.\",\n    \"Anthropic released Claude 3.5 Sonnet in June 2024, achieving strong benchmark results.\",\n]\n\n# ----------------------------------------\n# EXECUTION\n# FM-2.6: use correct spaCy v3 API patterns\n# FM-1.1: nlp.pipe() is stateless — safe to run multiple times\n# philosophy: auto-install and auto-download model — no asking permission\n# ----------------------------------------\n\n# Step 1: install spaCy if missing\ntry:\n    import spacy\nexcept ImportError:\n    pkg = registries[\"spacy\"][\"install\"][0][\"cmd\"].replace(\"pip install \", \"\").strip()\n    print(f\"\\nEXECUTION: spacy not found — installing {pkg}...\")\n    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", pkg])\n    print(f\"EXECUTION: {pkg} installed ✓\")\n    import spacy\n\nprint(f\"\\nEXECUTION: spaCy version {spacy.__version__} ✓\")\n\n# Step 2: download model if not present\n# FM-2.6: use spacy.util.is_package() to check — not try/except nlp.load()\nif not spacy.util.is_package(MODEL_NAME):\n    print(f\"EXECUTION: model '{MODEL_NAME}' not found — downloading...\")\n    subprocess.check_call(\n        [sys.executable, \"-m\", \"spacy\", \"download\", MODEL_NAME]\n    )\n    print(f\"EXECUTION: model '{MODEL_NAME}' downloaded ✓\")\nelse:\n    print(f\"EXECUTION: model '{MODEL_NAME}' already installed ✓\")\n\n# Step 3: load model\n# FM-2.6: load with model name string, not package import\n# Honor the DISABLED_PIPES input declared in the header — default [] keeps all components enabled\nDISABLED_PIPES = []\nnlp = spacy.load(MODEL_NAME, disable=DISABLED_PIPES)\nprint(f\"EXECUTION: model loaded ✓  (pipeline: {nlp.pipe_names})\")\n\n# Step 4: process texts using nlp.pipe() — FM-1.5: batched, not loop\nprint(f\"\\nEXECUTION: processing {len(TEXTS)} documents via nlp.pipe()...\")\n\ndocs = list(nlp.pipe(TEXTS, batch_size=BATCH_SIZE))\n\nprint(f\"EXECUTION: {len(docs)} documents processed ✓\")\n\n# Step 5: extract features\ntokens_per_doc = 
[]\nentities_per_doc = []\nlemmas_per_doc = []\npos_counts = {}\n\nfor i, doc in enumerate(docs):\n    # Tokens — exclude punctuation and whitespace for cleaner output\n    tokens = [t.text for t in doc if not t.is_punct and not t.is_space]\n    tokens_per_doc.append(len(tokens))\n\n    # Named entities\n    entities = [\n        {\"text\": ent.text, \"label\": ent.label_, \"start\": ent.start_char, \"end\": ent.end_char}\n        for ent in doc.ents\n    ]\n    entities_per_doc.append(entities)\n\n    # Lemmas — exclude stopwords and punctuation\n    lemmas = [t.lemma_.lower() for t in doc if not t.is_stop and not t.is_punct and not t.is_space]\n    lemmas_per_doc.append(lemmas)\n\n    # POS tag frequency\n    for token in doc:\n        if not token.is_space:\n            pos_counts[token.pos_] = pos_counts.get(token.pos_, 0) + 1\n\n    print(f\"  doc[{i}]: {len(tokens)} tokens, {len(entities)} entities, {len(lemmas)} lemmas\")\n    if entities:\n        for ent in entities:\n            print(f\"    [{ent['label']}] {ent['text']}\")\n\ntop_pos_tags = dict(sorted(pos_counts.items(), key=lambda x: x[1], reverse=True)[:5])\n\n# ----------------------------------------\n# POST_EXECUTION\n# FM-3.2: verify counts before asserting\n# FM-3.3: exact assertions on known entities in test corpus\n# ----------------------------------------\n\nassert len(docs) == len(TEXTS), \\\n    f\"FAIL: expected {len(TEXTS)} docs, got {len(docs)}\"\n\nassert len(tokens_per_doc) == len(TEXTS), \\\n    f\"FAIL: token counts missing for some documents\"\n\nassert all(t > 0 for t in tokens_per_doc), \\\n    f\"FAIL: some documents have 0 tokens — {tokens_per_doc}\"\n\n# Verify known entities from test corpus\nall_entities = [e[\"text\"] for entities in entities_per_doc for e in entities]\nassert \"Apple Inc.\" in all_entities or \"Apple\" in all_entities, \\\n    f\"FAIL: expected 'Apple Inc.' 
in entities, got {all_entities}\"\n\nassert \"Steve Jobs\" in all_entities, \\\n    f\"FAIL: expected 'Steve Jobs' in entities, got {all_entities}\"\n\nassert \"Cupertino\" in all_entities or \"California\" in all_entities, \\\n    f\"FAIL: expected location entity, got {all_entities}\"\n\n# Verify lemmatization produced output\nassert all(len(lemmas) > 0 for lemmas in lemmas_per_doc), \\\n    \"FAIL: some documents have empty lemma lists\"\n\n# Verify POS tags populated\nassert len(top_pos_tags) > 0, \\\n    \"FAIL: no POS tags extracted\"\n\nassert \"NOUN\" in pos_counts or \"PROPN\" in pos_counts, \\\n    f\"FAIL: expected NOUN or PROPN in POS tags, got {list(pos_counts.keys())}\"\n\nprint()\nprint(f\"POST_EXECUTION: {len(docs)} docs processed ✓\")\nprint(f\"POST_EXECUTION: tokens per doc ✓  {tokens_per_doc}\")\nprint(f\"POST_EXECUTION: entity extraction verified ✓  ({len(all_entities)} total entities)\")\nprint(f\"POST_EXECUTION: lemmatization verified ✓\")\nprint(f\"POST_EXECUTION: top POS tags ✓  {top_pos_tags}\")\n\nresult = {\n    \"status\": \"pass\",\n    \"model\": MODEL_NAME,\n    \"docs_processed\": len(docs),\n    \"tokens_per_doc\": tokens_per_doc,\n    \"entities_per_doc\": entities_per_doc,\n    \"lemmas_per_doc\": lemmas_per_doc,\n    \"top_pos_tags\": top_pos_tags,\n    \"all_entities\": all_entities,\n}\nprint()\nprint(result)\nprint()\nprint(\"PASS\")\n"}