{"id":"semantic-similarity-search","version":"1.0.0","primitive":"code_execution","description":"Embed documents and find most similar ones using cosine search","registry_refs":["sentence-transformers","faiss-cpu"],"tags":["embeddings","faiss","sentence-transformers","vector-search","cosine-similarity","nlp"],"solves":["wrong faiss index type for cosine","float64 vs float32 dtype errors","embedding shape mismatch"],"auth_required":false,"verified":true,"last_verified":"2026-04-13","next_check":"2026-07-13","eval_result":"null","eval_env":"null","mast":[],"ref":"https://arxiv.org/abs/2503.13657","executable":"# ============================================\n# checklist:     semantic-similarity-search\n# version:       1.0.0\n# primitive:     code_execution\n# description:   Embed documents and find most similar ones using cosine search\n# registry_refs: sentence-transformers, faiss-cpu\n# auth_required: false\n# verified:      true\n# last_verified: 2026-04-13\n# next_check:    2026-07-13\n# eval_result:   null\n# eval_env:      null\n#\n# MAST FAILURE MODES ADDRESSED:\n# FM-1.1 Disobey Task Specification    — correct model load pattern enforced from registry\n# FM-2.2 Fail to Ask for Clarification — hard URLs, no assumptions about install/import\n# FM-2.4 Information Withholding       — breaking registry warnings surfaced before execution\n# FM-2.6 Reasoning-Action Mismatch     — correct faiss index type (IndexFlatIP) enforced\n# FM-3.2 No or Incomplete Verification — embedding shape verified before indexing\n# FM-3.3 Incorrect Verification        — top result must be the query's own document (cosine similarity > 0.99)\n#\n# tags:   embeddings, faiss, sentence-transformers, vector-search, cosine-similarity, nlp\n# solves: wrong faiss index type for cosine, float64 vs float32 dtype errors, embedding shape mismatch\n# ref: https://arxiv.org/abs/2503.13657\n#\n# INPUTS:\n#   EMBEDDING_MODEL — string, sentence-transformers model (default: \"all-MiniLM-L6-v2\")\n#   QUERY           — string, query to find most similar doc (default: \"hello world\")\n#   NOTE: both inputs are currently hard-coded as literals in the script body; edit them there to change the defaults\n#\n# OUTPUTS:\n#   
embedding_shape_verified — bool, embeddings shape matches document count and dim > 0\n#   index_count_verified     — bool, faiss index contains all documents\n#   top_result_correct       — bool, query returned exact self-match\n#   top_similarity_score     — float, cosine similarity of top result (expected > 0.99)\n# ============================================\n\nimport sys\nimport subprocess\nimport requests\n\n# ----------------------------------------\n# PRE_EXECUTION\n# FM-2.2: hard URLs — fetch both registries\n# abort if either is unreachable\n# ----------------------------------------\n\nMAX_RETRIES = 2\n\ndef fetch_registry(slug):\n    for attempt in range(MAX_RETRIES):\n        try:\n            r = requests.get(\n                f\"https://checklist.day/api/registry/{slug}\",\n                timeout=10\n            )\n            if r.status_code == 200:\n                return r.json()\n        except requests.exceptions.RequestException:\n            pass\n    return None\n\nst_registry = fetch_registry(\"sentence-transformers\")\nfaiss_registry = fetch_registry(\"faiss-cpu\")\n\nassert st_registry is not None, \\\n    \"ABORT: sentence-transformers registry fetch failed\"\nassert faiss_registry is not None, \\\n    \"ABORT: faiss-cpu registry fetch failed\"\n\nassert st_registry.get(\"imports\"), \\\n    \"ABORT: imports missing from sentence-transformers registry\"\nassert faiss_registry.get(\"imports\"), \\\n    \"ABORT: imports missing from faiss-cpu registry\"\n\n# FM-2.4: surface breaking warnings\nfor slug, reg in [(\"sentence-transformers\", st_registry), (\"faiss-cpu\", faiss_registry)]:\n    breaking = [w for w in reg.get(\"warnings\", []) if w.get(\"severity\") == \"breaking\"]\n    if breaking:\n        print(f\"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):\")\n        for w in breaking:\n            print(f\"  [!] 
[{w.get('affected_versions', 'all')}] {w['message'][:120]}\")\n            print(f\"      fix: {w['fix'][:100]}\")\n\nprint()\nprint(\"PRE_EXECUTION: both registries verified ✓\")\nprint(f\"  sentence-transformers : {st_registry['install'][0]['cmd']}\")\nprint(f\"  faiss-cpu             : {faiss_registry['install'][0]['cmd']}\")\n\n# ----------------------------------------\n# EXECUTION\n# FM-2.6: use correct import patterns per registry\n# philosophy: get it done — auto-install both\n# ----------------------------------------\n\ndef auto_install(registry):\n    pkg = registry['install'][0]['cmd'].replace(\"pip install \", \"\").strip()\n    print(f\"\\nEXECUTION: {pkg} not found — installing...\")\n    subprocess.check_call(\n        [sys.executable, \"-m\", \"pip\", \"install\", pkg],\n        stdout=subprocess.DEVNULL,\n        stderr=subprocess.DEVNULL\n    )\n    print(f\"EXECUTION: {pkg} installed ✓\")\n\ntry:\n    from sentence_transformers import SentenceTransformer\nexcept ImportError:\n    auto_install(st_registry)\n    from sentence_transformers import SentenceTransformer\n\ntry:\n    import faiss\nexcept ImportError:\n    auto_install(faiss_registry)\n    import faiss\n\nimport numpy as np\n\n# FM-1.1: correct model load — per registry imports\nmodel = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# hello world documents\ndocuments = [\n    \"hello world\",\n    \"machine learning is great\",\n    \"python is a programming language\",\n    \"neural networks learn from data\",\n]\n\nprint()\nprint(\"EXECUTION: encoding documents...\")\nembeddings = model.encode(documents, normalize_embeddings=True)\n\n# FM-3.2: verify embedding shape before indexing\nassert embeddings.shape[0] == len(documents), \\\n    f\"FAIL: expected {len(documents)} embeddings, got {embeddings.shape[0]}\"\nassert embeddings.shape[1] > 0, \\\n    \"FAIL: embedding dimension is 0\"\n\ndim = embeddings.shape[1]\nprint(f\"EXECUTION: embeddings shape {embeddings.shape} ✓\")\n\n# FM-2.6: 
IndexFlatIP for cosine similarity (normalized vectors)\n# do NOT use IndexFlatL2 for cosine — common mistake\nindex = faiss.IndexFlatIP(dim)\nindex.add(embeddings.astype(np.float32))\n\nprint(f\"EXECUTION: faiss index built, {index.ntotal} vectors ✓\")\n\n# ----------------------------------------\n# POST_EXECUTION\n# FM-3.2: verify index has correct count\n# FM-3.3: query \"hello world\" must return itself as top result\n# ----------------------------------------\n\nassert index.ntotal == len(documents), \\\n    f\"FAIL: index has {index.ntotal} vectors, expected {len(documents)}\"\n\n# query with the first document — must return itself\nquery = model.encode([\"hello world\"], normalize_embeddings=True)\ndistances, indices = index.search(query.astype(np.float32), k=1)\n\ntop_index = indices[0][0]\ntop_distance = distances[0][0]\ntop_doc = documents[top_index]\n\nassert top_doc == \"hello world\", \\\n    f\"FAIL: expected 'hello world' as top result, got '{top_doc}'\"\n\nassert top_distance > 0.99, \\\n    f\"FAIL: expected cosine similarity ~1.0, got {top_distance:.4f}\"\n\nprint()\nprint(\"POST_EXECUTION: index count verified ✓\")\nprint(f\"POST_EXECUTION: top result '{top_doc}' (similarity: {top_distance:.4f}) ✓\")\n\nresult = {\n    \"status\": \"pass\",\n    \"embedding_shape_verified\": True,\n    \"index_count_verified\": True,\n    \"top_result_correct\": True,\n    \"top_similarity_score\": float(top_distance),\n}\nprint(result)\nprint(\"PASS\")\n"}