# semantic-similarity-search
#
# Embed documents and find the most similar ones using cosine similarity search.

import sys
import subprocess
import requests

# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: hard URLs — fetch both registries
# abort if either is unreachable
# ----------------------------------------

MAX_RETRIES = 2

def fetch_registry(slug):
    """Fetch the package registry JSON for *slug* from checklist.day.

    Retries up to MAX_RETRIES times on network errors, non-200 responses,
    or malformed JSON bodies. Returns the decoded dict on success, or
    None if every attempt fails (callers abort on None).
    """
    for _attempt in range(MAX_RETRIES):
        try:
            r = requests.get(
                f"https://checklist.day/api/registry/{slug}",
                timeout=10
            )
            if r.status_code == 200:
                # r.json() raises ValueError (JSONDecodeError) on a
                # malformed body; treat that as a failed attempt instead
                # of letting it escape the retry loop.
                return r.json()
        except (requests.exceptions.RequestException, ValueError):
            pass
    return None

st_registry = fetch_registry("sentence-transformers")
faiss_registry = fetch_registry("faiss-cpu")

# Verify both registries: reachability first, then payload completeness.
# Two passes keep the original fail-fast ordering of the checks.
_registries = (
    ("sentence-transformers", st_registry),
    ("faiss-cpu", faiss_registry),
)
for _slug, _reg in _registries:
    assert _reg is not None, f"ABORT: {_slug} registry fetch failed"
for _slug, _reg in _registries:
    assert _reg.get("imports"), f"ABORT: imports missing from {_slug} registry"

# FM-2.4: surface breaking warnings
for slug, reg in [("sentence-transformers", st_registry), ("faiss-cpu", faiss_registry)]:
    breaking = [w for w in reg.get("warnings", []) if w.get("severity") == "breaking"]
    if breaking:
        print(f"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):")
        for w in breaking:
            # use .get() throughout: a warning entry missing 'message' or
            # 'fix' previously raised KeyError mid-report, while the
            # sibling lookups already tolerated missing keys
            print(f"  [!] [{w.get('affected_versions', 'all')}] {w.get('message', '')[:120]}")
            print(f"      fix: {w.get('fix', '')[:100]}")

print()
print("PRE_EXECUTION: both registries verified ✓")
# the asserts above only verified the 'imports' key, so guard the
# install-command lookup instead of letting a missing/empty 'install'
# list raise KeyError/IndexError here. Output is unchanged when the
# key is present ('sentence-transformers' is 21 chars wide).
for slug, reg in [("sentence-transformers", st_registry), ("faiss-cpu", faiss_registry)]:
    cmd = (reg.get("install") or [{}])[0].get("cmd", "<no install command>")
    print(f"  {slug:<21} : {cmd}")

# ----------------------------------------
# EXECUTION
# FM-2.6: use correct import patterns per registry
# philosophy: get it done — auto-install both
# ----------------------------------------

def auto_install(registry):
    pkg = registry['install'][0]['cmd'].replace("pip install ", "").strip()
    print(f"\nEXECUTION: {pkg} not found — installing...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", pkg],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    print(f"EXECUTION: {pkg} installed ✓")

# EAFP import-or-install: try the registry-documented import first and
# fall back to auto_install() only on ImportError, then re-import.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    auto_install(st_registry)
    from sentence_transformers import SentenceTransformer

try:
    import faiss
except ImportError:
    auto_install(faiss_registry)
    import faiss

# numpy is needed below for the float32 casts; imported here (not at the
# top of the file) so it follows the auto-install fallbacks above.
import numpy as np

# FM-2.6: correct model load — per registry imports
# NOTE(review): downloads model weights on first run — network required.
model = SentenceTransformer("all-MiniLM-L6-v2")

# hello world documents
documents = [
    "hello world",
    "machine learning is great",
    "python is a programming language",
    "neural networks learn from data",
]

print()
print("EXECUTION: encoding documents...")
# normalize_embeddings=True gives unit-length vectors, so the inner
# product below equals cosine similarity.
embeddings = model.encode(documents, normalize_embeddings=True)

# FM-3.2: verify embedding shape before indexing
# (one row per document, non-degenerate embedding dimension)
assert embeddings.shape[0] == len(documents), \
    f"FAIL: expected {len(documents)} embeddings, got {embeddings.shape[0]}"
assert embeddings.shape[1] > 0, \
    "FAIL: embedding dimension is 0"

dim = embeddings.shape[1]
print(f"EXECUTION: embeddings shape {embeddings.shape} ✓")

# FM-2.6: IndexFlatIP for cosine similarity (normalized vectors)
# do NOT use IndexFlatL2 for cosine — common mistake
index = faiss.IndexFlatIP(dim)
# faiss indexes store float32; cast explicitly before adding
index.add(embeddings.astype(np.float32))

print(f"EXECUTION: faiss index built, {index.ntotal} vectors ✓")

# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify index has correct count
# FM-3.3: query "hello world" must return itself as top result
# ----------------------------------------

n_docs = len(documents)
assert index.ntotal == n_docs, \
    f"FAIL: index has {index.ntotal} vectors, expected {n_docs}"

# re-encode the first document and search for its single nearest
# neighbor — it must retrieve itself with near-perfect similarity
query_vec = model.encode(["hello world"], normalize_embeddings=True).astype(np.float32)
scores, ids = index.search(query_vec, k=1)

best_id = ids[0][0]
best_score = scores[0][0]
best_doc = documents[best_id]

assert best_doc == "hello world", \
    f"FAIL: expected 'hello world' as top result, got '{best_doc}'"

assert best_score > 0.99, \
    f"FAIL: expected cosine similarity ~1.0, got {best_score:.4f}"

print()
print("POST_EXECUTION: index count verified ✓")
print(f"POST_EXECUTION: top result '{best_doc}' (similarity: {best_score:.4f}) ✓")

result = {
    "status": "pass",
    "embedding_shape_verified": True,
    "index_count_verified": True,
    "top_result_correct": True,
    "top_similarity_score": float(best_score),
}
print(result)
print("PASS")