"""semantic-similarity-search

Embed documents and find the most similar ones using cosine search
(sentence-transformers embeddings + a faiss inner-product index).
"""
import sys
import subprocess
import requests
# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: hard URLs — fetch both registries
# abort if either is unreachable
# ----------------------------------------
# Number of fetch attempts per registry before giving up.
MAX_RETRIES = 2


def fetch_registry(slug):
    """Fetch a package registry document from checklist.day.

    Parameters
    ----------
    slug : str
        Registry identifier, e.g. ``"faiss-cpu"``.

    Returns
    -------
    dict | None
        Parsed JSON registry on success; ``None`` after ``MAX_RETRIES``
        failed attempts (network error, non-200 status, or a 200 response
        whose body is not valid JSON).
    """
    for _attempt in range(MAX_RETRIES):
        try:
            r = requests.get(
                f"https://checklist.day/api/registry/{slug}",
                timeout=10
            )
            if r.status_code == 200:
                # r.json() raises ValueError (json.JSONDecodeError) on a
                # malformed body — treat that as a failed attempt and
                # retry instead of crashing the pre-check.
                return r.json()
        except (requests.exceptions.RequestException, ValueError):
            # Best-effort retry; callers abort on a None return, so
            # swallowing the per-attempt error here is deliberate.
            pass
    return None
# PRE_EXECUTION: fetch both registries up front; fetch_registry retries
# internally and returns a parsed dict or None.
st_registry = fetch_registry("sentence-transformers")
faiss_registry = fetch_registry("faiss-cpu")
# Abort ordering is deliberate: fetch failures are reported before schema
# problems, and sentence-transformers before faiss-cpu.
# NOTE(review): assert is stripped under `python -O`; these asserts are the
# script's abort mechanism, so do not run this with -O.
assert st_registry is not None, \
    "ABORT: sentence-transformers registry fetch failed"
assert faiss_registry is not None, \
    "ABORT: faiss-cpu registry fetch failed"
# Minimal schema check: a registry without an "imports" section is unusable
# for FM-2.6 (correct import patterns) below.
assert st_registry.get("imports"), \
    "ABORT: imports missing from sentence-transformers registry"
assert faiss_registry.get("imports"), \
    "ABORT: imports missing from faiss-cpu registry"
# FM-2.4: surface breaking warnings
for slug, reg in [("sentence-transformers", st_registry), ("faiss-cpu", faiss_registry)]:
    # Only "breaking" severity is surfaced; lower-severity warnings are ignored.
    breaking = [w for w in reg.get("warnings", []) if w.get("severity") == "breaking"]
    if breaking:
        print(f"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):")
        for w in breaking:
            # Warning entries are external data whose per-entry schema is never
            # validated — use .get() with defaults so a missing 'message'/'fix'
            # key degrades to an empty field instead of a KeyError.
            # Truncation ([:120] / [:100]) keeps the report one line per field.
            print(f" [!] [{w.get('affected_versions', 'all')}] {w.get('message', '')[:120]}")
            print(f" fix: {w.get('fix', '')[:100]}")
        print()
print("PRE_EXECUTION: both registries verified ✓")
# NOTE(review): assumes each registry has a non-empty 'install' list whose
# first entry carries a 'cmd' string — only 'imports' was validated above.
print(f" sentence-transformers : {st_registry['install'][0]['cmd']}")
print(f" faiss-cpu : {faiss_registry['install'][0]['cmd']}")
# ----------------------------------------
# EXECUTION
# FM-2.6: use correct import patterns per registry
# philosophy: get it done — auto-install both
# ----------------------------------------
def auto_install(registry):
    """Install the package(s) named in the registry's first install command.

    Parameters
    ----------
    registry : dict
        Registry document whose ``install`` list's first entry has a ``cmd``
        string such as ``"pip install faiss-cpu"``.

    Raises
    ------
    subprocess.CalledProcessError
        If pip exits non-zero (pip's output is suppressed, so this exception
        is the only failure signal).
    """
    # Token-split instead of str.replace("pip install ", ""): tolerates
    # irregular whitespace and keeps multiple package arguments as separate
    # pip args rather than fusing them into one malformed package name.
    parts = registry['install'][0]['cmd'].split()
    pkg_args = parts[2:] if parts[:2] == ["pip", "install"] else parts
    pkg = " ".join(pkg_args)
    print(f"\nEXECUTION: {pkg} not found — installing...")
    # sys.executable -m pip guarantees we install into the interpreter that
    # is actually running this script.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", *pkg_args],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    print(f"EXECUTION: {pkg} installed ✓")
# EXECUTION: import-or-install fallbacks (FM-2.6 import patterns).
# Each library is tried first; only an ImportError triggers auto_install,
# after which the import is retried once — a second ImportError at that
# point propagates and aborts the script.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    auto_install(st_registry)
    from sentence_transformers import SentenceTransformer
try:
    import faiss
except ImportError:
    auto_install(faiss_registry)
    import faiss
# numpy is a transitive dependency of both libraries above, so it is safe
# to import only after the install fallbacks have run.
import numpy as np
# FM-2.6: correct model load — per registry imports
# all-MiniLM-L6-v2 downloads on first use; its output dimension is read
# from the embeddings below rather than hard-coded.
model = SentenceTransformer("all-MiniLM-L6-v2")
# hello world documents — tiny fixed corpus; the first entry doubles as
# the POST_EXECUTION query and must come back as its own top hit.
documents = [
    "hello world",
    "machine learning is great",
    "python is a programming language",
    "neural networks learn from data",
]
print()
print("EXECUTION: encoding documents...")
# normalize_embeddings=True yields unit vectors, so inner product below
# equals cosine similarity.
embeddings = model.encode(documents, normalize_embeddings=True)
# FM-3.2: verify embedding shape before indexing
assert embeddings.shape[0] == len(documents), \
    f"FAIL: expected {len(documents)} embeddings, got {embeddings.shape[0]}"
assert embeddings.shape[1] > 0, \
    "FAIL: embedding dimension is 0"
dim = embeddings.shape[1]
print(f"EXECUTION: embeddings shape {embeddings.shape} ✓")
# FM-2.6: IndexFlatIP for cosine similarity (normalized vectors)
# do NOT use IndexFlatL2 for cosine — common mistake
index = faiss.IndexFlatIP(dim)
# faiss indexes require float32; astype() guards against a float64 encode.
index.add(embeddings.astype(np.float32))
print(f"EXECUTION: faiss index built, {index.ntotal} vectors ✓")
# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify index has correct count
# FM-3.3: query "hello world" must return itself as top result
# ----------------------------------------
# FM-3.2: every encoded document must have landed in the index.
assert index.ntotal == len(documents), \
    f"FAIL: index has {index.ntotal} vectors, expected {len(documents)}"
# query with the first document — must return itself
# (encoded with the same normalization as the corpus, so self-similarity ~1.0)
query = model.encode(["hello world"], normalize_embeddings=True)
distances, indices = index.search(query.astype(np.float32), k=1)
top_index = indices[0][0]
top_distance = distances[0][0]
top_doc = documents[top_index]
assert top_doc == "hello world", \
    f"FAIL: expected 'hello world' as top result, got '{top_doc}'"
# IndexFlatIP's "distance" is the inner product; for unit vectors that is
# cosine similarity, so an exact match scores ~1.0 (0.99 allows float error).
assert top_distance > 0.99, \
    f"FAIL: expected cosine similarity ~1.0, got {top_distance:.4f}"
print()
print("POST_EXECUTION: index count verified ✓")
print(f"POST_EXECUTION: top result '{top_doc}' (similarity: {top_distance:.4f}) ✓")
# Machine-readable summary; top_distance is a numpy scalar, hence the
# explicit float() cast.
result = {
    "status": "pass",
    "embedding_shape_verified": True,
    "index_count_verified": True,
    "top_result_correct": True,
    "top_similarity_score": float(top_distance),
}
print(result)
print("PASS")