vector-store-ingest-and-query

code_execution · verified · python3.12/linux · json · download .py

Ingest documents into a vector store and query them by similarity

import sys
import subprocess
import requests

# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: hard URL — no ambiguity, no hallucination
# agent fetches ground truth before any execution
# abort only if registry is truly unreachable
# ----------------------------------------

# Total number of fetch attempts (the first try counts as one).
MAX_RETRIES = 2
registry = None
# Most recent failure; chained onto the abort so the root cause is visible
# in the traceback instead of being silently dropped.
last_error = None

for attempt in range(MAX_RETRIES):
    try:
        response = requests.get(
            "https://checklist.day/api/registry/chromadb",
            timeout=10
        )
        if response.status_code == 200:
            registry = response.json()
            break
        last_error = RuntimeError(
            f"registry returned HTTP {response.status_code}"
        )
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        last_error = exc

# Explicit raises instead of `assert`: assert statements are stripped under
# `python -O`, which would let the script continue with registry = None.
# AssertionError is kept so the failure type and messages stay the same.
if registry is None:
    raise AssertionError(
        f"ABORT: registry fetch failed after {MAX_RETRIES} attempts — check network"
    ) from last_error

# The checks below call .get(), so anything other than a JSON object is
# rejected up front with a clear message.
if not isinstance(registry, dict):
    raise AssertionError("ABORT: registry response is not a JSON object")

for field in ("imports", "install"):
    # Both fields are indexed with [0] later in the script, so an empty
    # list is as unusable as a missing key.
    if not registry.get(field):
        raise AssertionError(f"ABORT: {field} field missing from registry")

# An empty warnings list is accepted: the code that consumes it only filters
# and prints, so there is nothing to abort on. Only a missing/null field fails.
# NOTE(review): confirm the registry never uses an empty list to signal an error.
if registry.get("warnings") is None:
    raise AssertionError("ABORT: warnings field missing from registry")

# FM-2.4: surface breaking warnings — do not withhold
# Keep only the registry warnings whose severity is exactly "breaking".
breaking = [
    entry
    for entry in registry["warnings"]
    if entry.get("severity") == "breaking"
]

if breaking:
    print(f"PRE_EXECUTION: {len(breaking)} breaking warning(s):")
    for entry in breaking:
        # Message and fix are truncated (120 / 100 chars) to keep the
        # report compact; affected_versions falls back to "all".
        versions = entry.get("affected_versions", "all")
        summary = entry["message"][:120]
        print(f"  [!] [{versions}] {summary}")
        remedy = entry["fix"][:100]
        print(f"      fix: {remedy}")

# Summary: echo the first install command and first import symbol.
print()
print("PRE_EXECUTION: registry verified ✓")
install_cmd = registry["install"][0]["cmd"]
print(f"  install : {install_cmd}")
import_symbol = registry["imports"][0]["symbol"]
print(f"  symbol  : {import_symbol}")

# ----------------------------------------
# EXECUTION
# FM-2.6: use correct client per registry imports
# FM-1.1: get_or_create_collection is idempotent
# philosophy: get it done — auto-install, no asking
# ----------------------------------------

try:
    import chromadb
except ImportError:
    # Local import: shlex is only needed on the auto-install path.
    import shlex

    # NOTE(review): the package spec comes straight from the remote registry
    # response and is installed without confirmation — confirm the registry
    # is trusted before relying on this.
    install_cmd = registry['install'][0]['cmd']

    # Tokenise the command the way a shell would so every requirement or
    # flag becomes its own argv entry; handing pip "a b" (or a quoted
    # specifier) as one argument makes it reject the requirement.
    install_args = shlex.split(install_cmd)
    if "install" in install_args:
        # Drop the launcher prefix ("pip install", "python -m pip install",
        # ...) because pip is invoked through sys.executable below.
        install_args = install_args[install_args.index("install") + 1:]
    if not install_args:
        raise RuntimeError(
            f"registry install command names no package: {install_cmd!r}"
        )

    # Human-readable form for the progress messages; identical to the
    # previous output for a plain "pip install <package>" command.
    pkg = " ".join(install_args)
    print(f"\nEXECUTION: chromadb not found — installing {pkg}...")
    # check_call raises CalledProcessError if pip exits non-zero.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", *install_args]
    )
    print(f"EXECUTION: {pkg} installed ✓")
    import chromadb

# FM-2.6: EphemeralClient is correct — per registry imports
client = chromadb.EphemeralClient()

# FM-1.1: get_or_create is idempotent — safe on retry
collection = client.get_or_create_collection(name="checklist_test")

# Ingest the single probe document; the POST_EXECUTION checks query for
# exactly this id/text pair.
collection.add(ids=["doc1"], documents=["hello world"])

print()
print("EXECUTION: document ingested ✓")

# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify count before querying
# FM-3.3: exact match — 100% == PASS
# ----------------------------------------

# Explicit raises rather than `assert`: under `python -O` assert statements
# are removed, and this script would print PASS without checking anything.
# AssertionError is kept so the failure type and messages are unchanged.

# FM-3.2: the collection must hold exactly the one ingested document.
count = collection.count()
if count != 1:
    raise AssertionError(
        f"FAIL: document count mismatch — expected 1, got {count}"
    )

results = collection.query(
    query_texts=["hello world"],
    n_results=1
)

# One query text and n_results=1 were requested, so [0][0] picks the single
# returned hit (presumably outer index = query, inner = rank — confirm
# against the Chroma query result layout).
returned_doc = results["documents"][0][0]
returned_id = results["ids"][0][0]

# FM-3.3: exact match on both the stored text and its id.
if returned_doc != "hello world":
    raise AssertionError(
        f"FAIL: expected 'hello world', got '{returned_doc}'"
    )

if returned_id != "doc1":
    raise AssertionError(
        f"FAIL: expected 'doc1', got '{returned_id}'"
    )

print()
print("POST_EXECUTION: count verified ✓")
print("POST_EXECUTION: exact match verified ✓")

# Only reached when every check above passed, so all flags are True.
result = {
    "status": "pass",
    "document_ingested": True,
    "count_verified": True,
    "exact_match_verified": True,
}
print(result)
print("PASS")