{"id":"sql-natural-language-query","version":"1.0.0","primitive":"tool_calling/api","description":"Convert natural language to SQL, execute against SQLite, return structured results","registry_refs":["openai","sqlalchemy"],"tags":["sql","sqlite","natural-language","text-to-sql","nlp","structured-data","sqlalchemy"],"solves":["SQL injection via LLM-generated queries","no schema grounding causes hallucinated columns","missing result validation","destructive query execution"],"auth_required":false,"verified":true,"last_verified":"2026-04-14","next_check":"2026-07-14","eval_result":"pass","eval_env":"python3.12/linux","mast":["FM-1.1","FM-1.5","FM-2.2","FM-2.4","FM-2.6","FM-3.2","FM-3.3"],"ref":"https://arxiv.org/abs/2503.13657","inputs":[],"executable":"# ============================================\n# checklist:     sql-natural-language-query\n# version:       1.0.0\n# primitive:     tool_calling/api\n# description:   Convert natural language to SQL, execute against SQLite, return structured results\n# registry_refs: openai, sqlalchemy\n# auth_required: false\n# verified:      true\n# last_verified: 2026-04-14\n# next_check:    2026-07-14\n# eval_result:   pass\n# eval_env:      python3.12/linux\n#\n# tags:     sql, sqlite, natural-language, text-to-sql, nlp, structured-data, sqlalchemy\n# solves:   SQL injection via LLM-generated queries, no schema grounding causes hallucinated columns, missing result validation, destructive query execution\n# mast:     FM-1.1, FM-1.5, FM-2.2, FM-2.4, FM-2.6, FM-3.2, FM-3.3\n# ref:      https://arxiv.org/abs/2503.13657\n#\n# INPUTS:\n#   none — uses in-memory SQLite and mock LLM responses (no external API required)\n#\n# OUTPUTS:\n#   count_query_verified  — bool, COUNT(*) returned total=5\n#   filter_query_verified — bool, price > 50 returned 3 correct products\n#   order_query_verified  — bool, ORDER BY DESC returned \"Gadget X\" as most expensive\n# ============================================\n\nimport re\nimport sys\nimport sqlite3\nimport subprocess\nimport requests as _requests\n\n# ----------------------------------------\n# PRE_EXECUTION\n# FM-2.2: fetch ground truth for all registry_refs\n# ----------------------------------------\n\nREGISTRY_REFS = [\"openai\", \"sqlalchemy\"]\nMAX_RETRIES = 2\nregistries = {}\n\nfor lib in REGISTRY_REFS:\n    for attempt in range(MAX_RETRIES):\n        try:\n            response = _requests.get(\n                f\"https://checklist.day/api/registry/{lib}\",\n                timeout=10\n            )\n            if response.status_code == 200:\n                registries[lib] = response.json()\n                break\n        except _requests.exceptions.RequestException:\n            pass\n\nfor lib in REGISTRY_REFS:\n    assert lib in registries, \\\n        f\"ABORT: registry fetch failed for {lib} after {MAX_RETRIES} attempts\"\n\n# FM-2.4: surface breaking warnings\nfor lib, registry in registries.items():\n    breaking = [\n        w for w in registry.get(\"warnings\", [])\n        if w.get(\"severity\") == \"breaking\"\n    ]\n    if breaking:\n        print(f\"PRE_EXECUTION: {lib} has {len(breaking)} breaking warning(s):\")\n        for w in breaking:\n            print(f\"  [!] [{w.get('affected_versions', 'all')}] {w['message'][:120]}\")\n            print(f\"      fix: {w['fix'][:100]}\")\n\nprint()\nprint(\"PRE_EXECUTION: all registry refs verified ✓\")\n\n# ----------------------------------------\n# KNOWN FAILURE MODES\n#\n# 1. No schema grounding — LLM hallucinates column/table names not in the DB\n#    Always inject full schema into the prompt before asking for SQL\n#\n# 2. SQL injection via LLM output — LLM can be prompted to generate DROP TABLE,\n#    DELETE, UPDATE, or stacked queries. Always validate before execution.\n#\n# 3. Executing destructive queries — agents should NEVER run DDL or DML\n#    (DROP, DELETE, UPDATE, INSERT) unless explicitly permitted\n#\n# 4. No result validation — LLM says \"query will return X\" but result is empty\n#    or wrong shape. Always assert on result structure.\n#\n# 5. Missing LIMIT — LLM-generated queries without LIMIT can return millions of rows\n# ----------------------------------------\n\n# Allowlist: only these SQL statement types are permitted for NL→SQL\nALLOWED_STATEMENT_TYPES = {\"SELECT\"}\nMAX_ROWS = 1000  # FM-1.5: hard cap — never return unbounded results\n\n\ndef get_schema(conn: sqlite3.Connection) -> str:\n    \"\"\"\n    FM-2.2: always ground the LLM with the actual schema.\n    Never ask LLM to generate SQL without providing table/column definitions.\n    \"\"\"\n    cursor = conn.cursor()\n    cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table'\")\n    tables = [row[0] for row in cursor.fetchall()]\n\n    schema_parts = []\n    for table in tables:\n        cursor.execute(f\"PRAGMA table_info({table})\")\n        cols = cursor.fetchall()\n        col_defs = \", \".join(f\"{col[1]} {col[2]}\" for col in cols)\n        schema_parts.append(f\"CREATE TABLE {table} ({col_defs})\")\n\n    return \"\\n\".join(schema_parts)\n\n\ndef validate_sql(sql: str) -> str:\n    \"\"\"\n    FM-1.1: validate LLM-generated SQL before execution.\n    Strips markdown fences, checks for destructive statements.\n    \"\"\"\n    # Strip markdown code fences LLMs commonly add\n    sql = re.sub(r\"```sql\\s*\", \"\", sql, flags=re.IGNORECASE)\n    sql = re.sub(r\"```\\s*\", \"\", sql)\n    sql = sql.strip().rstrip(\";\")\n\n    # FM-2.6: only allow SELECT — reject destructive operations\n    first_word = sql.strip().split()[0].upper()\n    assert first_word in ALLOWED_STATEMENT_TYPES, \\\n        f\"ABORT: destructive SQL statement type '{first_word}' not allowed — only SELECT is permitted\"\n\n    # Reject stacked queries (semicolon injection)\n    assert \";\" not in sql, \\\n        \"ABORT: stacked queries detected — possible SQL injection in LLM output\"\n\n    # Inject LIMIT if missing — FM-1.5: never return unbounded results\n    if \"LIMIT\" not in sql.upper():\n        sql = f\"{sql} LIMIT {MAX_ROWS}\"\n\n    return sql\n\n\ndef nl_to_sql(natural_language: str, schema: str) -> str:\n    \"\"\"\n    Converts natural language question to SQL using OpenAI.\n    FM-2.2: schema always injected — LLM never guesses column names.\n\n    In production: replace with your LLM provider call.\n    This checklist uses a mock for auth_required: false execution.\n    \"\"\"\n    # --- MOCK for local execution (no API key required) ---\n    # Replace this block with a real openai call:\n    #\n    # from openai import OpenAI\n    # client = OpenAI()  # reads OPENAI_API_KEY from env\n    # response = client.chat.completions.create(\n    #     model=\"gpt-4o\",\n    #     messages=[\n    #         {\"role\": \"system\", \"content\": (\n    #             \"You are a SQL expert. Given a SQLite schema and a natural language question, \"\n    #             \"return ONLY a valid SQLite SELECT query. No explanation. No markdown. \"\n    #             \"Only use tables and columns that exist in the schema.\"\n    #         )},\n    #         {\"role\": \"user\", \"content\": f\"Schema:\\n{schema}\\n\\nQuestion: {natural_language}\"}\n    #     ],\n    #     temperature=0\n    # )\n    # return response.choices[0].message.content.strip()\n\n    # Mock: deterministic SQL for test assertion\n    mock_responses = {\n        \"how many products are there\": \"SELECT COUNT(*) as total FROM products\",\n        \"show me all products with price above 50\": \"SELECT id, name, price FROM products WHERE price > 50\",\n        \"what is the most expensive product\": \"SELECT name, price FROM products ORDER BY price DESC LIMIT 1\",\n    }\n    key = natural_language.lower().strip(\"?\")\n    if key in mock_responses:\n        return mock_responses[key]\n    return \"SELECT * FROM products\"\n\n\ndef execute_query(conn: sqlite3.Connection, sql: str) -> list[dict]:\n    \"\"\"\n    FM-2.6: use row_factory for dict results — never return raw tuples to agent.\n    FM-3.2: check result shape before returning.\n    \"\"\"\n    conn.row_factory = sqlite3.Row\n    cursor = conn.cursor()\n    cursor.execute(sql)\n    rows = cursor.fetchall()\n    return [dict(row) for row in rows]\n\n\n# ----------------------------------------\n# EXECUTION\n# Set up in-memory SQLite DB, demonstrate NL→SQL pipeline\n# FM-1.1: in-memory DB is ephemeral — idempotent on every run\n# ----------------------------------------\n\nprint()\nprint(\"EXECUTION: setting up in-memory SQLite database...\")\n\nconn = sqlite3.connect(\":memory:\")\nconn.execute(\"\"\"\n    CREATE TABLE products (\n        id      INTEGER PRIMARY KEY,\n        name    TEXT NOT NULL,\n        price   REAL NOT NULL,\n        category TEXT NOT NULL\n    )\n\"\"\")\nconn.executemany(\n    \"INSERT INTO products (name, price, category) VALUES (?, ?, ?)\",\n    [\n        (\"Widget A\", 29.99, \"hardware\"),\n        (\"Widget B\", 79.99, \"hardware\"),\n        (\"Gadget X\", 149.99, \"electronics\"),\n        (\"Gadget Y\", 9.99,  \"electronics\"),\n        (\"Tool Z\",   59.99, \"tools\"),\n    ]\n)\nconn.commit()\nprint(\"EXECUTION: database seeded with 5 products ✓\")\n\n# Ground LLM with actual schema — never skip this step\nschema = get_schema(conn)\nprint(f\"EXECUTION: schema extracted ✓\\n  {schema}\")\n\n# Run three NL queries\ntest_cases = [\n    {\n        \"question\": \"how many products are there\",\n        \"expected_key\": \"total\",\n        \"expected_value\": 5,\n    },\n    {\n        \"question\": \"show me all products with price above 50\",\n        \"expected_count\": 3,  # Widget B, Gadget X, Tool Z\n    },\n    {\n        \"question\": \"what is the most expensive product\",\n        \"expected_name\": \"Gadget X\",\n    },\n]\n\nresults_store = {}\n\nprint()\nfor tc in test_cases:\n    q = tc[\"question\"]\n    print(f\"EXECUTION: '{q}'\")\n\n    raw_sql = nl_to_sql(q, schema)\n    print(f\"  raw SQL  : {raw_sql}\")\n\n    safe_sql = validate_sql(raw_sql)\n    print(f\"  safe SQL : {safe_sql}\")\n\n    rows = execute_query(conn, safe_sql)\n    print(f\"  rows     : {rows}\")\n    results_store[q] = rows\n\nconn.close()\n\n# ----------------------------------------\n# POST_EXECUTION\n# FM-3.2: verify result shape before returning to agent\n# FM-3.3: exact match assertions\n# ----------------------------------------\n\n# Test 1: count\ncount_result = results_store[\"how many products are there\"]\nassert len(count_result) == 1, \\\n    f\"FAIL: expected 1 row for COUNT query, got {len(count_result)}\"\nassert count_result[0][\"total\"] == 5, \\\n    f\"FAIL: expected total=5, got {count_result[0]}\"\n\n# Test 2: filter\nfilter_result = results_store[\"show me all products with price above 50\"]\nassert len(filter_result) == 3, \\\n    f\"FAIL: expected 3 products above $50, got {len(filter_result)}\"\nreturned_names = {r[\"name\"] for r in filter_result}\nassert returned_names == {\"Widget B\", \"Gadget X\", \"Tool Z\"}, \\\n    f\"FAIL: wrong products returned — got {returned_names}\"\n\n# Test 3: ordering\ntop_result = results_store[\"what is the most expensive product\"]\nassert len(top_result) == 1, \\\n    f\"FAIL: expected 1 row, got {len(top_result)}\"\nassert top_result[0][\"name\"] == \"Gadget X\", \\\n    f\"FAIL: expected 'Gadget X', got '{top_result[0]['name']}'\"\n\nprint()\nprint(\"POST_EXECUTION: count query verified ✓  (total=5)\")\nprint(\"POST_EXECUTION: filter query verified ✓  (3 products above $50)\")\nprint(\"POST_EXECUTION: order query verified ✓  (most expensive = Gadget X)\")\n\nresult = {\n    \"status\": \"pass\",\n    \"count_query_verified\": True,\n    \"filter_query_verified\": True,\n    \"order_query_verified\": True,\n}\nprint(result)\nprint(\"PASS\")\n"}