{"id":"web-scraping-structured-extraction","version":"1.0.0","primitive":"code_execution","description":"Fetch a webpage and extract structured data fields from HTML","registry_refs":["beautifulsoup4","requests"],"tags":["beautifulsoup4","requests","scraping","html-parsing","structured-extraction","web"],"solves":["missing parser arg causing non-deterministic behavior","hanging requests without timeout"],"auth_required":false,"verified":true,"last_verified":"2026-04-13","next_check":"2026-07-13","eval_result":null,"eval_env":null,"mast":[],"ref":"https://arxiv.org/abs/2503.13657","inputs":[],"executable":"# ============================================\n# checklist:     web-scraping-structured-extraction\n# version:       1.0.0\n# primitive:     code_execution\n# description:   Fetch a webpage and extract structured data fields from HTML\n# registry_refs: beautifulsoup4, requests\n# auth_required: false\n# verified:      true\n# last_verified: 2026-04-13\n# next_check:    2026-07-13\n# eval_result:   null\n# eval_env:      null\n#\n# MAST FAILURE MODES ADDRESSED:\n# FM-1.1 Disobey Task Specification    — explicit parser arg prevents BS4 warning/fallback\n# FM-2.2 Fail to Ask for Clarification — hard registry URLs, no guessing install names\n# FM-2.6 Reasoning-Action Mismatch     — requests timeout always set, no hanging calls\n# FM-3.2 No or Incomplete Verification — extracted fields validated, not just \"got response\"\n# FM-3.3 Incorrect Verification        — assert content, not just type\n#\n# tags:   beautifulsoup4, requests, scraping, html-parsing, structured-extraction, web\n# solves: missing parser arg causing non-deterministic behavior, hanging requests without timeout\n# ref: https://arxiv.org/abs/2503.13657\n#\n# INPUTS:\n#   TEST_URL — string, URL to scrape (default: \"https://example.com\")\n#\n# OUTPUTS:\n#   page_fetched    — bool, HTTP 200 received\n#   title_verified  — bool, <title> matches \"Example Domain\"\n#   h1_verified     — bool, <h1> matches \"Example Domain\"\n#   paragraph_count — int, number of <p> tags found\n#   link_count      — int, number of <a href> tags found\n# ============================================\n\nimport sys\nimport subprocess\nimport requests as _requests\n\ntry:\n    import certifi\nexcept ImportError:\n    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"certifi\"])\n    import certifi\n\n# ----------------------------------------\n# PRE_EXECUTION\n# FM-2.2: fetch both registries\n# ----------------------------------------\n\nMAX_RETRIES = 2\n\ndef fetch_registry(slug):\n    for _ in range(MAX_RETRIES):\n        try:\n            r = _requests.get(\n                f\"https://checklist.day/api/registry/{slug}\",\n                timeout=10,\n                verify=certifi.where()\n            )\n            if r.status_code == 200:\n                return r.json()\n        except _requests.exceptions.RequestException:\n            pass\n    return None\n\nbs4_registry = fetch_registry(\"beautifulsoup4\")\nreq_registry = fetch_registry(\"requests\")\n\nassert bs4_registry is not None, \\\n    \"ABORT: beautifulsoup4 registry fetch failed\"\nassert req_registry is not None, \\\n    \"ABORT: requests registry fetch failed\"\n\n# FM-2.4: surface breaking warnings\nfor slug, reg in [(\"beautifulsoup4\", bs4_registry), (\"requests\", req_registry)]:\n    breaking = [w for w in reg.get(\"warnings\", []) if w.get(\"severity\") == \"breaking\"]\n    if breaking:\n        print(f\"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):\")\n        for w in breaking:\n            print(f\"  [!] [{w.get('affected_versions', 'all')}] {w['message'][:120]}\")\n            print(f\"      fix: {w['fix'][:100]}\")\n\nprint()\nprint(\"PRE_EXECUTION: registries verified ✓\")\nprint(f\"  beautifulsoup4 : {bs4_registry['install'][0]['cmd']}\")\nprint(f\"  requests       : {req_registry['install'][0]['cmd']}\")\n\n# ----------------------------------------\n# EXECUTION\n# FM-2.6: always set timeout on requests\n# FM-1.1: always specify parser in BeautifulSoup()\n#         omitting parser causes UserWarning and\n#         non-deterministic behavior across environments\n# philosophy: get it done — auto-install\n# ----------------------------------------\n\ndef auto_install(registry):\n    pkg = registry['install'][0]['cmd'].replace(\"pip install \", \"\").strip()\n    print(f\"\\nEXECUTION: {pkg} not found — installing...\")\n    result = subprocess.run(\n        [sys.executable, \"-m\", \"pip\", \"install\", pkg],\n        capture_output=True\n    )\n    if result.returncode != 0:\n        subprocess.check_call([sys.executable, \"-m\", \"ensurepip\", \"--upgrade\"])\n        subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", pkg])\n    print(f\"EXECUTION: {pkg} installed ✓\")\n\ntry:\n    import requests\nexcept ImportError:\n    auto_install(req_registry)\n    import requests\n\ntry:\n    from bs4 import BeautifulSoup\nexcept ImportError:\n    auto_install(bs4_registry)\n    from bs4 import BeautifulSoup\n\n# use example.com — stable, minimal, no auth, no rate limits\n# FM-2.6: always set timeout — never leave open-ended\nresponse = requests.get(\"https://example.com\", timeout=10)\n\nassert response.status_code == 200, \\\n    f\"FAIL: expected 200, got {response.status_code}\"\n\nprint(\"EXECUTION: page fetched ✓\")\n\n# FM-1.1: always specify parser explicitly\n# \"html.parser\" is stdlib — no extra install needed\n# \"lxml\" is faster but requires separate install\nsoup = BeautifulSoup(response.text, \"html.parser\")\n\n# extract structured fields\ntitle = soup.find(\"title\")\nh1 = soup.find(\"h1\")\nparagraphs = soup.find_all(\"p\")\nlinks = soup.find_all(\"a\", href=True)\n\nprint(\"EXECUTION: HTML parsed ✓\")\n\n# ----------------------------------------\n# POST_EXECUTION\n# FM-3.2: verify each extracted field\n# FM-3.3: assert content, not just existence\n# ----------------------------------------\n\nassert title is not None, \\\n    \"FAIL: no <title> tag found\"\nassert title.get_text(strip=True), \\\n    \"FAIL: title is empty\"\n\nassert h1 is not None, \\\n    \"FAIL: no <h1> tag found\"\nassert h1.get_text(strip=True), \\\n    \"FAIL: h1 is empty\"\n\n# example.com always has \"Example Domain\" as h1\nassert \"Example Domain\" in h1.get_text(), \\\n    f\"FAIL: expected 'Example Domain' in h1, got '{h1.get_text(strip=True)}'\"\n\nassert len(paragraphs) > 0, \\\n    \"FAIL: no <p> tags found\"\n\nassert len(links) > 0, \\\n    \"FAIL: no links found\"\n\n# verify link structure — href must be non-empty string\nfor link in links:\n    assert link[\"href\"], \\\n        f\"FAIL: empty href on link: {link}\"\n\n# structured output — what an agent would return\nextracted = {\n    \"title\": title.get_text(strip=True),\n    \"h1\": h1.get_text(strip=True),\n    \"paragraph_count\": len(paragraphs),\n    \"links\": [{\"text\": a.get_text(strip=True), \"href\": a[\"href\"]} for a in links],\n}\n\nassert extracted[\"title\"] == \"Example Domain\", \\\n    f\"FAIL: title mismatch: {extracted['title']}\"\n\nprint()\nprint(\"POST_EXECUTION: title verified ✓\")\nprint(\"POST_EXECUTION: h1 verified ✓\")\nprint(f\"POST_EXECUTION: {extracted['paragraph_count']} paragraphs found ✓\")\nprint(f\"POST_EXECUTION: {len(extracted['links'])} links extracted ✓\")\n\nresult = {\n    \"status\": \"pass\",\n    \"page_fetched\": True,\n    \"title_verified\": True,\n    \"h1_verified\": True,\n    \"paragraph_count\": extracted[\"paragraph_count\"],\n    \"link_count\": len(extracted[\"links\"]),\n}\nprint(result)\nprint(\"PASS\")\n"}