web-scraping-structured-extraction

code_execution · verified · null · json · download .py

Fetch a webpage and extract structured data fields from HTML

import sys
import subprocess

# Bootstrap the two packages the PRE_EXECUTION phase itself needs.
# NOTE(fix): the original imported requests unconditionally here, which
# made the later auto_install(req_registry) path unreachable — if
# requests was missing the script died on this import before it could
# ever fetch the registry that says how to install it.
try:
    import requests as _requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests as _requests

try:
    import certifi
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "certifi"])
    import certifi

# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch both registries
# ----------------------------------------

# Attempts per registry fetch before fetch_registry gives up and
# returns None.
MAX_RETRIES: int = 2

def fetch_registry(slug):
    """Return the parsed registry JSON for *slug*, or None on failure.

    Tries up to MAX_RETRIES times. Network errors and non-200 responses
    are handled the same way: silently retried, then give up. The
    json() call sits inside the try so requests-level decode errors are
    also swallowed and retried.
    """
    attempt = 0
    while attempt < MAX_RETRIES:
        attempt += 1
        try:
            resp = _requests.get(
                f"https://checklist.day/api/registry/{slug}",
                timeout=10,
                verify=certifi.where(),
            )
            if resp.status_code == 200:
                return resp.json()
        except _requests.exceptions.RequestException:
            # best-effort: swallow and fall through to the next attempt
            pass
    return None

# Fetch both registries up front; everything downstream keys off their
# contents (install commands, breaking-change warnings).
bs4_registry = fetch_registry("beautifulsoup4")
req_registry = fetch_registry("requests")

# Hard abort when either registry is unreachable. (assert-with-message
# is this script's abort/diagnostic convention throughout.)
assert bs4_registry is not None, \
    "ABORT: beautifulsoup4 registry fetch failed"
assert req_registry is not None, \
    "ABORT: requests registry fetch failed"

# FM-2.4: surface breaking warnings
for slug, reg in [("beautifulsoup4", bs4_registry), ("requests", req_registry)]:
    breaking = [w for w in reg.get("warnings", []) if w.get("severity") == "breaking"]
    if breaking:
        print(f"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):")
        for w in breaking:
            # FIX: use .get() like the rest of this loop — the original
            # indexed w['message'] / w['fix'] directly, so a partial
            # warning entry from the remote API crashed the pre-flight
            # with KeyError instead of printing what it had.
            print(f"  [!] [{w.get('affected_versions', 'all')}] {w.get('message', '')[:120]}")
            print(f"      fix: {w.get('fix', '')[:100]}")

print()
print("PRE_EXECUTION: registries verified ✓")
# assumes registry shape {"install": [{"cmd": "pip install <pkg>"}]}
# — TODO confirm against the checklist.day API
print(f"  beautifulsoup4 : {bs4_registry['install'][0]['cmd']}")
print(f"  requests       : {req_registry['install'][0]['cmd']}")

# ----------------------------------------
# EXECUTION
# FM-2.6: always set timeout on requests
# FM-1.1: always specify parser in BeautifulSoup()
#         omitting parser causes UserWarning and
#         non-deterministic behavior across environments
# philosophy: get it done — auto-install
# ----------------------------------------

def auto_install(registry):
    """Install the package named by *registry*'s first install command.

    Expects the registry shape {"install": [{"cmd": "pip install <pkg>"}]}
    (TODO: confirm against the checklist.day API). If the first pip run
    fails, bootstraps pip via ensurepip and retries once; the retry is
    allowed to raise CalledProcessError and abort the script.
    """
    pkg = registry['install'][0]['cmd'].replace("pip install ", "").strip()
    print(f"\nEXECUTION: {pkg} not found — installing...")
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", pkg],
        capture_output=True
    )
    if result.returncode != 0:
        # FIX: surface the captured pip error before the fallback — the
        # original set capture_output=True but never printed the output,
        # leaving install failures undiagnosable.
        if result.stderr:
            print(result.stderr.decode(errors="replace"), file=sys.stderr)
        subprocess.check_call([sys.executable, "-m", "ensurepip", "--upgrade"])
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    print(f"EXECUTION: {pkg} installed ✓")

# "Get it done" philosophy: try the import; on failure install from the
# registry's pinned command and import again. A second ImportError here
# propagates and aborts the run.
try:
    import requests
except ImportError:
    auto_install(req_registry)
    import requests

# bs4 is the import name for the beautifulsoup4 distribution.
try:
    from bs4 import BeautifulSoup
except ImportError:
    auto_install(bs4_registry)
    from bs4 import BeautifulSoup

# Target: example.com — IANA-reserved demo page. Stable content, no
# auth, no rate limiting, safe for automated checks.
page_url = "https://example.com"

# FM-2.6: a timeout is mandatory so a hung server can't stall the run.
response = requests.get(page_url, timeout=10)

assert response.status_code == 200, \
    f"FAIL: expected 200, got {response.status_code}"

print("EXECUTION: page fetched ✓")

# FM-1.1: name the parser explicitly. "html.parser" ships with the
# stdlib; "lxml" is faster but would add a separate install.
soup = BeautifulSoup(response.text, "html.parser")

# Pull out the structured fields the POST_EXECUTION checks rely on.
# href=True restricts to anchors that actually carry an href attribute.
links = soup.find_all("a", href=True)
paragraphs = soup.find_all("p")
h1 = soup.find("h1")
title = soup.find("title")

print("EXECUTION: HTML parsed ✓")

# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify each extracted field
# FM-3.3: assert content, not just existence
# ----------------------------------------

# Existence first, then non-empty content (FM-3.3): a present-but-empty
# tag would otherwise pass silently.
assert title is not None, \
    "FAIL: no <title> tag found"
assert title.get_text(strip=True), \
    "FAIL: title is empty"

assert h1 is not None, \
    "FAIL: no <h1> tag found"
assert h1.get_text(strip=True), \
    "FAIL: h1 is empty"

# example.com always has "Example Domain" as h1
# (substring match tolerates surrounding whitespace in the tag text)
assert "Example Domain" in h1.get_text(), \
    f"FAIL: expected 'Example Domain' in h1, got '{h1.get_text(strip=True)}'"

assert len(paragraphs) > 0, \
    "FAIL: no <p> tags found"

assert len(links) > 0, \
    "FAIL: no links found"

# verify link structure — href must be non-empty string
# (find_all(..., href=True) guarantees presence, not non-emptiness)
for link in links:
    assert link["href"], \
        f"FAIL: empty href on link: {link}"

# Assemble the structured payload an agent caller would consume.
link_records = [
    {"text": a.get_text(strip=True), "href": a["href"]}
    for a in links
]
extracted = {
    "title": title.get_text(strip=True),
    "h1": h1.get_text(strip=True),
    "paragraph_count": len(paragraphs),
    "links": link_records,
}

# Exact-match check on the normalized title (stronger than the earlier
# substring check on h1).
assert extracted["title"] == "Example Domain", \
    f"FAIL: title mismatch: {extracted['title']}"

print()
print("POST_EXECUTION: title verified ✓")
print("POST_EXECUTION: h1 verified ✓")
print(f"POST_EXECUTION: {extracted['paragraph_count']} paragraphs found ✓")
print(f"POST_EXECUTION: {len(extracted['links'])} links extracted ✓")

# Machine-readable summary, mirrored by the trailing PASS sentinel.
result = {
    "status": "pass",
    "page_fetched": True,
    "title_verified": True,
    "h1_verified": True,
    "paragraph_count": extracted["paragraph_count"],
    "link_count": len(extracted["links"]),
}
print(result)
print("PASS")