web-scraping-structured-extraction
Fetch a webpage and extract structured data fields from HTML
import sys
import subprocess
# bootstrap: PRE_EXECUTION itself needs requests to fetch the registries,
# so install it here if missing rather than relying on the registry step
try:
    import requests as _requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests as _requests
try:
    import certifi
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "certifi"])
    import certifi
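# why certifi here: requests already bundles a CA store, but passing
# verify=certifi.where() pins an explicit, portable CA bundle for the
# registry fetches below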
# ----------------------------------------
# PRE_EXECUTION
# FM-2.2: fetch both registries
# ----------------------------------------
MAX_RETRIES = 2
def fetch_registry(slug):
for _ in range(MAX_RETRIES):
try:
r = _requests.get(
f"https://checklist.day/api/registry/{slug}",
timeout=10,
verify=certifi.where()
)
if r.status_code == 200:
return r.json()
except _requests.exceptions.RequestException:
pass
return None
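# a hedged variant (defined but not used above): the same fetch with a
# short exponential pause between attempts; the 2 ** attempt delay is an
# illustrative choice, not part of the registry contract
import time

def fetch_registry_backoff(slug, retries=MAX_RETRIES):
    for attempt in range(retries):
        try:
            r = _requests.get(
                f"https://checklist.day/api/registry/{slug}",
                timeout=10,
                verify=certifi.where(),
            )
            if r.status_code == 200:
                return r.json()
        except _requests.exceptions.RequestException:
            pass
        time.sleep(2 ** attempt)  # 1s after the first miss, 2s after the second
    return None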
bs4_registry = fetch_registry("beautifulsoup4")
req_registry = fetch_registry("requests")
assert bs4_registry is not None, \
"ABORT: beautifulsoup4 registry fetch failed"
assert req_registry is not None, \
"ABORT: requests registry fetch failed"
# FM-2.4: surface breaking warnings
for slug, reg in [("beautifulsoup4", bs4_registry), ("requests", req_registry)]:
breaking = [w for w in reg.get("warnings", []) if w.get("severity") == "breaking"]
if breaking:
print(f"PRE_EXECUTION: {slug} — {len(breaking)} breaking warning(s):")
for w in breaking:
print(f" [!] [{w.get('affected_versions', 'all')}] {w['message'][:120]}")
print(f" fix: {w['fix'][:100]}")
print()
print("PRE_EXECUTION: registries verified ✓")
print(f" beautifulsoup4 : {bs4_registry['install'][0]['cmd']}")
print(f" requests : {req_registry['install'][0]['cmd']}")
# ----------------------------------------
# EXECUTION
# FM-2.6: always set timeout on requests
# FM-1.1: always specify parser in BeautifulSoup()
#         omitting the parser triggers a UserWarning and
#         yields non-deterministic parsing across environments
# philosophy: get it done — auto-install
# ----------------------------------------
def auto_install(registry):
pkg = registry['install'][0]['cmd'].replace("pip install ", "").strip()
print(f"\nEXECUTION: {pkg} not found — installing...")
result = subprocess.run(
[sys.executable, "-m", "pip", "install", pkg],
capture_output=True
)
if result.returncode != 0:
subprocess.check_call([sys.executable, "-m", "ensurepip", "--upgrade"])
subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
print(f"EXECUTION: {pkg} installed ✓")
try:
    import requests  # normally succeeds: requests was bootstrapped at the top
except ImportError:
    auto_install(req_registry)
    import requests
try:
from bs4 import BeautifulSoup
except ImportError:
auto_install(bs4_registry)
from bs4 import BeautifulSoup
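# FM-1.1 demonstrated (a side demo, not part of the main flow): building a
# soup without a parser argument emits a warning; on bs4 >= 4.8 it is
# GuessedAtParserWarning, a UserWarning subclass
import warnings
with warnings.catch_warnings(record=True) as _caught:
    warnings.simplefilter("always")
    BeautifulSoup("<p>demo</p>")  # parser intentionally omitted
print(f"EXECUTION: FM-1.1 demo: {len(_caught)} warning(s) when parser omitted")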
# use example.com — stable, minimal, no auth, no rate limits
# FM-2.6: always set timeout — never leave open-ended
response = requests.get("https://example.com", timeout=10)
assert response.status_code == 200, \
f"FAIL: expected 200, got {response.status_code}"
print("EXECUTION: page fetched ✓")
# FM-1.1: always specify parser explicitly
# "html.parser" is stdlib — no extra install needed
# "lxml" is faster but requires separate install
soup = BeautifulSoup(response.text, "html.parser")
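# a minimal parser-selection sketch (not used above): prefer lxml when it
# is importable, otherwise fall back to the stdlib parser
try:
    import lxml  # noqa: F401
    _fast_parser = "lxml"
except ImportError:
    _fast_parser = "html.parser"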
# extract structured fields
title = soup.find("title")
h1 = soup.find("h1")
paragraphs = soup.find_all("p")
links = soup.find_all("a", href=True)
print("EXECUTION: HTML parsed ✓")
# ----------------------------------------
# POST_EXECUTION
# FM-3.2: verify each extracted field
# FM-3.3: assert content, not just existence
# ----------------------------------------
assert title is not None, \
"FAIL: no <title> tag found"
assert title.get_text(strip=True), \
"FAIL: title is empty"
assert h1 is not None, \
"FAIL: no <h1> tag found"
assert h1.get_text(strip=True), \
"FAIL: h1 is empty"
# example.com always has "Example Domain" as h1
assert "Example Domain" in h1.get_text(), \
f"FAIL: expected 'Example Domain' in h1, got '{h1.get_text(strip=True)}'"
assert len(paragraphs) > 0, \
"FAIL: no <p> tags found"
assert len(links) > 0, \
"FAIL: no links found"
# verify link structure — href must be non-empty string
for link in links:
assert link["href"], \
f"FAIL: empty href on link: {link}"
# structured output — what an agent would return
extracted = {
"title": title.get_text(strip=True),
"h1": h1.get_text(strip=True),
"paragraph_count": len(paragraphs),
"links": [{"text": a.get_text(strip=True), "href": a["href"]} for a in links],
}
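# sketch: confirm the payload is JSON-serializable, which is what an agent
# boundary would require before returning it; json is stdlib
import json
assert json.loads(json.dumps(extracted)) == extracted, \
    "FAIL: extracted payload is not JSON round-trippable"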
assert extracted["title"] == "Example Domain", \
f"FAIL: title mismatch: {extracted['title']}"
print()
print("POST_EXECUTION: title verified ✓")
print("POST_EXECUTION: h1 verified ✓")
print(f"POST_EXECUTION: {extracted['paragraph_count']} paragraphs found ✓")
print(f"POST_EXECUTION: {len(extracted['links'])} links extracted ✓")
result = {
"status": "pass",
"page_fetched": True,
"title_verified": True,
"h1_verified": True,
"paragraph_count": extracted["paragraph_count"],
"link_count": len(extracted["links"]),
}
print(result)
print("PASS")