#!/usr/bin/env python3
"""Download full text for the 9 classifier-relevant unrated drafts.

For each draft name listed below, look up its row in the ``drafts`` table,
skip it if it is missing or already has ``full_text``, otherwise fetch the
document over HTTP and store the (truncated) body back into ``full_text``.

Rows whose ``source`` is ``"w3c"`` are fetched from their stored
``source_url``; every other row is fetched from the IETF archive using
``<name>-<rev>.txt``.
"""
import sqlite3
import sys
import time
from contextlib import closing

# Must run before the project import below so that `ietf_analyzer` resolves
# from the local "src" directory.
sys.path.insert(0, "src")

import httpx
from ietf_analyzer.config import Config

# Maximum number of characters of the response body stored per draft.
MAX_TEXT_CHARS = 500000
# Pause after each fetch attempt, in seconds.
FETCH_DELAY_SECONDS = 0.5

# The 9 relevant drafts from classifier
relevant_names = [
    "draft-bondar-wca",
    "draft-latour-pre-registration",
    "draft-li-trustworthy-routing-discovery",
    "draft-scrm-aiproto-usecases",
    "draft-song-dmsc-problem-statement",
    "draft-wiethuechter-drip-det-moc",
    "draft-wiethuechter-drip-det-tada",
    "draft-zzn-dvs",
    "w3c-cuap",
]

cfg = Config.load()
conn = sqlite3.connect(cfg.db_path)
conn.row_factory = sqlite3.Row  # allow column access by name (row["rev"])

# `closing` is used because a sqlite3 connection's own context manager only
# commits/rolls back; it does not close. Both resources are now released even
# if the loop is interrupted or an unexpected error escapes.
with closing(conn), httpx.Client(timeout=30, follow_redirects=True) as client:
    for name in relevant_names:
        row = conn.execute(
            "SELECT name, rev, source, source_url, full_text FROM drafts WHERE name=?",
            (name,),
        ).fetchone()

        if not row:
            print(f" SKIP {name}: not in DB")
            continue
        if row["full_text"]:
            print(f" SKIP {name}: already has text")
            continue

        if row["source"] == "w3c":
            # W3C documents have no archive URL pattern; rely on the stored one.
            url = row["source_url"] or ""
            if not url:
                print(f" SKIP {name}: no source_url for W3C doc")
                continue
        else:
            # A missing revision falls back to "00".
            rev = row["rev"] or "00"
            url = f"https://www.ietf.org/archive/id/{name}-{rev}.txt"

        print(f" Fetching {name} from {url}...")
        try:
            resp = client.get(url)
            if resp.status_code != 200:
                print(f" FAIL: HTTP {resp.status_code}")
            else:
                text = resp.text[:MAX_TEXT_CHARS]  # cap at 500K
                conn.execute(
                    "UPDATE drafts SET full_text=? WHERE name=?", (text, name)
                )
                # Commit per draft so earlier downloads survive a later failure.
                conn.commit()
                print(f" OK ({len(text)} chars)")
        except Exception as e:
            # Deliberately broad: one draft failing (network or DB write)
            # must not abort the remaining downloads.
            print(f" ERROR: {e}")

        time.sleep(FETCH_DELAY_SECONDS)

print("\nDone.")