# ietf-draft-analyzer/scripts/download-relevant-text.py

#!/usr/bin/env python3
"""Download full text for the 9 classifier-relevant unrated drafts."""

import sqlite3
import time
import sys
sys.path.insert(0, "src")

import httpx
from ietf_analyzer.config import Config

# Open the analyzer database at the path given by the loaded configuration.
cfg = Config.load()
conn = sqlite3.connect(cfg.db_path)
# sqlite3.Row lets the loop below read columns by name (row["rev"], ...).
conn.row_factory = sqlite3.Row

# The 9 relevant drafts from classifier; they are processed in this order.
relevant_names = [
    "draft-bondar-wca",
    "draft-latour-pre-registration",
    "draft-li-trustworthy-routing-discovery",
    "draft-scrm-aiproto-usecases",
    "draft-song-dmsc-problem-statement",
    "draft-wiethuechter-drip-det-moc",
    "draft-wiethuechter-drip-det-tada",
    "draft-zzn-dvs",
    "w3c-cuap",
]

# The try/finally and the `with` block guarantee that the database connection
# and the HTTP client are released even when something escapes the loop
# (for example a sqlite3 error from the SELECT, or Ctrl-C during a fetch).
try:
    with httpx.Client(timeout=30, follow_redirects=True) as client:
        for name in relevant_names:
            row = conn.execute(
                "SELECT name, rev, source, source_url, full_text FROM drafts WHERE name=?",
                (name,),
            ).fetchone()
            if not row:
                print(f"  SKIP {name}: not in DB")
                continue
            if row["full_text"]:
                print(f"  SKIP {name}: already has text")
                continue

            if row["source"] == "w3c":
                # W3C documents are fetched from the URL stored in the DB row.
                url = row["source_url"] or ""
                if not url:
                    print(f"  SKIP {name}: no source_url for W3C doc")
                    continue
            else:
                # Everything else is fetched from the IETF archive; a missing
                # revision falls back to "00".
                rev = row["rev"] or "00"
                url = f"https://www.ietf.org/archive/id/{name}-{rev}.txt"

            print(f"  Fetching {name} from {url}...")
            try:
                resp = client.get(url)
                if resp.status_code == 200:
                    text = resp.text[:500000]  # cap at 500K
                    conn.execute(
                        "UPDATE drafts SET full_text=? WHERE name=?", (text, name)
                    )
                    # Commit per draft so earlier downloads are kept if a
                    # later one fails.
                    conn.commit()
                    print(f"    OK ({len(text)} chars)")
                else:
                    print(f"    FAIL: HTTP {resp.status_code}")
            except Exception as e:
                # Deliberately best-effort: report the failure for this draft
                # and carry on with the next one.
                print(f"    ERROR: {e}")
            # Pause between requests.
            time.sleep(0.5)
finally:
    conn.close()

print("\nDone.")