Run pipeline, write Post 08, commit untracked files
Pipeline: - Extract ideas for 38 new drafts → 462 ideas total - Convergence analysis: 132 cross-org convergent ideas (33% rate) - Fetch authors for 102 drafts → 709 authors (up from 403) - Refresh gap analysis: 12 gaps across full 474-draft corpus - Update verified counts with new totals Post 08: - Complete rewrite of "Agents Building the Agent Analysis" (2,953 words) - Covers 3 phases: writing team → review cycle → fix cycle - Meta-irony table mapping team coordination to IETF gap names - Specific examples from dev journal (SQL injection, consent conflation, ideas mismatch) Untracked files committed: - scripts/: backfill-wg-names, classify-unrated, compare-classifiers, download-relevant-text, run-webui - src/ietf_analyzer/classifier.py: two-stage Ollama classifier - src/webui/: analytics (GDPR-compliant), auth, obsidian_export - tests/test_obsidian_export.py (10 tests) - data/reports/: wg-analysis, generated draft for gap #37 Housekeeping: - .gitignore: exclude LaTeX artifacts, stale DBs, analytics.db Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
65
scripts/download-relevant-text.py
Normal file
65
scripts/download-relevant-text.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
"""Download full text for the 9 classifier-relevant unrated drafts.

For every draft in ``relevant_names`` that exists in the DB but has no
stored ``full_text`` yet, fetch the document (the IETF archive for
Internet-Drafts, the stored ``source_url`` for W3C docs), cap it at
``MAX_TEXT_CHARS`` characters, and write it back into the ``drafts`` table.
Failures are logged and skipped so one bad draft never aborts the run.
"""

import sqlite3
import sys
import time

sys.path.insert(0, "src")

import httpx

from ietf_analyzer.config import Config

# The 9 relevant drafts from the classifier.
relevant_names = [
    "draft-bondar-wca",
    "draft-latour-pre-registration",
    "draft-li-trustworthy-routing-discovery",
    "draft-scrm-aiproto-usecases",
    "draft-song-dmsc-problem-statement",
    "draft-wiethuechter-drip-det-moc",
    "draft-wiethuechter-drip-det-tada",
    "draft-zzn-dvs",
    "w3c-cuap",
]

# Cap stored text so a single huge document cannot bloat the DB.
MAX_TEXT_CHARS = 500_000


def _resolve_url(row: sqlite3.Row) -> str | None:
    """Return the download URL for *row*, or ``None`` if none can be built.

    W3C documents are not hosted on the IETF archive, so they need an
    explicit ``source_url``; IETF drafts are fetched from the archive by
    name and revision (defaulting to revision "00" when unknown).
    """
    if row["source"] == "w3c":
        return row["source_url"] or None
    rev = row["rev"] or "00"
    return f"https://www.ietf.org/archive/id/{row['name']}-{rev}.txt"


def main() -> None:
    """Fetch and store full text for each relevant draft missing it."""
    cfg = Config.load()
    conn = sqlite3.connect(cfg.db_path)
    conn.row_factory = sqlite3.Row
    client = httpx.Client(timeout=30, follow_redirects=True)
    try:
        for name in relevant_names:
            row = conn.execute(
                "SELECT name, rev, source, source_url, full_text "
                "FROM drafts WHERE name=?",
                (name,),
            ).fetchone()
            if not row:
                print(f"  SKIP {name}: not in DB")
                continue
            if row["full_text"]:
                print(f"  SKIP {name}: already has text")
                continue

            url = _resolve_url(row)
            if url is None:
                print(f"  SKIP {name}: no source_url for W3C doc")
                continue

            print(f"  Fetching {name} from {url}...")
            try:
                resp = client.get(url)
                if resp.status_code == 200:
                    text = resp.text[:MAX_TEXT_CHARS]
                    conn.execute(
                        "UPDATE drafts SET full_text=? WHERE name=?",
                        (text, name),
                    )
                    conn.commit()
                    print(f"    OK ({len(text)} chars)")
                else:
                    print(f"    FAIL: HTTP {resp.status_code}")
            except Exception as e:
                # Best-effort batch job: log and move on to the next draft.
                print(f"    ERROR: {e}")
            time.sleep(0.5)  # be polite to the remote servers
    finally:
        # Always release resources, even if a fetch raised unexpectedly.
        client.close()
        conn.close()
    print("\nDone.")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user