v0.3.0: Gap-to-Draft pipeline, Living Standards Observatory, blog series

Gap-to-Draft Pipeline (ietf pipeline):
- Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision
- Generator produces outlines + sections using rich context with Claude
- Quality gates: novelty (embedding similarity), references, format, self-rating
- Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE)
- I-D formatter with proper headers, references, 72-char wrapping

Living Standards Observatory (ietf observatory):
- Source abstraction with IETF + W3C fetchers
- 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record
- Static GitHub Pages dashboard (explorer, gap tracker, timeline)
- Weekly CI/CD automation via GitHub Actions

Also includes:
- 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps
- Blog series (8 posts planned), reports, arXiv paper figures
- Agent team infrastructure (CLAUDE.md, scripts, dev journal)
- 5 new DB tables, schema migration, ~15 new query methods

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-04 00:48:57 +01:00
parent be9cf9c5d9
commit d6beb9c0a0
87 changed files with 24471 additions and 401 deletions

View File

@@ -10,7 +10,7 @@ from pathlib import Path
import numpy as np
from .config import Config
from .models import Author, Draft, Rating
from .models import Author, Draft, Rating, normalize_category
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
@@ -117,6 +117,73 @@ CREATE TABLE IF NOT EXISTS gaps (
analyzed_at TEXT
);
-- Cross-references (RFC, draft, BCP references found in draft text)
CREATE TABLE IF NOT EXISTS draft_refs (
draft_name TEXT NOT NULL REFERENCES drafts(name),
ref_type TEXT NOT NULL, -- 'rfc', 'draft', 'bcp'
ref_id TEXT NOT NULL, -- e.g. '8259', 'draft-ietf-httpbis-semantics', 'BCP14'
UNIQUE(draft_name, ref_type, ref_id)
);
CREATE INDEX IF NOT EXISTS idx_draft_refs_ref ON draft_refs(ref_type, ref_id);
-- Generated drafts from gap-to-draft pipeline
CREATE TABLE IF NOT EXISTS generated_drafts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
gap_topic TEXT NOT NULL,
draft_name TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
outline_json TEXT DEFAULT '{}',
sections_json TEXT DEFAULT '[]',
full_text TEXT,
family_name TEXT DEFAULT '',
family_role TEXT DEFAULT '',
version INTEGER DEFAULT 0,
rating_json TEXT DEFAULT '{}',
novelty_score REAL DEFAULT 0.0,
quality_score REAL DEFAULT 0.0,
status TEXT DEFAULT 'draft',
created_at TEXT
);
CREATE TABLE IF NOT EXISTS generation_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
family_name TEXT DEFAULT '',
gap_ids TEXT DEFAULT '[]',
total_input_tokens INTEGER DEFAULT 0,
total_output_tokens INTEGER DEFAULT 0,
model_used TEXT DEFAULT '',
status TEXT DEFAULT 'running',
started_at TEXT,
completed_at TEXT
);
-- Observatory tables
CREATE TABLE IF NOT EXISTS sources (
name TEXT PRIMARY KEY,
last_fetch TEXT,
doc_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS observatory_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_at TEXT NOT NULL,
total_docs INTEGER DEFAULT 0,
new_since_last INTEGER DEFAULT 0,
changed_gaps INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS gap_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id INTEGER REFERENCES observatory_snapshots(id),
gap_topic TEXT NOT NULL,
gap_description TEXT NOT NULL,
severity TEXT DEFAULT 'medium',
status TEXT DEFAULT 'open',
recorded_at TEXT
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
@@ -152,8 +219,23 @@ class Database:
self._conn.execute("PRAGMA journal_mode=WAL")
self._conn.execute("PRAGMA foreign_keys=ON")
self._conn.executescript(SCHEMA)
self._migrate_schema()
return self._conn
def _migrate_schema(self) -> None:
"""Additive migration — add columns if missing."""
cols = {r[1] for r in self._conn.execute("PRAGMA table_info(drafts)").fetchall()}
migrations = [
("source", "TEXT DEFAULT 'ietf'"),
("source_id", "TEXT DEFAULT ''"),
("source_url", "TEXT DEFAULT ''"),
("doc_status", "TEXT DEFAULT ''"),
]
for col, typedef in migrations:
if col not in cols:
self._conn.execute(f"ALTER TABLE drafts ADD COLUMN {col} {typedef}")
self._conn.commit()
def close(self) -> None:
if self._conn:
self._conn.close()
@@ -303,7 +385,7 @@ class Database:
novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
relevance_note=r["relevance_note"],
categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
categories=[normalize_category(c) for c in json.loads(r["r_categories"])] if r["r_categories"] else [],
rated_at=r["rated_at"],
)
results.append((draft, rating))
@@ -503,6 +585,30 @@ class Database:
).fetchall()
return [(r["org_a"], r["org_b"], r["shared"]) for r in rows]
def org_data_raw(self) -> list[tuple[str, int, str]]:
"""Return (affiliation, person_id, draft_name) for all draft_authors with affiliation."""
rows = self.conn.execute(
"SELECT affiliation, person_id, draft_name FROM draft_authors WHERE affiliation != ''"
).fetchall()
return [(r[0], r[1], r[2]) for r in rows]
def author_draft_counts(self) -> dict[int, int]:
"""Return {person_id: draft_count} for all authors."""
rows = self.conn.execute(
"SELECT person_id, COUNT(*) FROM draft_authors GROUP BY person_id"
).fetchall()
return {r[0]: r[1] for r in rows}
def author_draft_sets(self) -> dict[int, set[str]]:
"""Return {person_id: set(draft_names)} for all authors."""
rows = self.conn.execute(
"SELECT person_id, draft_name FROM draft_authors"
).fetchall()
result: dict[int, set[str]] = {}
for r in rows:
result.setdefault(r[0], set()).add(r[1])
return result
# --- Ideas ---
def insert_ideas(self, draft_name: str, ideas: list[dict]) -> None:
@@ -529,7 +635,9 @@ class Database:
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN ideas i ON d.name = i.draft_name
WHERE i.draft_name IS NULL
LEFT JOIN llm_cache lc ON d.name = lc.draft_name
AND lc.request_json LIKE 'batch-ideas[%'
WHERE i.draft_name IS NULL AND lc.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
@@ -565,6 +673,314 @@ class Database:
"category": r["category"], "evidence": r["evidence"],
"severity": r["severity"]} for r in rows]
# --- Refs ---
def insert_refs(self, draft_name: str, refs: list[tuple[str, str]]) -> None:
"""Insert cross-references for a draft. refs = [(ref_type, ref_id), ...]."""
for ref_type, ref_id in refs:
self.conn.execute(
"""INSERT OR IGNORE INTO draft_refs (draft_name, ref_type, ref_id)
VALUES (?, ?, ?)""",
(draft_name, ref_type, ref_id),
)
self.conn.commit()
def get_refs_for_draft(self, draft_name: str) -> list[tuple[str, str]]:
"""Return [(ref_type, ref_id)] for a draft."""
rows = self.conn.execute(
"SELECT ref_type, ref_id FROM draft_refs WHERE draft_name = ?",
(draft_name,),
).fetchall()
return [(r["ref_type"], r["ref_id"]) for r in rows]
def top_referenced(self, ref_type: str = "rfc", limit: int = 30) -> list[tuple[str, int, list[str]]]:
"""Return (ref_id, count, [draft_names]) for most-referenced items."""
rows = self.conn.execute(
"""SELECT ref_id, COUNT(*) as cnt,
GROUP_CONCAT(draft_name, '||') as drafts
FROM draft_refs
WHERE ref_type = ?
GROUP BY ref_id
ORDER BY cnt DESC
LIMIT ?""",
(ref_type, limit),
).fetchall()
return [
(r["ref_id"], r["cnt"], r["drafts"].split("||") if r["drafts"] else [])
for r in rows
]
def drafts_referencing(self, ref_type: str, ref_id: str) -> list[str]:
"""Return draft names that reference a specific RFC/draft/BCP."""
rows = self.conn.execute(
"SELECT draft_name FROM draft_refs WHERE ref_type = ? AND ref_id = ?",
(ref_type, ref_id),
).fetchall()
return [r["draft_name"] for r in rows]
def ref_counts_by_draft(self) -> list[tuple[str, int, int, int]]:
"""Return (draft_name, rfc_count, draft_count, bcp_count) for all drafts with refs."""
rows = self.conn.execute(
"""SELECT draft_name,
SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfcs,
SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as drafts,
SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcps
FROM draft_refs
GROUP BY draft_name
ORDER BY rfcs DESC"""
).fetchall()
return [(r["draft_name"], r["rfcs"], r["drafts"], r["bcps"]) for r in rows]
def drafts_without_refs(self, limit: int = 500) -> list[str]:
"""Return draft names that have full_text but no refs extracted yet."""
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN draft_refs dr ON d.name = dr.draft_name
WHERE d.full_text IS NOT NULL AND dr.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
return [r["name"] for r in rows]
def ref_stats(self) -> dict:
"""Return summary stats for refs table."""
row = self.conn.execute(
"""SELECT COUNT(DISTINCT draft_name) as drafts_with_refs,
COUNT(*) as total_refs,
SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfc_refs,
SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as draft_refs,
SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcp_refs,
COUNT(DISTINCT ref_id) as unique_refs
FROM draft_refs"""
).fetchone()
return dict(row)
# --- Generated Drafts ---
def upsert_generated_draft(self, data: dict) -> int:
    """Insert or update a generated draft. Returns row id.

    A row is identified by the (draft_name, version) pair: if one exists
    it is updated in place (version and created_at are preserved);
    otherwise a new row is inserted with created_at set to now (UTC).

    Required keys in *data*: gap_topic, draft_name, title. Optional keys
    (defaults shown by the .get() calls below): abstract, outline,
    sections, full_text, family_name, family_role, version, rating,
    novelty_score, quality_score, status. The outline/sections/rating
    values are JSON-serialized into their *_json columns.
    """
    now = datetime.now(timezone.utc).isoformat()
    # Upsert key is (draft_name, version); only the row id is needed.
    existing = self.conn.execute(
        "SELECT id FROM generated_drafts WHERE draft_name = ? AND version = ?",
        (data["draft_name"], data.get("version", 0)),
    ).fetchone()
    if existing:
        # Update path: created_at and version deliberately left untouched.
        self.conn.execute(
            """UPDATE generated_drafts SET
            gap_topic=?, title=?, abstract=?, outline_json=?,
            sections_json=?, full_text=?, family_name=?, family_role=?,
            rating_json=?, novelty_score=?, quality_score=?, status=?
            WHERE id=?""",
            (data["gap_topic"], data["title"], data.get("abstract", ""),
             json.dumps(data.get("outline", {})), json.dumps(data.get("sections", [])),
             data.get("full_text"), data.get("family_name", ""),
             data.get("family_role", ""), json.dumps(data.get("rating", {})),
             data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
             data.get("status", "draft"), existing["id"]),
        )
        self.conn.commit()
        return existing["id"]
    else:
        cur = self.conn.execute(
            """INSERT INTO generated_drafts
            (gap_topic, draft_name, title, abstract, outline_json, sections_json,
            full_text, family_name, family_role, version, rating_json,
            novelty_score, quality_score, status, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (data["gap_topic"], data["draft_name"], data["title"],
             data.get("abstract", ""), json.dumps(data.get("outline", {})),
             json.dumps(data.get("sections", [])), data.get("full_text"),
             data.get("family_name", ""), data.get("family_role", ""),
             data.get("version", 0), json.dumps(data.get("rating", {})),
             data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
             data.get("status", "draft"), now),
        )
        self.conn.commit()
        return cur.lastrowid
def get_generated_drafts(self, status: str | None = None) -> list[dict]:
query = "SELECT * FROM generated_drafts"
params: list = []
if status:
query += " WHERE status = ?"
params.append(status)
query += " ORDER BY created_at DESC"
rows = self.conn.execute(query, params).fetchall()
return [dict(r) for r in rows]
def get_generated_draft(self, draft_id: int) -> dict | None:
row = self.conn.execute(
"SELECT * FROM generated_drafts WHERE id = ?", (draft_id,)
).fetchone()
return dict(row) if row else None
def get_family_drafts(self, family_name: str) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM generated_drafts WHERE family_name = ? ORDER BY family_role",
(family_name,),
).fetchall()
return [dict(r) for r in rows]
def log_generation_run(self, data: dict) -> int:
now = datetime.now(timezone.utc).isoformat()
cur = self.conn.execute(
"""INSERT INTO generation_runs
(family_name, gap_ids, total_input_tokens, total_output_tokens,
model_used, status, started_at)
VALUES (?, ?, ?, ?, ?, ?, ?)""",
(data.get("family_name", ""), json.dumps(data.get("gap_ids", [])),
data.get("total_input_tokens", 0), data.get("total_output_tokens", 0),
data.get("model_used", ""), data.get("status", "running"), now),
)
self.conn.commit()
return cur.lastrowid
def update_generation_run(self, run_id: int, **kwargs) -> None:
sets = []
params = []
for k, v in kwargs.items():
sets.append(f"{k} = ?")
params.append(v)
if not sets:
return
params.append(run_id)
self.conn.execute(
f"UPDATE generation_runs SET {', '.join(sets)} WHERE id = ?", params
)
self.conn.commit()
# --- Observatory ---
def upsert_source(self, name: str, doc_count: int = 0) -> None:
now = datetime.now(timezone.utc).isoformat()
self.conn.execute(
"""INSERT INTO sources (name, last_fetch, doc_count)
VALUES (?, ?, ?)
ON CONFLICT(name) DO UPDATE SET last_fetch=excluded.last_fetch, doc_count=excluded.doc_count""",
(name, now, doc_count),
)
self.conn.commit()
def get_source(self, name: str) -> dict | None:
row = self.conn.execute("SELECT * FROM sources WHERE name = ?", (name,)).fetchone()
return dict(row) if row else None
def all_sources(self) -> list[dict]:
rows = self.conn.execute("SELECT * FROM sources ORDER BY name").fetchall()
return [dict(r) for r in rows]
def create_snapshot(self) -> int:
now = datetime.now(timezone.utc).isoformat()
total = self.count_drafts()
# Count new since last snapshot
last = self.conn.execute(
"SELECT snapshot_at FROM observatory_snapshots ORDER BY id DESC LIMIT 1"
).fetchone()
new_count = 0
if last:
new_count = self.conn.execute(
"SELECT COUNT(*) FROM drafts WHERE fetched_at > ?", (last["snapshot_at"],)
).fetchone()[0]
else:
new_count = total
cur = self.conn.execute(
"""INSERT INTO observatory_snapshots (snapshot_at, total_docs, new_since_last, changed_gaps)
VALUES (?, ?, ?, 0)""",
(now, total, new_count),
)
self.conn.commit()
return cur.lastrowid
def record_gap_history(self, snapshot_id: int, gaps: list[dict]) -> None:
now = datetime.now(timezone.utc).isoformat()
for g in gaps:
self.conn.execute(
"""INSERT INTO gap_history (snapshot_id, gap_topic, gap_description, severity, status, recorded_at)
VALUES (?, ?, ?, ?, ?, ?)""",
(snapshot_id, g["topic"], g["description"],
g.get("severity", "medium"), g.get("status", "open"), now),
)
self.conn.commit()
def gap_history_timeline(self) -> list[dict]:
rows = self.conn.execute(
"""SELECT gh.*, os.snapshot_at FROM gap_history gh
JOIN observatory_snapshots os ON gh.snapshot_id = os.id
ORDER BY os.snapshot_at, gh.gap_topic"""
).fetchall()
return [dict(r) for r in rows]
def get_snapshots(self, limit: int = 20) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM observatory_snapshots ORDER BY id DESC LIMIT ?", (limit,)
).fetchall()
return [dict(r) for r in rows]
def drafts_by_source(self, source: str, limit: int = 500) -> list[Draft]:
    """Drafts originating from *source* (e.g. 'ietf', 'w3c'), newest
    first, capped at *limit*."""
    cursor = self.conn.execute(
        "SELECT * FROM drafts WHERE source = ? ORDER BY time DESC LIMIT ?",
        (source, limit),
    )
    return [self._row_to_draft(row) for row in cursor]
# --- WG/Status ---
def draft_adoption_status(self) -> list[dict]:
"""Return adoption status for all drafts based on naming convention.
Returns list of dicts: {name, title, time, wg_adopted, wg_name, stream}
"""
import re
rows = self.conn.execute(
'SELECT name, title, time FROM drafts'
).fetchall()
results = []
for r in rows:
name = r["name"]
wg_adopted = False
wg_name = ""
stream = "individual"
# Primary signal: draft-ietf-{wg}-* naming convention
m = re.match(r'^draft-ietf-(\w+)-', name)
if m:
wg_adopted = True
wg_name = m.group(1)
stream = "ietf"
elif name.startswith("draft-irtf-"):
m2 = re.match(r'^draft-irtf-(\w+)-', name)
wg_name = m2.group(1) if m2 else ""
stream = "irtf"
results.append({
"name": name,
"title": r["title"],
"time": r["time"],
"wg_adopted": wg_adopted,
"wg_name": wg_name,
"stream": stream,
})
return results
def revision_velocity(self) -> list[dict]:
"""Return revision data for all drafts.
Returns list of dicts: {name, title, time, rev, rev_int}
"""
rows = self.conn.execute(
"SELECT name, title, time, rev FROM drafts"
).fetchall()
return [
{
"name": r["name"],
"title": r["title"],
"time": r["time"],
"rev": r["rev"],
"rev_int": int(r["rev"]) if r["rev"].isdigit() else 0,
}
for r in rows
]
# --- Helpers ---
@staticmethod
@@ -580,11 +996,16 @@ class Database:
categories=json.loads(d.get("categories") or "[]"),
tags=json.loads(d.get("tags") or "[]"),
fetched_at=d.get("fetched_at"),
source=d.get("source", "ietf"),
source_id=d.get("source_id", ""),
source_url=d.get("source_url", ""),
doc_status=d.get("doc_status", ""),
)
@staticmethod
def _row_to_rating(row: sqlite3.Row) -> Rating:
d = dict(row)
raw_cats = json.loads(d.get("categories") or "[]")
return Rating(
draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
@@ -594,6 +1015,6 @@ class Database:
overlap_note=d.get("overlap_note", ""),
momentum_note=d.get("momentum_note", ""),
relevance_note=d.get("relevance_note", ""),
categories=json.loads(d.get("categories") or "[]"),
categories=[normalize_category(c) for c in raw_cats],
rated_at=d.get("rated_at"),
)