Idea quality pipeline, web UI features, academic paper

- Tighten idea extraction prompts (1-4 ideas per draft, no sub-features),
  reducing 1,907 raw ideas to 468 across 434 drafts (75% reduction)
- Add embedding-based dedup (ietf dedup-ideas) for same-draft similarity
- Add novelty scoring (ietf ideas score) and filtering (ietf ideas filter)
  using Claude to rate ideas 1-5, removing 49 generic building blocks
  (flow sketched below)
- Final count: 419 high-quality ideas (avg ~1 idea/draft)
- Web UI: gap explorer with live draft generation and pre-generated demos
- Web UI: D3.js author collaboration network (498 nodes, 1,142 edges,
  68 clusters, org filtering, interactive zoom/pan)
- Academic paper: 15-page LaTeX workshop paper analyzing the 434-draft
  AI agent standards landscape
- Save improvement ideas backlog to data/reports/improvement-ideas.md
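
A minimal sketch of how the score-and-filter flow composes the new Database
methods in this diff; rate_idea is a placeholder for the actual Claude call
(not shown here), and the min_score default is illustrative, not the real
threshold:

    def rate_idea(title: str, description: str) -> int:
        # Placeholder: the real command prompts Claude for a 1-5 novelty rating.
        return 3

    def score_and_filter(db, min_score: int = 2) -> int:
        # Rate every idea that has no novelty_score yet, then bulk-update.
        scores = {i["id"]: rate_idea(i["title"], i["description"])
                  for i in db.ideas_with_drafts(unscored_only=True)}
        db.update_idea_scores_bulk(scores)
        print("score distribution:", db.idea_score_distribution())
        # Drop everything below the threshold; embeddings are cleaned up too.
        return db.delete_low_score_ideas(min_score)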

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 22:17:57 +01:00
parent 3c3d7e649f
commit 6e3a387778
29 changed files with 6575 additions and 240 deletions


@@ -106,6 +106,14 @@ CREATE TABLE IF NOT EXISTS ideas (
CREATE INDEX IF NOT EXISTS idx_ideas_draft ON ideas(draft_name);
-- Idea embeddings (for clustering)
CREATE TABLE IF NOT EXISTS idea_embeddings (
idea_id INTEGER PRIMARY KEY REFERENCES ideas(id),
model TEXT NOT NULL,
vector BLOB NOT NULL,
created_at TEXT
);
-- Gap analysis results
CREATE TABLE IF NOT EXISTS gaps (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -184,6 +192,20 @@ CREATE TABLE IF NOT EXISTS gap_history (
recorded_at TEXT
);
-- Monitor runs
CREATE TABLE IF NOT EXISTS monitor_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
started_at TEXT NOT NULL,
completed_at TEXT,
status TEXT DEFAULT 'running',
new_drafts_found INTEGER DEFAULT 0,
drafts_analyzed INTEGER DEFAULT 0,
drafts_embedded INTEGER DEFAULT 0,
ideas_extracted INTEGER DEFAULT 0,
error_message TEXT DEFAULT '',
duration_seconds REAL DEFAULT 0
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
@@ -234,6 +256,12 @@ class Database:
for col, typedef in migrations:
if col not in cols:
self._conn.execute(f"ALTER TABLE drafts ADD COLUMN {col} {typedef}")
# ideas table migrations
idea_cols = {r[1] for r in self._conn.execute("PRAGMA table_info(ideas)").fetchall()}
if "novelty_score" not in idea_cols:
self._conn.execute("ALTER TABLE ideas ADD COLUMN novelty_score INTEGER")
self._conn.commit()
def close(self) -> None:
@@ -501,12 +529,13 @@ class Database:
ORDER BY da.author_order""",
(draft_name,),
).fetchall()
cols = rows[0].keys() if rows else []
return [Author(
person_id=r["person_id"], name=r["name"],
# sqlite3.Row supports r[key] but not r.get(); guard on column presence
# so databases created before the migrations still load cleanly.
ascii_name=r["ascii_name"] if "ascii_name" in cols else "",
affiliation=r["affiliation"] if "affiliation" in cols else "",
resource_uri=r["resource_uri"] if "resource_uri" in cols else "",
fetched_at=r["fetched_at"] if "fetched_at" in cols else None,
) for r in rows]
def drafts_without_authors(self, limit: int = 500) -> list[str]:
@@ -624,13 +653,42 @@ class Database:
)
self.conn.commit()
def delete_ideas(self, draft_name: str | None = None) -> int:
"""Delete ideas from the ideas table.
Args:
draft_name: If provided, delete only ideas for this draft.
If None, delete all ideas.
Returns:
Number of rows deleted.
"""
if draft_name:
self.conn.execute(
"DELETE FROM idea_embeddings WHERE idea_id IN (SELECT id FROM ideas WHERE draft_name = ?)", (draft_name,)
)
cursor = self.conn.execute(
"DELETE FROM ideas WHERE draft_name = ?", (draft_name,)
)
else:
self.conn.execute("DELETE FROM idea_embeddings")
cursor = self.conn.execute("DELETE FROM ideas")
self.conn.commit()
return cursor.rowcount
def get_ideas_for_draft(self, draft_name: str) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM ideas WHERE draft_name = ?", (draft_name,)
).fetchall()
return [{"title": r["title"], "description": r["description"],
return [{"id": r["id"], "title": r["title"], "description": r["description"],
"type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows]
def delete_idea(self, idea_id: int) -> None:
"""Delete a single idea and its embedding by ID."""
self.conn.execute("DELETE FROM idea_embeddings WHERE idea_id = ?", (idea_id,))
self.conn.execute("DELETE FROM ideas WHERE id = ?", (idea_id,))
self.conn.commit()
def drafts_without_ideas(self, limit: int = 500) -> list[str]:
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
@@ -653,6 +711,103 @@ class Database:
def idea_count(self) -> int:
return self.conn.execute("SELECT COUNT(*) FROM ideas").fetchone()[0]
def ideas_with_drafts(self, unscored_only: bool = False, limit: int = 5000) -> list[dict]:
"""Return ideas joined with draft title, optionally only unscored ones."""
where = "WHERE i.novelty_score IS NULL" if unscored_only else ""
rows = self.conn.execute(
f"""SELECT i.id, i.draft_name, i.title, i.description, i.idea_type,
i.novelty_score, d.title AS draft_title
FROM ideas i JOIN drafts d ON i.draft_name = d.name
{where}
ORDER BY i.id LIMIT ?""",
(limit,),
).fetchall()
return [dict(r) for r in rows]
def update_idea_score(self, idea_id: int, score: int) -> None:
"""Set the novelty_score for a single idea."""
self.conn.execute(
"UPDATE ideas SET novelty_score = ? WHERE id = ?",
(score, idea_id),
)
self.conn.commit()
def update_idea_scores_bulk(self, scores: dict[int, int]) -> None:
"""Bulk-update novelty scores. scores maps idea_id -> score."""
self.conn.executemany(
"UPDATE ideas SET novelty_score = ? WHERE id = ?",
[(score, idea_id) for idea_id, score in scores.items()],
)
self.conn.commit()
def delete_low_score_ideas(self, min_score: int) -> int:
"""Delete ideas with novelty_score below min_score. Returns count deleted."""
# Also clean up associated idea embeddings
self.conn.execute(
"""DELETE FROM idea_embeddings WHERE idea_id IN
(SELECT id FROM ideas WHERE novelty_score IS NOT NULL AND novelty_score < ?)""",
(min_score,),
)
cursor = self.conn.execute(
"DELETE FROM ideas WHERE novelty_score IS NOT NULL AND novelty_score < ?",
(min_score,),
)
self.conn.commit()
return cursor.rowcount
def idea_score_distribution(self) -> dict[int, int]:
"""Return {score: count} for scored ideas."""
rows = self.conn.execute(
"SELECT novelty_score, COUNT(*) as cnt FROM ideas "
"WHERE novelty_score IS NOT NULL GROUP BY novelty_score ORDER BY novelty_score"
).fetchall()
return {r["novelty_score"]: r["cnt"] for r in rows}
def ideas_below_score(self, min_score: int) -> list[dict]:
"""Return ideas with novelty_score below min_score."""
rows = self.conn.execute(
"""SELECT i.id, i.draft_name, i.title, i.description, i.novelty_score,
d.title AS draft_title
FROM ideas i JOIN drafts d ON i.draft_name = d.name
WHERE i.novelty_score IS NOT NULL AND i.novelty_score < ?
ORDER BY i.novelty_score, i.title""",
(min_score,),
).fetchall()
return [dict(r) for r in rows]
# --- Idea Embeddings ---
def store_idea_embedding(self, idea_id: int, model: str, vector: np.ndarray) -> None:
self.conn.execute(
"""INSERT INTO idea_embeddings (idea_id, model, vector, created_at)
VALUES (?, ?, ?, ?)
ON CONFLICT(idea_id) DO UPDATE SET
model=excluded.model, vector=excluded.vector, created_at=excluded.created_at
""",
(idea_id, model, vector.astype(np.float32).tobytes(),
datetime.now(timezone.utc).isoformat()),
)
self.conn.commit()
def all_idea_embeddings(self) -> dict[int, np.ndarray]:
rows = self.conn.execute("SELECT idea_id, vector FROM idea_embeddings").fetchall()
return {
r["idea_id"]: np.frombuffer(r["vector"], dtype=np.float32)
for r in rows
}
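# Hedged sketch (not part of this diff): how `ietf dedup-ideas` might consume
# these embeddings for same-draft similarity. The 0.9 cosine threshold and
# keep-first policy are illustrative assumptions only.
def dedup_draft_ideas_sketch(self, draft_name: str, threshold: float = 0.9) -> int:
    vecs = self.all_idea_embeddings()
    ideas = [i for i in self.get_ideas_for_draft(draft_name) if i["id"] in vecs]
    kept: list[np.ndarray] = []
    removed = 0
    for idea in ideas:
        v = vecs[idea["id"]]
        v = v / np.linalg.norm(v)  # normalize so dot product = cosine similarity
        if any(float(v @ k) >= threshold for k in kept):
            self.delete_idea(idea["id"])  # near-duplicate of an earlier idea
            removed += 1
        else:
            kept.append(v)
    return removed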
def ideas_without_embeddings(self, limit: int = 500) -> list[dict]:
rows = self.conn.execute(
"""SELECT i.id, i.title, i.description, i.idea_type, i.draft_name
FROM ideas i
LEFT JOIN idea_embeddings ie ON i.id = ie.idea_id
WHERE ie.idea_id IS NULL
LIMIT ?""",
(limit,),
).fetchall()
return [{"id": r["id"], "title": r["title"], "description": r["description"],
"type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows]
# --- Gaps ---
def insert_gaps(self, gaps: list[dict]) -> None:
@@ -981,6 +1136,250 @@ class Database:
for r in rows
]
# --- Working Groups ---
def wg_summary(self) -> list[dict]:
"""Return per-WG summary: group, draft_count, avg scores, categories, idea_count.
Excludes 'none' (individual submissions) — those are returned separately.
"""
rows = self.conn.execute("""
SELECT d."group" as wg, COUNT(*) as draft_count,
AVG(r.novelty) as avg_novelty, AVG(r.maturity) as avg_maturity,
AVG(r.overlap) as avg_overlap, AVG(r.momentum) as avg_momentum,
AVG(r.relevance) as avg_relevance,
(SELECT COUNT(*) FROM ideas i WHERE i.draft_name IN
(SELECT name FROM drafts WHERE "group" = d."group")) as idea_count
FROM drafts d
LEFT JOIN ratings r ON d.name = r.draft_name
WHERE d."group" IS NOT NULL AND d."group" != '' AND d."group" != 'none'
GROUP BY d."group"
ORDER BY draft_count DESC
""").fetchall()
# Build categories per WG from a separate query
cat_rows = self.conn.execute("""
SELECT d."group" as wg, r.categories
FROM drafts d JOIN ratings r ON d.name = r.draft_name
WHERE d."group" IS NOT NULL AND d."group" != '' AND d."group" != 'none'
""").fetchall()
wg_cats: dict[str, dict[str, int]] = {}
for cr in cat_rows:
wg = cr["wg"]
if wg not in wg_cats:
wg_cats[wg] = {}
try:
for c in json.loads(cr["categories"]):
c = normalize_category(c)
wg_cats[wg][c] = wg_cats[wg].get(c, 0) + 1
except (json.JSONDecodeError, TypeError):
pass
results = []
for r in rows:
results.append({
"wg": r["wg"],
"draft_count": r["draft_count"],
"avg_novelty": round(r["avg_novelty"] or 0, 1),
"avg_maturity": round(r["avg_maturity"] or 0, 1),
"avg_overlap": round(r["avg_overlap"] or 0, 1),
"avg_momentum": round(r["avg_momentum"] or 0, 1),
"avg_relevance": round(r["avg_relevance"] or 0, 1),
"categories": wg_cats.get(r["wg"], {}),
"idea_count": r["idea_count"],
})
return results
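# Example (hedged, not in this diff): a quick console report over the summary.
def print_top_wgs_sketch(self, n: int = 5) -> None:
    for wg in self.wg_summary()[:n]:
        print(f"{wg['wg']}: {wg['draft_count']} drafts, "
              f"avg novelty {wg['avg_novelty']}, {wg['idea_count']} ideas")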
def wg_drafts(self, wg: str) -> list[Draft]:
"""Return all drafts for a specific working group."""
rows = self.conn.execute(
'SELECT * FROM drafts WHERE "group" = ? ORDER BY time DESC', (wg,)
).fetchall()
return [self._row_to_draft(r) for r in rows]
def wg_category_matrix(self) -> dict[str, dict[str, int]]:
"""Return {wg: {category: count}} matrix for all WGs (excluding 'none')."""
rows = self.conn.execute("""
SELECT d."group" as wg, r.categories
FROM drafts d
JOIN ratings r ON d.name = r.draft_name
WHERE d."group" IS NOT NULL AND d."group" != '' AND d."group" != 'none'
""").fetchall()
matrix: dict[str, dict[str, int]] = {}
for r in rows:
wg = r["wg"]
if wg not in matrix:
matrix[wg] = {}
try:
for c in json.loads(r["categories"]):
c = normalize_category(c)
matrix[wg][c] = matrix[wg].get(c, 0) + 1
except (json.JSONDecodeError, TypeError):
pass
return matrix
def wg_idea_overlap(self) -> list[dict]:
"""Find ideas that appear across multiple WGs — signals for alignment.
Returns list of {idea_title, wgs: [{wg, draft_name, draft_title}], wg_count}.
"""
rows = self.conn.execute("""
SELECT i.title as idea_title, i.description, d."group" as wg,
d.name as draft_name, d.title as draft_title
FROM ideas i
JOIN drafts d ON i.draft_name = d.name
WHERE d."group" IS NOT NULL AND d."group" != ''
ORDER BY i.title, d."group"
""").fetchall()
# Group by idea title
from collections import defaultdict
idea_groups: dict[str, list[dict]] = defaultdict(list)
for r in rows:
idea_groups[r["idea_title"]].append({
"wg": r["wg"],
"draft_name": r["draft_name"],
"draft_title": r["draft_title"],
})
# Only keep ideas spanning 2+ distinct WGs
results = []
for title, entries in idea_groups.items():
wgs = set(e["wg"] for e in entries)
if len(wgs) >= 2:
results.append({
"idea_title": title,
"wgs": entries,
"wg_count": len(wgs),
"wg_names": sorted(wgs),
})
return sorted(results, key=lambda x: x["wg_count"], reverse=True)
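# Example (hedged, not in this diff): surface the strongest cross-WG signals.
# Note that grouping above is by exact idea title, so only verbatim title
# matches across WGs appear in this report.
def print_idea_overlap_sketch(self, n: int = 10) -> None:
    for hit in self.wg_idea_overlap()[:n]:
        print(f"{hit['idea_title']}: {hit['wg_count']} WGs "
              f"({', '.join(hit['wg_names'])})")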
def individual_vs_wg_categories(self) -> dict[str, dict[str, int]]:
"""Compare category distribution: individual submissions vs WG-adopted.
Returns {"individual": {cat: count}, "wg_adopted": {cat: count}}.
"""
rows = self.conn.execute("""
SELECT CASE WHEN d."group" = 'none' OR d."group" IS NULL THEN 'individual'
ELSE 'wg_adopted' END as stream,
r.categories
FROM drafts d
JOIN ratings r ON d.name = r.draft_name
""").fetchall()
result: dict[str, dict[str, int]] = {"individual": {}, "wg_adopted": {}}
for r in rows:
stream = r["stream"]
try:
for c in json.loads(r["categories"]):
c = normalize_category(c)
result[stream][c] = result[stream].get(c, 0) + 1
except (json.JSONDecodeError, TypeError):
pass
return result
def category_wg_spread(self) -> list[dict]:
"""For each category, which WGs contribute drafts? High spread = alignment opportunity.
Returns [{category, wgs: [{wg, count}], wg_count, total_drafts}].
"""
rows = self.conn.execute("""
SELECT d."group" as wg, r.categories
FROM drafts d
JOIN ratings r ON d.name = r.draft_name
WHERE d."group" IS NOT NULL AND d."group" != ''
""").fetchall()
from collections import defaultdict
cat_wgs: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
for r in rows:
wg = r["wg"]
try:
for c in json.loads(r["categories"]):
c = normalize_category(c)
cat_wgs[c][wg] += 1
except (json.JSONDecodeError, TypeError):
pass
results = []
for cat, wg_counts in cat_wgs.items():
wg_list = sorted(wg_counts.items(), key=lambda x: x[1], reverse=True)
results.append({
"category": cat,
"wgs": [{"wg": wg, "count": cnt} for wg, cnt in wg_list],
"wg_count": len(wg_list),
"total_drafts": sum(wg_counts.values()),
})
return sorted(results, key=lambda x: x["wg_count"], reverse=True)
# --- Monitor Runs ---
def start_monitor_run(self) -> int:
now = datetime.now(timezone.utc).isoformat()
cur = self.conn.execute(
"INSERT INTO monitor_runs (started_at, status) VALUES (?, 'running')",
(now,),
)
self.conn.commit()
return cur.lastrowid
def complete_monitor_run(self, run_id: int, stats: dict) -> None:
now = datetime.now(timezone.utc).isoformat()
started = self.conn.execute(
"SELECT started_at FROM monitor_runs WHERE id = ?", (run_id,)
).fetchone()
duration = 0.0
if started:
try:
start_dt = datetime.fromisoformat(started["started_at"])
duration = (datetime.now(timezone.utc) - start_dt).total_seconds()
except (ValueError, TypeError):
pass
self.conn.execute(
"""UPDATE monitor_runs SET
status='completed', completed_at=?,
new_drafts_found=?, drafts_analyzed=?,
drafts_embedded=?, ideas_extracted=?,
duration_seconds=?
WHERE id=?""",
(now, stats.get("new_drafts_found", 0), stats.get("drafts_analyzed", 0),
stats.get("drafts_embedded", 0), stats.get("ideas_extracted", 0),
duration, run_id),
)
self.conn.commit()
def fail_monitor_run(self, run_id: int, error: str) -> None:
now = datetime.now(timezone.utc).isoformat()
started = self.conn.execute(
"SELECT started_at FROM monitor_runs WHERE id = ?", (run_id,)
).fetchone()
duration = 0.0
if started:
try:
start_dt = datetime.fromisoformat(started["started_at"])
duration = (datetime.now(timezone.utc) - start_dt).total_seconds()
except (ValueError, TypeError):
pass
self.conn.execute(
"""UPDATE monitor_runs SET
status='failed', completed_at=?, error_message=?, duration_seconds=?
WHERE id=?""",
(now, error, duration, run_id),
)
self.conn.commit()
def get_monitor_runs(self, limit: int = 20) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM monitor_runs ORDER BY started_at DESC LIMIT ?", (limit,)
).fetchall()
return [dict(r) for r in rows]
def get_last_successful_run(self) -> dict | None:
row = self.conn.execute(
"SELECT * FROM monitor_runs WHERE status='completed' ORDER BY started_at DESC LIMIT 1"
).fetchone()
return dict(row) if row else None
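# Hedged sketch (not in this diff): the intended run lifecycle around the
# tracking methods above. do_work is a placeholder for the fetch/analyze/
# embed/extract steps; it should return counters matching the monitor_runs
# columns ("new_drafts_found", "drafts_analyzed", etc.).
def run_monitor_cycle_sketch(self, do_work) -> None:
    run_id = self.start_monitor_run()
    try:
        stats = do_work()
        self.complete_monitor_run(run_id, stats)
    except Exception as exc:
        self.fail_monitor_run(run_id, str(exc))
        raise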
# --- Helpers ---
@staticmethod