v0.2.0: visualizations, interactive browser, arXiv paper, gap analysis

New features:
- 12 interactive visualizations (ietf viz): t-SNE landscape, similarity
  heatmap, score distributions, timeline, bubble explorer, radar charts,
  author network graph, category treemap, quality vs overlap, org bar chart,
  ideas chart, and interactive draft browser
- Interactive draft browser (browser.html): filterable by category, keyword,
  score sliders with sortable table and expandable detail rows
- arXiv paper (paper/main.tex): 13-page manuscript with all findings
- Gap analysis: 12 identified under-addressed areas
- Author network: collaboration graph, org contributions, cross-org analysis
- Draft generation from gaps (ietf draft-gen)
- Auto-load .env for API keys (python-dotenv)

New modules: visualize.py, authors.py, draftgen.py
New reports: timeline, overlap-matrix, authors, gaps
New deps: plotly, matplotlib, seaborn, scipy, scikit-learn, networkx, python-dotenv

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 13:37:55 +01:00
parent f44f9265bd
commit be9cf9c5d9
32 changed files with 4447 additions and 4 deletions

View File

@@ -10,7 +10,7 @@ from pathlib import Path
import numpy as np
from .config import Config
from .models import Draft, Rating
from .models import Author, Draft, Rating
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
@@ -76,6 +76,47 @@ CREATE VIRTUAL TABLE IF NOT EXISTS drafts_fts USING fts5(
content_rowid='rowid'
);
-- Authors (fetched from Datatracker)
CREATE TABLE IF NOT EXISTS authors (
person_id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
ascii_name TEXT,
affiliation TEXT DEFAULT '',
resource_uri TEXT,
fetched_at TEXT
);
CREATE TABLE IF NOT EXISTS draft_authors (
draft_name TEXT NOT NULL REFERENCES drafts(name),
person_id INTEGER NOT NULL REFERENCES authors(person_id),
author_order INTEGER DEFAULT 1,
affiliation TEXT DEFAULT '',
PRIMARY KEY (draft_name, person_id)
);
-- Extracted ideas
CREATE TABLE IF NOT EXISTS ideas (
id INTEGER PRIMARY KEY AUTOINCREMENT,
draft_name TEXT NOT NULL REFERENCES drafts(name),
title TEXT NOT NULL,
description TEXT NOT NULL,
idea_type TEXT DEFAULT '',
extracted_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_ideas_draft ON ideas(draft_name);
-- Gap analysis results
CREATE TABLE IF NOT EXISTS gaps (
id INTEGER PRIMARY KEY AUTOINCREMENT,
topic TEXT NOT NULL,
description TEXT NOT NULL,
category TEXT DEFAULT '',
evidence TEXT DEFAULT '',
severity TEXT DEFAULT 'medium',
analyzed_at TEXT
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
@@ -341,6 +382,189 @@ class Database:
).fetchone()
return (row[0], row[1])
# --- Authors ---
def upsert_author(self, author: Author) -> None:
    """Insert *author* into the authors table, or refresh the existing row.

    Uses SQLite UPSERT keyed on person_id, so re-fetching the same
    Datatracker person simply overwrites the stored metadata.
    """
    sql = """INSERT INTO authors (person_id, name, ascii_name, affiliation, resource_uri, fetched_at)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT(person_id) DO UPDATE SET
        name=excluded.name, ascii_name=excluded.ascii_name,
        affiliation=excluded.affiliation, resource_uri=excluded.resource_uri,
        fetched_at=excluded.fetched_at
        """
    values = (
        author.person_id,
        author.name,
        author.ascii_name,
        author.affiliation,
        author.resource_uri,
        author.fetched_at,
    )
    self.conn.execute(sql, values)
    self.conn.commit()
def upsert_draft_author(
    self, draft_name: str, person_id: int, order: int = 1, affiliation: str = ""
) -> None:
    """Link one author (person_id) to one draft.

    Re-linking an existing (draft, person) pair updates the stored
    author_order and affiliation instead of failing on the primary key.
    """
    sql = """INSERT INTO draft_authors (draft_name, person_id, author_order, affiliation)
        VALUES (?, ?, ?, ?)
        ON CONFLICT(draft_name, person_id) DO UPDATE SET
        author_order=excluded.author_order, affiliation=excluded.affiliation
        """
    self.conn.execute(sql, (draft_name, person_id, order, affiliation))
    self.conn.commit()
def get_authors_for_draft(self, draft_name: str) -> list[Author]:
    """Return the authors of *draft_name* ordered by author_order.

    Builds Author objects from the joined authors/draft_authors rows.
    Nullable text columns are coalesced to "" so the fields stay strings.
    """
    rows = self.conn.execute(
        """SELECT a.* FROM authors a
        JOIN draft_authors da ON a.person_id = da.person_id
        WHERE da.draft_name = ?
        ORDER BY da.author_order""",
        (draft_name,),
    ).fetchall()
    # FIX: the previous code called r.get(...), which exists on dict rows but
    # not on sqlite3.Row; plain indexing works with either row factory (the
    # SELECT a.* guarantees every key is present).
    return [Author(
        person_id=r["person_id"], name=r["name"],
        ascii_name=r["ascii_name"] or "",
        affiliation=r["affiliation"] or "",
        resource_uri=r["resource_uri"] or "",
        fetched_at=r["fetched_at"],
    ) for r in rows]
def drafts_without_authors(self, limit: int = 500) -> list[str]:
    """Names of up to *limit* drafts with no rows in draft_authors yet.

    Used to drive incremental author fetching from the Datatracker.
    """
    cursor = self.conn.execute(
        """SELECT d.name FROM drafts d
        LEFT JOIN draft_authors da ON d.name = da.draft_name
        WHERE da.draft_name IS NULL
        LIMIT ?""",
        (limit,),
    )
    return [record["name"] for record in cursor.fetchall()]
def author_count(self) -> int:
    """Total number of rows in the authors table."""
    (total,) = self.conn.execute("SELECT COUNT(*) FROM authors").fetchone()
    return total
def top_authors(self, limit: int = 20) -> list[tuple[str, str, int, list[str]]]:
    """Return (name, affiliation, draft_count, [draft_names]).

    Authors are ranked by how many drafts they appear on; the draft names
    are transported out of SQL as a '||'-joined GROUP_CONCAT string.
    """
    rows = self.conn.execute(
        """SELECT a.name, a.affiliation, COUNT(da.draft_name) as cnt,
        GROUP_CONCAT(da.draft_name, '||') as drafts
        FROM authors a
        JOIN draft_authors da ON a.person_id = da.person_id
        GROUP BY a.person_id
        ORDER BY cnt DESC
        LIMIT ?""",
        (limit,),
    ).fetchall()
    ranked = []
    for rec in rows:
        joined = rec["drafts"]
        draft_names = joined.split("||") if joined else []
        ranked.append((rec["name"], rec["affiliation"], rec["cnt"], draft_names))
    return ranked
def top_orgs(self, limit: int = 20) -> list[tuple[str, int, int]]:
    """Return (org, author_count, draft_count).

    Organizations are ranked by distinct drafts; rows with an empty
    affiliation are excluded.
    """
    cursor = self.conn.execute(
        """SELECT da.affiliation as org,
        COUNT(DISTINCT da.person_id) as authors,
        COUNT(DISTINCT da.draft_name) as drafts
        FROM draft_authors da
        WHERE da.affiliation != ''
        GROUP BY da.affiliation
        ORDER BY drafts DESC
        LIMIT ?""",
        (limit,),
    )
    return [(rec["org"], rec["authors"], rec["drafts"]) for rec in cursor.fetchall()]
def coauthor_pairs(self) -> list[tuple[str, str, int]]:
    """Return (author_a, author_b, shared_drafts) for all co-author pairs.

    The self-join on draft_authors with person_id < person_id emits each
    unordered pair exactly once; pairs are sorted by shared drafts.
    """
    cursor = self.conn.execute(
        """SELECT a1.name as a, a2.name as b, COUNT(*) as shared
        FROM draft_authors da1
        JOIN draft_authors da2 ON da1.draft_name = da2.draft_name AND da1.person_id < da2.person_id
        JOIN authors a1 ON da1.person_id = a1.person_id
        JOIN authors a2 ON da2.person_id = a2.person_id
        GROUP BY da1.person_id, da2.person_id
        ORDER BY shared DESC"""
    )
    return [(rec["a"], rec["b"], rec["shared"]) for rec in cursor.fetchall()]
def cross_org_collaborations(self, limit: int = 20) -> list[tuple[str, str, int]]:
    """Return (org_a, org_b, shared_drafts) for cross-org collaboration.

    A draft counts as a collaboration between two distinct, non-empty
    affiliations when it has at least one author from each.  The org pair
    is normalized alphabetically (org_a < org_b) so the same pair is never
    split across an (A, B) row and a (B, A) row -- previously the grouping
    followed the arbitrary person_id ordering of the two authors, which
    fragmented a pair's draft count across two result rows.
    """
    rows = self.conn.execute(
        """SELECT CASE WHEN da1.affiliation < da2.affiliation
                       THEN da1.affiliation ELSE da2.affiliation END as org_a,
                  CASE WHEN da1.affiliation < da2.affiliation
                       THEN da2.affiliation ELSE da1.affiliation END as org_b,
                  COUNT(DISTINCT da1.draft_name) as shared
        FROM draft_authors da1
        JOIN draft_authors da2 ON da1.draft_name = da2.draft_name
        AND da1.person_id < da2.person_id
        WHERE da1.affiliation != '' AND da2.affiliation != ''
        AND da1.affiliation != da2.affiliation
        GROUP BY org_a, org_b
        ORDER BY shared DESC
        LIMIT ?""",
        (limit,),
    ).fetchall()
    return [(r["org_a"], r["org_b"], r["shared"]) for r in rows]
# --- Ideas ---
def insert_ideas(self, draft_name: str, ideas: list[dict]) -> None:
    """Replace the stored ideas for *draft_name* with *ideas*.

    Each idea dict must carry "title" and "description"; an optional
    "type" key is stored as idea_type.  All rows share one extraction
    timestamp, and the delete + insert is committed together.
    """
    # Clear existing ideas for this draft first
    self.conn.execute("DELETE FROM ideas WHERE draft_name = ?", (draft_name,))
    now = datetime.now(timezone.utc).isoformat()
    # executemany: one prepared statement for the whole batch instead of
    # a Python-level execute per idea.
    self.conn.executemany(
        """INSERT INTO ideas (draft_name, title, description, idea_type, extracted_at)
        VALUES (?, ?, ?, ?, ?)""",
        [
            (draft_name, idea["title"], idea["description"], idea.get("type", ""), now)
            for idea in ideas
        ],
    )
    self.conn.commit()
def get_ideas_for_draft(self, draft_name: str) -> list[dict]:
    """Return all stored ideas for one draft as plain dicts."""
    found = []
    for rec in self.conn.execute(
        "SELECT * FROM ideas WHERE draft_name = ?", (draft_name,)
    ).fetchall():
        found.append({
            "title": rec["title"],
            "description": rec["description"],
            "type": rec["idea_type"],
            "draft_name": rec["draft_name"],
        })
    return found
def drafts_without_ideas(self, limit: int = 500) -> list[str]:
    """Names of up to *limit* drafts that have no extracted ideas yet.

    Used to drive incremental idea extraction.
    """
    cursor = self.conn.execute(
        """SELECT d.name FROM drafts d
        LEFT JOIN ideas i ON d.name = i.draft_name
        WHERE i.draft_name IS NULL
        LIMIT ?""",
        (limit,),
    )
    return [record["name"] for record in cursor.fetchall()]
def all_ideas(self) -> list[dict]:
    """Return every extracted idea in the database, ordered by draft name."""
    cursor = self.conn.execute(
        "SELECT * FROM ideas ORDER BY draft_name"
    )
    return [
        {
            "title": rec["title"],
            "description": rec["description"],
            "type": rec["idea_type"],
            "draft_name": rec["draft_name"],
        }
        for rec in cursor.fetchall()
    ]
def idea_count(self) -> int:
    """Total number of rows in the ideas table."""
    (total,) = self.conn.execute("SELECT COUNT(*) FROM ideas").fetchone()
    return total
# --- Gaps ---
def insert_gaps(self, gaps: list[dict]) -> None:
    """Replace the stored gap analysis with *gaps*.

    Each gap dict must carry "topic" and "description"; "category",
    "evidence", and "severity" are optional (severity defaults to
    "medium").  All rows share one analysis timestamp.
    """
    self.conn.execute("DELETE FROM gaps")  # Replace old analysis
    now = datetime.now(timezone.utc).isoformat()
    # executemany: one prepared statement for the whole batch instead of
    # a Python-level execute per gap.
    self.conn.executemany(
        """INSERT INTO gaps (topic, description, category, evidence, severity, analyzed_at)
        VALUES (?, ?, ?, ?, ?, ?)""",
        [
            (g["topic"], g["description"], g.get("category", ""),
             g.get("evidence", ""), g.get("severity", "medium"), now)
            for g in gaps
        ],
    )
    self.conn.commit()
def all_gaps(self) -> list[dict]:
    """Return every stored gap-analysis row as a dict, ordered by id."""
    result = []
    for rec in self.conn.execute("SELECT * FROM gaps ORDER BY id").fetchall():
        result.append({
            "id": rec["id"],
            "topic": rec["topic"],
            "description": rec["description"],
            "category": rec["category"],
            "evidence": rec["evidence"],
            "severity": rec["severity"],
        })
    return result
# --- Helpers ---
@staticmethod