IETF Draft Analyzer v0.1.0 — track, categorize, and rate AI/agent drafts

Python CLI tool that fetches AI/agent-related Internet-Drafts from the IETF
Datatracker, rates them using Claude, generates embeddings via Ollama for
similarity/clustering, and produces markdown reports.

Features:
- Fetch drafts by keyword from Datatracker API with full text download
- Batch analysis with Claude (token-optimized, responses cached in SQLite)
- Embedding-based similarity search and overlap cluster detection
- Reports: overview, landscape by category, overlap clusters, weekly digest
- SQLite with FTS5 for full-text search across 260 tracked drafts

Initial analysis of 260 drafts reveals OAuth agent auth (13 drafts) and
agent gateway/collaboration (10 drafts) as the most crowded clusters,
while AI safety/alignment is underserved with the highest quality scores.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 00:36:45 +01:00
commit 6771a4c235
17 changed files with 2823 additions and 0 deletions

375
src/ietf_analyzer/db.py Normal file
View File

@@ -0,0 +1,375 @@
"""SQLite database layer with FTS5 full-text search."""
from __future__ import annotations
import json
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from .config import Config
from .models import Draft, Rating
# Idempotent DDL executed on every connection open (see Database.conn).
# Tables:
#   drafts     — one row per Internet-Draft; list-valued fields (states,
#                categories, tags) are stored as JSON arrays in TEXT columns.
#   ratings    — one LLM rating per draft (1:1, PRIMARY KEY on draft_name).
#   embeddings — one vector per draft, raw float32 bytes (numpy .tobytes()).
#   llm_cache  — LLM request/response cache keyed by (draft_name, prompt_hash).
# drafts_fts is an external-content FTS5 index over drafts; the three triggers
# below keep it in sync on INSERT/DELETE/UPDATE via the FTS5 'delete' command
# (external-content tables require re-stating the old values on delete).
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
name TEXT PRIMARY KEY,
rev TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
time TEXT,
dt_id INTEGER,
pages INTEGER,
words INTEGER,
"group" TEXT,
group_uri TEXT,
expires TEXT,
ad TEXT,
shepherd TEXT,
states TEXT DEFAULT '[]', -- JSON array
full_text TEXT,
categories TEXT DEFAULT '[]', -- JSON array
tags TEXT DEFAULT '[]', -- JSON array
fetched_at TEXT
);
CREATE TABLE IF NOT EXISTS ratings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
novelty INTEGER NOT NULL,
maturity INTEGER NOT NULL,
overlap INTEGER NOT NULL,
momentum INTEGER NOT NULL,
relevance INTEGER NOT NULL,
summary TEXT NOT NULL DEFAULT '',
novelty_note TEXT DEFAULT '',
maturity_note TEXT DEFAULT '',
overlap_note TEXT DEFAULT '',
momentum_note TEXT DEFAULT '',
relevance_note TEXT DEFAULT '',
categories TEXT DEFAULT '[]', -- JSON array
rated_at TEXT
);
CREATE TABLE IF NOT EXISTS embeddings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
model TEXT NOT NULL,
vector BLOB NOT NULL, -- numpy float32 array as bytes
created_at TEXT
);
CREATE TABLE IF NOT EXISTS llm_cache (
draft_name TEXT NOT NULL,
prompt_hash TEXT NOT NULL,
model TEXT NOT NULL,
request_json TEXT NOT NULL, -- full prompt sent
response_json TEXT NOT NULL, -- raw Claude response
input_tokens INTEGER,
output_tokens INTEGER,
created_at TEXT,
PRIMARY KEY (draft_name, prompt_hash)
);
CREATE VIRTUAL TABLE IF NOT EXISTS drafts_fts USING fts5(
name, title, abstract, full_text,
content='drafts',
content_rowid='rowid'
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_ad AFTER DELETE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_au AFTER UPDATE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
"""
class Database:
    """SQLite persistence layer for drafts, ratings, embeddings, and the LLM cache.

    The connection is opened lazily on first access to :attr:`conn`; at that
    point WAL mode and foreign-key enforcement are enabled and the idempotent
    ``SCHEMA`` script (tables, FTS5 index, sync triggers) is applied.
    List-valued fields (states, categories, tags) are serialized to JSON text
    on write and parsed on read. Usable as a context manager::

        with Database() as db:
            db.count_drafts()
    """

    def __init__(self, config: Config | None = None):
        """Prepare the wrapper; no connection is opened yet.

        Args:
            config: Pre-loaded configuration; falls back to ``Config.load()``.
        """
        self.config = config or Config.load()
        self.db_path = self.config.db_path
        # Ensure the parent directory exists so sqlite3.connect() can create the file.
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._conn: sqlite3.Connection | None = None

    @property
    def conn(self) -> sqlite3.Connection:
        """Lazily opened connection (WAL journal, FK checks on, schema applied)."""
        if self._conn is None:
            self._conn = sqlite3.connect(self.db_path)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._conn.executescript(SCHEMA)
        return self._conn

    def close(self) -> None:
        """Close the underlying connection if open; safe to call repeatedly."""
        if self._conn:
            self._conn.close()
            self._conn = None

    def __enter__(self) -> "Database":
        """Support ``with Database() as db:`` — connection still opens lazily."""
        return self

    def __exit__(self, *exc_info) -> None:
        """Close the connection on context exit regardless of exceptions."""
        self.close()

    # --- Drafts ---
    def upsert_draft(self, draft: Draft) -> None:
        """Insert or update a draft row keyed by name.

        On update, ``full_text`` is COALESCE-preserved so a metadata-only
        refresh does not wipe previously downloaded text.
        """
        self.conn.execute(
            """INSERT INTO drafts (name, rev, title, abstract, time, dt_id, pages, words,
            "group", group_uri, expires, ad, shepherd, states, full_text, categories, tags, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
            rev=excluded.rev, title=excluded.title, abstract=excluded.abstract,
            time=excluded.time, dt_id=excluded.dt_id, pages=excluded.pages,
            words=excluded.words, "group"=excluded."group", group_uri=excluded.group_uri,
            expires=excluded.expires, ad=excluded.ad, shepherd=excluded.shepherd,
            states=excluded.states,
            full_text=COALESCE(excluded.full_text, full_text),
            categories=excluded.categories, tags=excluded.tags,
            fetched_at=excluded.fetched_at
            """,
            (
                draft.name, draft.rev, draft.title, draft.abstract, draft.time,
                draft.dt_id, draft.pages, draft.words, draft.group, draft.group_uri,
                draft.expires, draft.ad, draft.shepherd,
                json.dumps(draft.states), draft.full_text,
                json.dumps(draft.categories), json.dumps(draft.tags),
                draft.fetched_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_draft(self, name: str) -> Draft | None:
        """Return the draft with the given name, or None if absent."""
        row = self.conn.execute("SELECT * FROM drafts WHERE name = ?", (name,)).fetchone()
        if row is None:
            return None
        return self._row_to_draft(row)

    def list_drafts(
        self,
        limit: int = 100,
        offset: int = 0,
        order_by: str = "time DESC",
    ) -> list[Draft]:
        """Return a page of drafts ordered by a whitelisted column.

        ``order_by`` is parsed as ``"<column> [ASC|DESC]"`` and validated
        against a fixed whitelist (the column name is interpolated into SQL,
        so it must never come from the string verbatim). Unknown columns fall
        back to ``time``; unknown directions fall back to ``DESC``.
        """
        # FIX: "group" was missing from the whitelist, which made the quoting
        # branch below unreachable — ordering by group silently became "time".
        allowed = {"time", "name", "title", "pages", "words", "fetched_at", "group"}
        parts = order_by.split()
        col = parts[0] if parts else "time"
        direction = parts[1].upper() if len(parts) > 1 else "DESC"
        if col not in allowed:
            col = "time"
        if direction not in ("ASC", "DESC"):
            direction = "DESC"
        # "group" is an SQL keyword and must be double-quoted.
        safe_order = f'"{col}" {direction}' if col == "group" else f"{col} {direction}"
        rows = self.conn.execute(
            f"SELECT * FROM drafts ORDER BY {safe_order} LIMIT ? OFFSET ?",
            (limit, offset),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def count_drafts(self) -> int:
        """Return the total number of stored drafts."""
        return self.conn.execute("SELECT COUNT(*) FROM drafts").fetchone()[0]

    def search_drafts(self, query: str, limit: int = 50) -> list[Draft]:
        """Full-text search over name/title/abstract/full_text, best match first.

        Note: ``query`` uses FTS5 MATCH syntax; malformed syntax (e.g. stray
        quotes) raises ``sqlite3.OperationalError`` — callers should handle it.
        """
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            JOIN drafts_fts f ON d.rowid = f.rowid
            WHERE drafts_fts MATCH ?
            ORDER BY rank
            LIMIT ?""",
            (query, limit),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def drafts_without_text(self, limit: int = 100) -> list[Draft]:
        """Return drafts whose full text has not been downloaded yet."""
        rows = self.conn.execute(
            "SELECT * FROM drafts WHERE full_text IS NULL LIMIT ?", (limit,)
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    # --- Ratings ---
    def upsert_rating(self, rating: Rating) -> None:
        """Insert or replace the (single) rating for a draft."""
        self.conn.execute(
            """INSERT INTO ratings (draft_name, novelty, maturity, overlap, momentum, relevance,
            summary, novelty_note, maturity_note, overlap_note, momentum_note, relevance_note,
            categories, rated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            novelty=excluded.novelty, maturity=excluded.maturity, overlap=excluded.overlap,
            momentum=excluded.momentum, relevance=excluded.relevance, summary=excluded.summary,
            novelty_note=excluded.novelty_note, maturity_note=excluded.maturity_note,
            overlap_note=excluded.overlap_note, momentum_note=excluded.momentum_note,
            relevance_note=excluded.relevance_note, categories=excluded.categories,
            rated_at=excluded.rated_at
            """,
            (
                rating.draft_name, rating.novelty, rating.maturity, rating.overlap,
                rating.momentum, rating.relevance, rating.summary,
                rating.novelty_note, rating.maturity_note, rating.overlap_note,
                rating.momentum_note, rating.relevance_note,
                json.dumps(rating.categories),
                rating.rated_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_rating(self, draft_name: str) -> Rating | None:
        """Return the rating for a draft, or None if not yet rated."""
        row = self.conn.execute(
            "SELECT * FROM ratings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return self._row_to_rating(row)

    def unrated_drafts(self, limit: int = 100) -> list[Draft]:
        """Return drafts that have no rating row yet (anti-join)."""
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            LEFT JOIN ratings r ON d.name = r.draft_name
            WHERE r.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def drafts_with_ratings(self, limit: int = 200) -> list[tuple[Draft, Rating]]:
        """Return (draft, rating) pairs ordered by weighted composite score.

        Weights: novelty 0.30, relevance 0.25, maturity 0.20, momentum 0.15,
        inverted overlap 0.10 (lower overlap scores higher).
        """
        rows = self.conn.execute(
            """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance,
            r.summary, r.novelty_note, r.maturity_note, r.overlap_note,
            r.momentum_note, r.relevance_note, r.categories as r_categories, r.rated_at
            FROM drafts d
            JOIN ratings r ON d.name = r.draft_name
            ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20
            + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC
            LIMIT ?""",
            (limit,),
        ).fetchall()
        results = []
        for r in rows:
            draft = self._row_to_draft(r)
            rating = Rating(
                # The SELECT never projects r.draft_name, so it is always the
                # joined draft's name (the old "draft_name" in row check was dead).
                draft_name=draft.name,
                novelty=r["novelty"], maturity=r["maturity"], overlap=r["overlap"],
                momentum=r["momentum"], relevance=r["relevance"], summary=r["summary"],
                novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
                overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
                relevance_note=r["relevance_note"],
                categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
                rated_at=r["rated_at"],
            )
            results.append((draft, rating))
        return results

    # --- Embeddings ---
    def store_embedding(self, draft_name: str, model: str, vector: np.ndarray) -> None:
        """Insert or replace the embedding for a draft (stored as float32 bytes)."""
        self.conn.execute(
            """INSERT INTO embeddings (draft_name, model, vector, created_at)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            model=excluded.model, vector=excluded.vector, created_at=excluded.created_at
            """,
            (draft_name, model, vector.astype(np.float32).tobytes(),
             datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_embedding(self, draft_name: str) -> np.ndarray | None:
        """Return the stored embedding, or None.

        Note: the array is a read-only view over the BLOB bytes (np.frombuffer);
        callers needing in-place mutation should ``.copy()`` it.
        """
        row = self.conn.execute(
            "SELECT vector FROM embeddings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return np.frombuffer(row["vector"], dtype=np.float32)

    def all_embeddings(self) -> dict[str, np.ndarray]:
        """Return every stored embedding keyed by draft name (read-only arrays)."""
        rows = self.conn.execute("SELECT draft_name, vector FROM embeddings").fetchall()
        return {
            r["draft_name"]: np.frombuffer(r["vector"], dtype=np.float32)
            for r in rows
        }

    def drafts_without_embeddings(self, limit: int = 500) -> list[str]:
        """Return names of drafts with no embedding row yet (anti-join)."""
        rows = self.conn.execute(
            """SELECT d.name FROM drafts d
            LEFT JOIN embeddings e ON d.name = e.draft_name
            WHERE e.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [r["name"] for r in rows]

    # --- LLM Cache ---
    def cache_response(
        self, draft_name: str, prompt_hash: str, model: str,
        request_json: str, response_json: str,
        input_tokens: int = 0, output_tokens: int = 0,
    ) -> None:
        """Store (or refresh) a cached LLM response keyed by (draft, prompt hash).

        Token counts are kept so total spend can be summed later; the original
        ``request_json`` is preserved on conflict for reproducibility.
        """
        self.conn.execute(
            """INSERT INTO llm_cache (draft_name, prompt_hash, model, request_json,
            response_json, input_tokens, output_tokens, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name, prompt_hash) DO UPDATE SET
            model=excluded.model, response_json=excluded.response_json,
            input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens,
            created_at=excluded.created_at
            """,
            (draft_name, prompt_hash, model, request_json, response_json,
             input_tokens, output_tokens, datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_cached_response(self, draft_name: str, prompt_hash: str) -> str | None:
        """Return the cached raw response JSON, or None on cache miss."""
        row = self.conn.execute(
            "SELECT response_json FROM llm_cache WHERE draft_name = ? AND prompt_hash = ?",
            (draft_name, prompt_hash),
        ).fetchone()
        return row["response_json"] if row else None

    def total_tokens_used(self) -> tuple[int, int]:
        """Return (total input tokens, total output tokens) across the cache."""
        row = self.conn.execute(
            "SELECT COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0) FROM llm_cache"
        ).fetchone()
        return (row[0], row[1])

    # --- Helpers ---
    @staticmethod
    def _row_to_draft(row: sqlite3.Row) -> Draft:
        """Hydrate a Draft from a row; JSON columns are parsed, NULLs -> []/None."""
        d = dict(row)
        return Draft(
            name=d["name"], rev=d["rev"], title=d["title"], abstract=d["abstract"],
            time=d["time"], dt_id=d.get("dt_id"), pages=d.get("pages"),
            words=d.get("words"), group=d.get("group"), group_uri=d.get("group_uri"),
            expires=d.get("expires"), ad=d.get("ad"), shepherd=d.get("shepherd"),
            states=json.loads(d.get("states") or "[]"),
            full_text=d.get("full_text"),
            categories=json.loads(d.get("categories") or "[]"),
            tags=json.loads(d.get("tags") or "[]"),
            fetched_at=d.get("fetched_at"),
        )

    @staticmethod
    def _row_to_rating(row: sqlite3.Row) -> Rating:
        """Hydrate a Rating from a row; JSON categories parsed, NULL -> []."""
        d = dict(row)
        return Rating(
            draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
            overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
            summary=d["summary"],
            novelty_note=d.get("novelty_note", ""),
            maturity_note=d.get("maturity_note", ""),
            overlap_note=d.get("overlap_note", ""),
            momentum_note=d.get("momentum_note", ""),
            relevance_note=d.get("relevance_note", ""),
            categories=json.loads(d.get("categories") or "[]"),
            rated_at=d.get("rated_at"),
        )