IETF Draft Analyzer v0.1.0 — track, categorize, and rate AI/agent drafts

Python CLI tool that fetches AI/agent-related Internet-Drafts from the IETF
Datatracker, rates them using Claude, generates embeddings via Ollama for
similarity/clustering, and produces markdown reports.

Features:
- Fetch drafts by keyword from Datatracker API with full text download
- Batch analysis with Claude (token-optimized, responses cached in SQLite)
- Embedding-based similarity search and overlap cluster detection
- Reports: overview, landscape by category, overlap clusters, weekly digest
- SQLite with FTS5 for full-text search across 260 tracked drafts

Initial analysis of 260 drafts reveals OAuth agent auth (13 drafts) and
agent gateway/collaboration (10 drafts) as the most crowded clusters,
while AI safety/alignment is underserved with the highest quality scores.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 00:36:45 +01:00
commit 6771a4c235
17 changed files with 2823 additions and 0 deletions

375
src/ietf_analyzer/db.py Normal file
View File

@@ -0,0 +1,375 @@
"""SQLite database layer with FTS5 full-text search."""
from __future__ import annotations
import json
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from .config import Config
from .models import Draft, Rating
# Idempotent DDL executed on every connection open (see Database.conn).
# Tables:
#   drafts     — one row per Internet-Draft; list-valued fields (states,
#                categories, tags) are stored as JSON arrays in TEXT columns.
#   ratings    — one LLM rating per draft (1:1, PRIMARY KEY on draft_name).
#   embeddings — one vector per draft, raw float32 bytes (numpy .tobytes()).
#   llm_cache  — LLM request/response cache keyed by (draft_name, prompt_hash).
# drafts_fts is an external-content FTS5 index over drafts; the three triggers
# below keep it in sync on INSERT/DELETE/UPDATE via the FTS5 'delete' command
# (external-content tables require re-stating the old values on delete).
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
name TEXT PRIMARY KEY,
rev TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
time TEXT,
dt_id INTEGER,
pages INTEGER,
words INTEGER,
"group" TEXT,
group_uri TEXT,
expires TEXT,
ad TEXT,
shepherd TEXT,
states TEXT DEFAULT '[]', -- JSON array
full_text TEXT,
categories TEXT DEFAULT '[]', -- JSON array
tags TEXT DEFAULT '[]', -- JSON array
fetched_at TEXT
);
CREATE TABLE IF NOT EXISTS ratings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
novelty INTEGER NOT NULL,
maturity INTEGER NOT NULL,
overlap INTEGER NOT NULL,
momentum INTEGER NOT NULL,
relevance INTEGER NOT NULL,
summary TEXT NOT NULL DEFAULT '',
novelty_note TEXT DEFAULT '',
maturity_note TEXT DEFAULT '',
overlap_note TEXT DEFAULT '',
momentum_note TEXT DEFAULT '',
relevance_note TEXT DEFAULT '',
categories TEXT DEFAULT '[]', -- JSON array
rated_at TEXT
);
CREATE TABLE IF NOT EXISTS embeddings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
model TEXT NOT NULL,
vector BLOB NOT NULL, -- numpy float32 array as bytes
created_at TEXT
);
CREATE TABLE IF NOT EXISTS llm_cache (
draft_name TEXT NOT NULL,
prompt_hash TEXT NOT NULL,
model TEXT NOT NULL,
request_json TEXT NOT NULL, -- full prompt sent
response_json TEXT NOT NULL, -- raw Claude response
input_tokens INTEGER,
output_tokens INTEGER,
created_at TEXT,
PRIMARY KEY (draft_name, prompt_hash)
);
CREATE VIRTUAL TABLE IF NOT EXISTS drafts_fts USING fts5(
name, title, abstract, full_text,
content='drafts',
content_rowid='rowid'
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_ad AFTER DELETE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_au AFTER UPDATE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
"""
class Database:
    """SQLite persistence layer for drafts, ratings, embeddings, and the LLM cache.

    The connection is opened lazily on first access to :attr:`conn`; at that
    point WAL mode and foreign-key enforcement are enabled and the idempotent
    ``SCHEMA`` script (tables, FTS5 index, sync triggers) is applied.
    List-valued fields (states, categories, tags) are serialized to JSON text
    on write and parsed on read. Usable as a context manager::

        with Database() as db:
            db.count_drafts()
    """

    def __init__(self, config: Config | None = None):
        """Prepare the wrapper; no connection is opened yet.

        Args:
            config: Pre-loaded configuration; falls back to ``Config.load()``.
        """
        self.config = config or Config.load()
        self.db_path = self.config.db_path
        # Ensure the parent directory exists so sqlite3.connect() can create the file.
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._conn: sqlite3.Connection | None = None

    @property
    def conn(self) -> sqlite3.Connection:
        """Lazily opened connection (WAL journal, FK checks on, schema applied)."""
        if self._conn is None:
            self._conn = sqlite3.connect(self.db_path)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._conn.executescript(SCHEMA)
        return self._conn

    def close(self) -> None:
        """Close the underlying connection if open; safe to call repeatedly."""
        if self._conn:
            self._conn.close()
            self._conn = None

    def __enter__(self) -> "Database":
        """Support ``with Database() as db:`` — connection still opens lazily."""
        return self

    def __exit__(self, *exc_info) -> None:
        """Close the connection on context exit regardless of exceptions."""
        self.close()

    # --- Drafts ---
    def upsert_draft(self, draft: Draft) -> None:
        """Insert or update a draft row keyed by name.

        On update, ``full_text`` is COALESCE-preserved so a metadata-only
        refresh does not wipe previously downloaded text.
        """
        self.conn.execute(
            """INSERT INTO drafts (name, rev, title, abstract, time, dt_id, pages, words,
            "group", group_uri, expires, ad, shepherd, states, full_text, categories, tags, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
            rev=excluded.rev, title=excluded.title, abstract=excluded.abstract,
            time=excluded.time, dt_id=excluded.dt_id, pages=excluded.pages,
            words=excluded.words, "group"=excluded."group", group_uri=excluded.group_uri,
            expires=excluded.expires, ad=excluded.ad, shepherd=excluded.shepherd,
            states=excluded.states,
            full_text=COALESCE(excluded.full_text, full_text),
            categories=excluded.categories, tags=excluded.tags,
            fetched_at=excluded.fetched_at
            """,
            (
                draft.name, draft.rev, draft.title, draft.abstract, draft.time,
                draft.dt_id, draft.pages, draft.words, draft.group, draft.group_uri,
                draft.expires, draft.ad, draft.shepherd,
                json.dumps(draft.states), draft.full_text,
                json.dumps(draft.categories), json.dumps(draft.tags),
                draft.fetched_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_draft(self, name: str) -> Draft | None:
        """Return the draft with the given name, or None if absent."""
        row = self.conn.execute("SELECT * FROM drafts WHERE name = ?", (name,)).fetchone()
        if row is None:
            return None
        return self._row_to_draft(row)

    def list_drafts(
        self,
        limit: int = 100,
        offset: int = 0,
        order_by: str = "time DESC",
    ) -> list[Draft]:
        """Return a page of drafts ordered by a whitelisted column.

        ``order_by`` is parsed as ``"<column> [ASC|DESC]"`` and validated
        against a fixed whitelist (the column name is interpolated into SQL,
        so it must never come from the string verbatim). Unknown columns fall
        back to ``time``; unknown directions fall back to ``DESC``.
        """
        # FIX: "group" was missing from the whitelist, which made the quoting
        # branch below unreachable — ordering by group silently became "time".
        allowed = {"time", "name", "title", "pages", "words", "fetched_at", "group"}
        parts = order_by.split()
        col = parts[0] if parts else "time"
        direction = parts[1].upper() if len(parts) > 1 else "DESC"
        if col not in allowed:
            col = "time"
        if direction not in ("ASC", "DESC"):
            direction = "DESC"
        # "group" is an SQL keyword and must be double-quoted.
        safe_order = f'"{col}" {direction}' if col == "group" else f"{col} {direction}"
        rows = self.conn.execute(
            f"SELECT * FROM drafts ORDER BY {safe_order} LIMIT ? OFFSET ?",
            (limit, offset),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def count_drafts(self) -> int:
        """Return the total number of stored drafts."""
        return self.conn.execute("SELECT COUNT(*) FROM drafts").fetchone()[0]

    def search_drafts(self, query: str, limit: int = 50) -> list[Draft]:
        """Full-text search over name/title/abstract/full_text, best match first.

        Note: ``query`` uses FTS5 MATCH syntax; malformed syntax (e.g. stray
        quotes) raises ``sqlite3.OperationalError`` — callers should handle it.
        """
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            JOIN drafts_fts f ON d.rowid = f.rowid
            WHERE drafts_fts MATCH ?
            ORDER BY rank
            LIMIT ?""",
            (query, limit),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def drafts_without_text(self, limit: int = 100) -> list[Draft]:
        """Return drafts whose full text has not been downloaded yet."""
        rows = self.conn.execute(
            "SELECT * FROM drafts WHERE full_text IS NULL LIMIT ?", (limit,)
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    # --- Ratings ---
    def upsert_rating(self, rating: Rating) -> None:
        """Insert or replace the (single) rating for a draft."""
        self.conn.execute(
            """INSERT INTO ratings (draft_name, novelty, maturity, overlap, momentum, relevance,
            summary, novelty_note, maturity_note, overlap_note, momentum_note, relevance_note,
            categories, rated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            novelty=excluded.novelty, maturity=excluded.maturity, overlap=excluded.overlap,
            momentum=excluded.momentum, relevance=excluded.relevance, summary=excluded.summary,
            novelty_note=excluded.novelty_note, maturity_note=excluded.maturity_note,
            overlap_note=excluded.overlap_note, momentum_note=excluded.momentum_note,
            relevance_note=excluded.relevance_note, categories=excluded.categories,
            rated_at=excluded.rated_at
            """,
            (
                rating.draft_name, rating.novelty, rating.maturity, rating.overlap,
                rating.momentum, rating.relevance, rating.summary,
                rating.novelty_note, rating.maturity_note, rating.overlap_note,
                rating.momentum_note, rating.relevance_note,
                json.dumps(rating.categories),
                rating.rated_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_rating(self, draft_name: str) -> Rating | None:
        """Return the rating for a draft, or None if not yet rated."""
        row = self.conn.execute(
            "SELECT * FROM ratings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return self._row_to_rating(row)

    def unrated_drafts(self, limit: int = 100) -> list[Draft]:
        """Return drafts that have no rating row yet (anti-join)."""
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            LEFT JOIN ratings r ON d.name = r.draft_name
            WHERE r.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def drafts_with_ratings(self, limit: int = 200) -> list[tuple[Draft, Rating]]:
        """Return (draft, rating) pairs ordered by weighted composite score.

        Weights: novelty 0.30, relevance 0.25, maturity 0.20, momentum 0.15,
        inverted overlap 0.10 (lower overlap scores higher).
        """
        rows = self.conn.execute(
            """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance,
            r.summary, r.novelty_note, r.maturity_note, r.overlap_note,
            r.momentum_note, r.relevance_note, r.categories as r_categories, r.rated_at
            FROM drafts d
            JOIN ratings r ON d.name = r.draft_name
            ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20
            + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC
            LIMIT ?""",
            (limit,),
        ).fetchall()
        results = []
        for r in rows:
            draft = self._row_to_draft(r)
            rating = Rating(
                # The SELECT never projects r.draft_name, so it is always the
                # joined draft's name (the old "draft_name" in row check was dead).
                draft_name=draft.name,
                novelty=r["novelty"], maturity=r["maturity"], overlap=r["overlap"],
                momentum=r["momentum"], relevance=r["relevance"], summary=r["summary"],
                novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
                overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
                relevance_note=r["relevance_note"],
                categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
                rated_at=r["rated_at"],
            )
            results.append((draft, rating))
        return results

    # --- Embeddings ---
    def store_embedding(self, draft_name: str, model: str, vector: np.ndarray) -> None:
        """Insert or replace the embedding for a draft (stored as float32 bytes)."""
        self.conn.execute(
            """INSERT INTO embeddings (draft_name, model, vector, created_at)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            model=excluded.model, vector=excluded.vector, created_at=excluded.created_at
            """,
            (draft_name, model, vector.astype(np.float32).tobytes(),
             datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_embedding(self, draft_name: str) -> np.ndarray | None:
        """Return the stored embedding, or None.

        Note: the array is a read-only view over the BLOB bytes (np.frombuffer);
        callers needing in-place mutation should ``.copy()`` it.
        """
        row = self.conn.execute(
            "SELECT vector FROM embeddings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return np.frombuffer(row["vector"], dtype=np.float32)

    def all_embeddings(self) -> dict[str, np.ndarray]:
        """Return every stored embedding keyed by draft name (read-only arrays)."""
        rows = self.conn.execute("SELECT draft_name, vector FROM embeddings").fetchall()
        return {
            r["draft_name"]: np.frombuffer(r["vector"], dtype=np.float32)
            for r in rows
        }

    def drafts_without_embeddings(self, limit: int = 500) -> list[str]:
        """Return names of drafts with no embedding row yet (anti-join)."""
        rows = self.conn.execute(
            """SELECT d.name FROM drafts d
            LEFT JOIN embeddings e ON d.name = e.draft_name
            WHERE e.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [r["name"] for r in rows]

    # --- LLM Cache ---
    def cache_response(
        self, draft_name: str, prompt_hash: str, model: str,
        request_json: str, response_json: str,
        input_tokens: int = 0, output_tokens: int = 0,
    ) -> None:
        """Store (or refresh) a cached LLM response keyed by (draft, prompt hash).

        Token counts are kept so total spend can be summed later; the original
        ``request_json`` is preserved on conflict for reproducibility.
        """
        self.conn.execute(
            """INSERT INTO llm_cache (draft_name, prompt_hash, model, request_json,
            response_json, input_tokens, output_tokens, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name, prompt_hash) DO UPDATE SET
            model=excluded.model, response_json=excluded.response_json,
            input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens,
            created_at=excluded.created_at
            """,
            (draft_name, prompt_hash, model, request_json, response_json,
             input_tokens, output_tokens, datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_cached_response(self, draft_name: str, prompt_hash: str) -> str | None:
        """Return the cached raw response JSON, or None on cache miss."""
        row = self.conn.execute(
            "SELECT response_json FROM llm_cache WHERE draft_name = ? AND prompt_hash = ?",
            (draft_name, prompt_hash),
        ).fetchone()
        return row["response_json"] if row else None

    def total_tokens_used(self) -> tuple[int, int]:
        """Return (total input tokens, total output tokens) across the cache."""
        row = self.conn.execute(
            "SELECT COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0) FROM llm_cache"
        ).fetchone()
        return (row[0], row[1])

    # --- Helpers ---
    @staticmethod
    def _row_to_draft(row: sqlite3.Row) -> Draft:
        """Hydrate a Draft from a row; JSON columns are parsed, NULLs -> []/None."""
        d = dict(row)
        return Draft(
            name=d["name"], rev=d["rev"], title=d["title"], abstract=d["abstract"],
            time=d["time"], dt_id=d.get("dt_id"), pages=d.get("pages"),
            words=d.get("words"), group=d.get("group"), group_uri=d.get("group_uri"),
            expires=d.get("expires"), ad=d.get("ad"), shepherd=d.get("shepherd"),
            states=json.loads(d.get("states") or "[]"),
            full_text=d.get("full_text"),
            categories=json.loads(d.get("categories") or "[]"),
            tags=json.loads(d.get("tags") or "[]"),
            fetched_at=d.get("fetched_at"),
        )

    @staticmethod
    def _row_to_rating(row: sqlite3.Row) -> Rating:
        """Hydrate a Rating from a row; JSON categories parsed, NULL -> []."""
        d = dict(row)
        return Rating(
            draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
            overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
            summary=d["summary"],
            novelty_note=d.get("novelty_note", ""),
            maturity_note=d.get("maturity_note", ""),
            overlap_note=d.get("overlap_note", ""),
            momentum_note=d.get("momentum_note", ""),
            relevance_note=d.get("relevance_note", ""),
            categories=json.loads(d.get("categories") or "[]"),
            rated_at=d.get("rated_at"),
        )