IETF Draft Analyzer v0.1.0 — track, categorize, and rate AI/agent drafts
Python CLI tool that fetches AI/agent-related Internet-Drafts from the IETF Datatracker, rates them using Claude, generates embeddings via Ollama for similarity/clustering, and produces markdown reports.

Features:
- Fetch drafts by keyword from the Datatracker API, with full text download
- Batch analysis with Claude (token-optimized, responses cached in SQLite)
- Embedding-based similarity search and overlap cluster detection
- Reports: overview, landscape by category, overlap clusters, weekly digest
- SQLite with FTS5 for full-text search across 260 tracked drafts

Initial analysis of 260 drafts reveals OAuth agent auth (13 drafts) and agent gateway/collaboration (10 drafts) as the most crowded clusters, while AI safety/alignment is underserved yet carries the highest quality scores.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
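For orientation, a minimal sketch of how the Database layer in src/ietf_analyzer/db.py (added below) might be driven. The Draft constructor arguments, their defaults, and the embedding model name are assumptions, since models.py and the Ollama configuration are not part of this commit.

import numpy as np
from ietf_analyzer.db import Database
from ietf_analyzer.models import Draft

db = Database()  # resolves the SQLite path via Config.load()

# Hypothetical draft record; unspecified Draft fields are assumed to default in models.py.
draft = Draft(
    name="draft-example-agent-auth",  # illustrative name, not a real I-D
    rev="00",
    title="Example Agent Authentication",
    abstract="Illustrative abstract text.",
    time="2025-01-01T00:00:00Z",
)
db.upsert_draft(draft)  # insert-or-update; the FTS5 index stays in sync via triggers

# "nomic-embed-text" is an illustrative Ollama model name, not necessarily the configured one.
db.store_embedding(draft.name, "nomic-embed-text", np.random.rand(768).astype(np.float32))

print(db.count_drafts())
print([d.name for d in db.search_drafts("agent AND authentication")])
db.close()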
375
src/ietf_analyzer/db.py
Normal file
@@ -0,0 +1,375 @@
"""SQLite database layer with FTS5 full-text search."""

from __future__ import annotations

import json
import sqlite3
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

from .config import Config
from .models import Draft, Rating

SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
    name TEXT PRIMARY KEY,
    rev TEXT NOT NULL,
    title TEXT NOT NULL,
    abstract TEXT NOT NULL DEFAULT '',
    time TEXT,
    dt_id INTEGER,
    pages INTEGER,
    words INTEGER,
    "group" TEXT,
    group_uri TEXT,
    expires TEXT,
    ad TEXT,
    shepherd TEXT,
    states TEXT DEFAULT '[]',      -- JSON array
    full_text TEXT,
    categories TEXT DEFAULT '[]',  -- JSON array
    tags TEXT DEFAULT '[]',        -- JSON array
    fetched_at TEXT
);

CREATE TABLE IF NOT EXISTS ratings (
    draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
    novelty INTEGER NOT NULL,
    maturity INTEGER NOT NULL,
    overlap INTEGER NOT NULL,
    momentum INTEGER NOT NULL,
    relevance INTEGER NOT NULL,
    summary TEXT NOT NULL DEFAULT '',
    novelty_note TEXT DEFAULT '',
    maturity_note TEXT DEFAULT '',
    overlap_note TEXT DEFAULT '',
    momentum_note TEXT DEFAULT '',
    relevance_note TEXT DEFAULT '',
    categories TEXT DEFAULT '[]',  -- JSON array
    rated_at TEXT
);

CREATE TABLE IF NOT EXISTS embeddings (
    draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
    model TEXT NOT NULL,
    vector BLOB NOT NULL,  -- numpy float32 array as bytes
    created_at TEXT
);

CREATE TABLE IF NOT EXISTS llm_cache (
    draft_name TEXT NOT NULL,
    prompt_hash TEXT NOT NULL,
    model TEXT NOT NULL,
    request_json TEXT NOT NULL,   -- full prompt sent
    response_json TEXT NOT NULL,  -- raw Claude response
    input_tokens INTEGER,
    output_tokens INTEGER,
    created_at TEXT,
    PRIMARY KEY (draft_name, prompt_hash)
);

CREATE VIRTUAL TABLE IF NOT EXISTS drafts_fts USING fts5(
    name, title, abstract, full_text,
    content='drafts',
    content_rowid='rowid'
);

-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
    INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
    VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;

CREATE TRIGGER IF NOT EXISTS drafts_ad AFTER DELETE ON drafts BEGIN
    INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
    VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
END;

CREATE TRIGGER IF NOT EXISTS drafts_au AFTER UPDATE ON drafts BEGIN
    INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
    VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
    INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
    VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
"""


class Database:
    """SQLite-backed store for drafts, ratings, embeddings, and cached LLM responses."""

    def __init__(self, config: Config | None = None):
        self.config = config or Config.load()
        self.db_path = self.config.db_path
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._conn: sqlite3.Connection | None = None

    @property
    def conn(self) -> sqlite3.Connection:
        # Lazily open the connection, enable WAL and foreign keys, and apply the schema.
        if self._conn is None:
            self._conn = sqlite3.connect(self.db_path)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._conn.executescript(SCHEMA)
        return self._conn

    def close(self) -> None:
        if self._conn:
            self._conn.close()
            self._conn = None

    # --- Drafts ---

    def upsert_draft(self, draft: Draft) -> None:
        """Insert or update a draft; a NULL incoming full_text keeps the stored text."""
        self.conn.execute(
            """INSERT INTO drafts (name, rev, title, abstract, time, dt_id, pages, words,
                "group", group_uri, expires, ad, shepherd, states, full_text, categories, tags, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
                rev=excluded.rev, title=excluded.title, abstract=excluded.abstract,
                time=excluded.time, dt_id=excluded.dt_id, pages=excluded.pages,
                words=excluded.words, "group"=excluded."group", group_uri=excluded.group_uri,
                expires=excluded.expires, ad=excluded.ad, shepherd=excluded.shepherd,
                states=excluded.states,
                full_text=COALESCE(excluded.full_text, full_text),
                categories=excluded.categories, tags=excluded.tags,
                fetched_at=excluded.fetched_at
            """,
            (
                draft.name, draft.rev, draft.title, draft.abstract, draft.time,
                draft.dt_id, draft.pages, draft.words, draft.group, draft.group_uri,
                draft.expires, draft.ad, draft.shepherd,
                json.dumps(draft.states), draft.full_text,
                json.dumps(draft.categories), json.dumps(draft.tags),
                draft.fetched_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_draft(self, name: str) -> Draft | None:
        row = self.conn.execute("SELECT * FROM drafts WHERE name = ?", (name,)).fetchone()
        if row is None:
            return None
        return self._row_to_draft(row)

    def list_drafts(
        self,
        limit: int = 100,
        offset: int = 0,
        order_by: str = "time DESC",
    ) -> list[Draft]:
        # Sanitize order_by to prevent injection; "group" is allowed but must be quoted
        # below because it is an SQL reserved word.
        allowed = {"time", "name", "title", "pages", "words", "fetched_at", "group"}
        parts = order_by.split()
        col = parts[0] if parts else "time"
        direction = parts[1].upper() if len(parts) > 1 else "DESC"
        if col not in allowed:
            col = "time"
        if direction not in ("ASC", "DESC"):
            direction = "DESC"
        safe_order = f'"{col}" {direction}' if col == "group" else f"{col} {direction}"

        rows = self.conn.execute(
            f"SELECT * FROM drafts ORDER BY {safe_order} LIMIT ? OFFSET ?",
            (limit, offset),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def count_drafts(self) -> int:
        return self.conn.execute("SELECT COUNT(*) FROM drafts").fetchone()[0]

    def search_drafts(self, query: str, limit: int = 50) -> list[Draft]:
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            JOIN drafts_fts f ON d.rowid = f.rowid
            WHERE drafts_fts MATCH ?
            ORDER BY rank
            LIMIT ?""",
            (query, limit),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]
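
    # search_drafts passes the query string straight to FTS5 MATCH, so FTS5 query syntax
    # applies, e.g. search_drafts('oauth AND agent') or search_drafts('title:gateway').
    # ORDER BY rank returns the best match first.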

    def drafts_without_text(self, limit: int = 100) -> list[Draft]:
        rows = self.conn.execute(
            "SELECT * FROM drafts WHERE full_text IS NULL LIMIT ?", (limit,)
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    # --- Ratings ---

    def upsert_rating(self, rating: Rating) -> None:
        self.conn.execute(
            """INSERT INTO ratings (draft_name, novelty, maturity, overlap, momentum, relevance,
                summary, novelty_note, maturity_note, overlap_note, momentum_note, relevance_note,
                categories, rated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
                novelty=excluded.novelty, maturity=excluded.maturity, overlap=excluded.overlap,
                momentum=excluded.momentum, relevance=excluded.relevance, summary=excluded.summary,
                novelty_note=excluded.novelty_note, maturity_note=excluded.maturity_note,
                overlap_note=excluded.overlap_note, momentum_note=excluded.momentum_note,
                relevance_note=excluded.relevance_note, categories=excluded.categories,
                rated_at=excluded.rated_at
            """,
            (
                rating.draft_name, rating.novelty, rating.maturity, rating.overlap,
                rating.momentum, rating.relevance, rating.summary,
                rating.novelty_note, rating.maturity_note, rating.overlap_note,
                rating.momentum_note, rating.relevance_note,
                json.dumps(rating.categories),
                rating.rated_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def get_rating(self, draft_name: str) -> Rating | None:
        row = self.conn.execute(
            "SELECT * FROM ratings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return self._row_to_rating(row)

    def unrated_drafts(self, limit: int = 100) -> list[Draft]:
        rows = self.conn.execute(
            """SELECT d.* FROM drafts d
            LEFT JOIN ratings r ON d.name = r.draft_name
            WHERE r.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [self._row_to_draft(r) for r in rows]

    def drafts_with_ratings(self, limit: int = 200) -> list[tuple[Draft, Rating]]:
        rows = self.conn.execute(
            """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance,
                r.summary, r.novelty_note, r.maturity_note, r.overlap_note,
                r.momentum_note, r.relevance_note, r.categories as r_categories, r.rated_at
            FROM drafts d
            JOIN ratings r ON d.name = r.draft_name
            ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20
                      + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC
            LIMIT ?""",
            (limit,),
        ).fetchall()
        results = []
        for r in rows:
            draft = self._row_to_draft(r)
            rating = Rating(
                draft_name=r["draft_name"] if "draft_name" in r.keys() else draft.name,
                novelty=r["novelty"], maturity=r["maturity"], overlap=r["overlap"],
                momentum=r["momentum"], relevance=r["relevance"], summary=r["summary"],
                novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
                overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
                relevance_note=r["relevance_note"],
                categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
                rated_at=r["rated_at"],
            )
            results.append((draft, rating))
        return results
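
    # The ORDER BY above is a weighted composite: 30% novelty, 25% relevance, 20% maturity,
    # 15% momentum, and 10% inverted overlap ((6 - overlap), so drafts overlapping less with
    # existing work rank higher); the inversion implies ratings on a 1-5 scale.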

    # --- Embeddings ---

    def store_embedding(self, draft_name: str, model: str, vector: np.ndarray) -> None:
        self.conn.execute(
            """INSERT INTO embeddings (draft_name, model, vector, created_at)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
                model=excluded.model, vector=excluded.vector, created_at=excluded.created_at
            """,
            (draft_name, model, vector.astype(np.float32).tobytes(),
             datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_embedding(self, draft_name: str) -> np.ndarray | None:
        row = self.conn.execute(
            "SELECT vector FROM embeddings WHERE draft_name = ?", (draft_name,)
        ).fetchone()
        if row is None:
            return None
        return np.frombuffer(row["vector"], dtype=np.float32)

    def all_embeddings(self) -> dict[str, np.ndarray]:
        rows = self.conn.execute("SELECT draft_name, vector FROM embeddings").fetchall()
        return {
            r["draft_name"]: np.frombuffer(r["vector"], dtype=np.float32)
            for r in rows
        }
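
    # Embeddings round-trip as float32 numpy arrays, so cosine similarity between two
    # drafts is a one-liner on the caller's side, e.g.:
    #   a, b = db.get_embedding(n1), db.get_embedding(n2)
    #   sim = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))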

    def drafts_without_embeddings(self, limit: int = 500) -> list[str]:
        rows = self.conn.execute(
            """SELECT d.name FROM drafts d
            LEFT JOIN embeddings e ON d.name = e.draft_name
            WHERE e.draft_name IS NULL
            LIMIT ?""",
            (limit,),
        ).fetchall()
        return [r["name"] for r in rows]

    # --- LLM Cache ---

    def cache_response(
        self, draft_name: str, prompt_hash: str, model: str,
        request_json: str, response_json: str,
        input_tokens: int = 0, output_tokens: int = 0,
    ) -> None:
        self.conn.execute(
            """INSERT INTO llm_cache (draft_name, prompt_hash, model, request_json,
                response_json, input_tokens, output_tokens, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name, prompt_hash) DO UPDATE SET
                model=excluded.model, response_json=excluded.response_json,
                input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens,
                created_at=excluded.created_at
            """,
            (draft_name, prompt_hash, model, request_json, response_json,
             input_tokens, output_tokens, datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()

    def get_cached_response(self, draft_name: str, prompt_hash: str) -> str | None:
        row = self.conn.execute(
            "SELECT response_json FROM llm_cache WHERE draft_name = ? AND prompt_hash = ?",
            (draft_name, prompt_hash),
        ).fetchone()
        return row["response_json"] if row else None
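
    # prompt_hash is computed by the caller; a stable digest of the rendered prompt
    # (for example hashlib.sha256(prompt.encode()).hexdigest()) is assumed, so repeat
    # analysis runs can reuse cached Claude responses instead of re-spending tokens.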

    def total_tokens_used(self) -> tuple[int, int]:
        row = self.conn.execute(
            "SELECT COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0) FROM llm_cache"
        ).fetchone()
        return (row[0], row[1])

    # --- Helpers ---

    @staticmethod
    def _row_to_draft(row: sqlite3.Row) -> Draft:
        d = dict(row)
        return Draft(
            name=d["name"], rev=d["rev"], title=d["title"], abstract=d["abstract"],
            time=d["time"], dt_id=d.get("dt_id"), pages=d.get("pages"),
            words=d.get("words"), group=d.get("group"), group_uri=d.get("group_uri"),
            expires=d.get("expires"), ad=d.get("ad"), shepherd=d.get("shepherd"),
            states=json.loads(d.get("states") or "[]"),
            full_text=d.get("full_text"),
            categories=json.loads(d.get("categories") or "[]"),
            tags=json.loads(d.get("tags") or "[]"),
            fetched_at=d.get("fetched_at"),
        )

    @staticmethod
    def _row_to_rating(row: sqlite3.Row) -> Rating:
        d = dict(row)
        return Rating(
            draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
            overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
            summary=d["summary"],
            novelty_note=d.get("novelty_note", ""),
            maturity_note=d.get("maturity_note", ""),
            overlap_note=d.get("overlap_note", ""),
            momentum_note=d.get("momentum_note", ""),
            relevance_note=d.get("relevance_note", ""),
            categories=json.loads(d.get("categories") or "[]"),
            rated_at=d.get("rated_at"),
        )