IETF Draft Analyzer v0.1.0 — track, categorize, and rate AI/agent drafts

Python CLI tool that fetches AI/agent-related Internet-Drafts from the IETF
Datatracker, rates them using Claude, generates embeddings via Ollama for
similarity/clustering, and produces markdown reports.

Features:
- Fetch drafts by keyword from Datatracker API with full text download
- Batch analysis with Claude (token-optimized, responses cached in SQLite)
- Embedding-based similarity search and overlap cluster detection
- Reports: overview, landscape by category, overlap clusters, weekly digest
- SQLite with FTS5 for full-text search across 260 tracked drafts

Initial analysis of 260 drafts reveals OAuth agent auth (13 drafts) and
agent gateway/collaboration (10 drafts) as the most crowded clusters,
while AI safety/alignment is underserved with the highest quality scores.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 00:36:45 +01:00
commit 6771a4c235
17 changed files with 2823 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""IETF Draft Analyzer — Track, categorize, and rate AI/agent-related Internet-Drafts."""

View File

@@ -0,0 +1,276 @@
"""Claude-based analysis — summarization, rating, categorization, overlap detection."""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
import anthropic
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
from .config import Config
from .db import Database
from .models import Draft, Rating
console = Console()
CATEGORIES_SHORT = [
"A2A protocols", # Agent-to-agent communication protocols
"AI safety/alignment", # AI safety / guardrails / alignment
"ML traffic mgmt", # ML-based traffic management / optimization
"Autonomous netops", # Autonomous network operations
"Agent identity/auth", # Identity / authentication for AI agents
"Data formats/interop",# Data formats / semantics for AI interop
"Policy/governance", # Policy / governance / ethical frameworks
"Model serving/inference", # AI model serving / inference protocols
"Agent discovery/reg", # Agent discovery / registration
"Human-agent interaction",
"Other AI/agent",
]
# Compact single-draft rating prompt — abstract only, saves ~10x tokens vs
# sending full draft text. The short JSON keys ("n", "nn", "m", ...) are
# mapped back to full field names by Analyzer._parse_rating.
RATE_PROMPT_COMPACT = """\
Rate this IETF draft. JSON only.
{name} | {title} | {time} | {pages}pg
Abstract: {abstract}
Return JSON: {{"s":"2-3 sentence summary","n":<1-5>,"nn":"novelty note","m":<1-5>,"mn":"maturity note","o":<1-5>,"on":"overlap note","mo":<1-5>,"mon":"momentum note","r":<1-5>,"rn":"relevance note","c":["categories"]}}
Scale: 1=very low..5=very high. Overlap: 1=unique,5=heavy overlap.
Categories: {categories}
JSON only, no fences."""
# Batch prompt — rates several drafts in one API call. Expects a JSON array
# with one object per draft, in the same order as {drafts_block}; each object
# also carries "name" so results can be matched back defensively.
BATCH_PROMPT = """\
Rate each IETF draft below. Return a JSON array with one object per draft, in order.
{drafts_block}
Per-draft JSON: {{"name":"draft-name","s":"2-3 sentence summary","n":<1-5>,"nn":"novelty note","m":<1-5>,"mn":"maturity note","o":<1-5>,"on":"overlap with known drafts","mo":<1-5>,"mon":"momentum note","r":<1-5>,"rn":"relevance note","c":["categories"]}}
Scale: 1=very low..5=very high. Overlap: 1=unique,5=heavy overlap.
Categories: {categories}
Return ONLY a JSON array, no fences."""
# Free-form comparison prompt used by Analyzer.compare_drafts.
COMPARE_PROMPT = """\
Compare these IETF drafts — overlaps, unique ideas, complementary vs competing vs redundant.
{drafts_section}
Be specific about concrete mechanisms and design choices."""
def _prompt_hash(text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()[:16]
class Analyzer:
    """Claude-backed analysis of IETF drafts: rating, categorization, comparison.

    Responsibilities:
    - build token-compact prompts (abstract-only, short JSON keys),
    - call Claude and cache raw responses in SQLite keyed by prompt hash,
    - parse responses into Rating objects and persist them via Database.
    """

    def __init__(self, config: Config | None = None, db: Database | None = None):
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        try:
            # anthropic.Anthropic() reads ANTHROPIC_API_KEY from the environment.
            self.client = anthropic.Anthropic()
        except Exception:
            console.print(
                "[red bold]No Anthropic API key found.[/]\n"
                "Set ANTHROPIC_API_KEY environment variable or run:\n"
                " export ANTHROPIC_API_KEY=sk-ant-..."
            )
            raise SystemExit(1)

    def _parse_rating(self, draft_name: str, data: dict) -> Rating:
        """Build a Rating from compact ("n") or long ("novelty") JSON keys.

        Missing scores default to 3 (the scale midpoint). May raise
        ValueError/TypeError when a score value is not int-coercible;
        callers are expected to handle that.
        """
        return Rating(
            draft_name=draft_name,
            novelty=int(data.get("n", data.get("novelty", 3))),
            maturity=int(data.get("m", data.get("maturity", 3))),
            overlap=int(data.get("o", data.get("overlap", 3))),
            momentum=int(data.get("mo", data.get("momentum", 3))),
            relevance=int(data.get("r", data.get("relevance", 3))),
            summary=data.get("s", data.get("summary", "")),
            novelty_note=data.get("nn", data.get("novelty_note", "")),
            maturity_note=data.get("mn", data.get("maturity_note", "")),
            overlap_note=data.get("on", data.get("overlap_note", "")),
            momentum_note=data.get("mon", data.get("momentum_note", "")),
            relevance_note=data.get("rn", data.get("relevance_note", "")),
            categories=data.get("c", data.get("categories", [])),
            rated_at=datetime.now(timezone.utc).isoformat(),
        )

    def _call_claude(self, prompt: str, max_tokens: int = 512) -> tuple[str, int, int]:
        """Call Claude and return (text, input_tokens, output_tokens)."""
        resp = self.client.messages.create(
            model=self.config.claude_model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )
        text = resp.content[0].text.strip()
        return text, resp.usage.input_tokens, resp.usage.output_tokens

    def _extract_json(self, text: str) -> str:
        """Strip surrounding markdown code fences, if the model added any."""
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json).
            text = text.split("\n", 1)[1]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    def rate_draft(self, draft_name: str, use_cache: bool = True) -> Rating | None:
        """Analyze and rate a single draft; returns None on failure.

        Responses are cached in SQLite keyed by (draft, prompt hash), so
        re-running with an unchanged prompt costs no API tokens.
        """
        draft = self.db.get_draft(draft_name)
        if draft is None:
            console.print(f"[red]Draft not found: {draft_name}[/]")
            return None
        prompt = RATE_PROMPT_COMPACT.format(
            name=draft.name, title=draft.title, time=draft.date,
            pages=draft.pages or "?",
            abstract=draft.abstract[:2000],
            categories=", ".join(CATEGORIES_SHORT),
        )
        phash = _prompt_hash(prompt)
        # Check cache
        if use_cache:
            cached = self.db.get_cached_response(draft_name, phash)
            if cached:
                try:
                    data = json.loads(cached)
                    rating = self._parse_rating(draft_name, data)
                    self.db.upsert_rating(rating)
                    draft.categories = rating.categories
                    self.db.upsert_draft(draft)
                    return rating
                # BUGFIX: also catch ValueError/TypeError — int() on a
                # malformed cached score previously escaped this handler.
                except (json.JSONDecodeError, KeyError, ValueError, TypeError):
                    pass  # Re-analyze if cache is corrupt
        try:
            text, in_tok, out_tok = self._call_claude(prompt, max_tokens=512)
            text = self._extract_json(text)
            data = json.loads(text)
            # Cache the raw (fence-stripped) response
            self.db.cache_response(
                draft_name, phash, self.config.claude_model,
                prompt, text, in_tok, out_tok,
            )
        except (json.JSONDecodeError, anthropic.APIError, IndexError, KeyError) as e:
            console.print(f"[red]Failed {draft_name}: {e}[/]")
            return None
        # BUGFIX: guard rating construction — int() on malformed score values
        # raises ValueError/TypeError, which previously crashed the whole run.
        try:
            rating = self._parse_rating(draft_name, data)
        except (ValueError, TypeError, AttributeError) as e:
            console.print(f"[red]Failed {draft_name}: {e}[/]")
            return None
        self.db.upsert_rating(rating)
        draft.categories = rating.categories
        self.db.upsert_draft(draft)
        return rating

    def rate_batch(self, drafts: list[Draft], batch_size: int = 5) -> int:
        """Rate drafts in batched API calls to save tokens; returns count rated.

        A whole-batch failure (API error, unparseable JSON) falls back to
        per-draft rating. Individually malformed entries inside an otherwise
        good batch response are skipped with a warning instead of aborting.
        """
        count = 0
        for i in range(0, len(drafts), batch_size):
            batch = drafts[i:i + batch_size]
            # Build batch prompt: one compact header + truncated abstract each.
            drafts_block = ""
            for d in batch:
                drafts_block += f"\n---\n{d.name} | {d.title} | {d.date} | {d.pages or '?'}pg\nAbstract: {d.abstract[:1500]}\n"
            prompt = BATCH_PROMPT.format(
                drafts_block=drafts_block,
                categories=", ".join(CATEGORIES_SHORT),
            )
            phash = _prompt_hash(prompt)
            try:
                text, in_tok, out_tok = self._call_claude(
                    prompt, max_tokens=400 * len(batch)
                )
                text = self._extract_json(text)
                results = json.loads(text)
                if not isinstance(results, list):
                    results = [results]
                for j, data in enumerate(results):
                    # BUGFIX: the model can emit non-object array entries;
                    # data.get() then raised AttributeError and killed the run.
                    if not isinstance(data, dict):
                        console.print(f"[yellow]Skipping malformed batch entry {j}[/]")
                        continue
                    draft_name = data.get("name", batch[j].name if j < len(batch) else None)
                    if not draft_name:
                        continue
                    # BUGFIX: malformed score values raised uncaught
                    # ValueError/TypeError; skip just that entry instead.
                    try:
                        rating = self._parse_rating(draft_name, data)
                    except (ValueError, TypeError) as e:
                        console.print(f"[yellow]Skipping {draft_name}: {e}[/]")
                        continue
                    # Cache each result individually under a synthetic hash;
                    # token counts are split evenly across the batch.
                    self.db.cache_response(
                        draft_name, _prompt_hash(f"batch-{phash}-{draft_name}"),
                        self.config.claude_model, f"batch[{i}]", json.dumps(data),
                        in_tok // len(results), out_tok // len(results),
                    )
                    self.db.upsert_rating(rating)
                    draft = self.db.get_draft(draft_name)
                    if draft:
                        draft.categories = rating.categories
                        self.db.upsert_draft(draft)
                    count += 1
            except (json.JSONDecodeError, anthropic.APIError) as e:
                console.print(f"[red]Batch {i//batch_size+1} failed: {e}[/]")
                # Fallback: rate each draft of the failed batch individually.
                for d in batch:
                    r = self.rate_draft(d.name)
                    if r:
                        count += 1
        return count

    def rate_all_unrated(self, limit: int = 300, batch_size: int = 5) -> int:
        """Rate every not-yet-rated draft (up to *limit*), with a progress bar.

        Returns the number of drafts successfully rated and prints the
        cumulative token usage recorded in the LLM cache.
        """
        unrated = self.db.unrated_drafts(limit=limit)
        if not unrated:
            console.print("All drafts already rated.")
            return 0
        console.print(f"Rating [bold]{len(unrated)}[/] drafts in batches of {batch_size}...")
        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Analyzing...", total=len(unrated))
            for i in range(0, len(unrated), batch_size):
                batch = unrated[i:i + batch_size]
                # Show compact draft identifiers in the progress description.
                names = ", ".join(d.name.split("-")[-1][:12] for d in batch)
                progress.update(task, description=f"Batch: {names}")
                count += self.rate_batch(batch, batch_size=batch_size)
                progress.advance(task, advance=len(batch))
        in_tok, out_tok = self.db.total_tokens_used()
        console.print(
            f"Rated [bold green]{count}[/] drafts "
            f"| Total tokens used: {in_tok:,} in + {out_tok:,} out"
        )
        return count

    def compare_drafts(self, draft_names: list[str]) -> str:
        """Compare drafts by abstract; returns Claude's free-form analysis.

        Unknown names are skipped with a warning; at least two known drafts
        are required.
        """
        parts = []
        for name in draft_names:
            draft = self.db.get_draft(name)
            if draft is None:
                console.print(f"[yellow]Skipping unknown draft: {name}[/]")
                continue
            parts.append(f"### {draft.title}\n**{name}**\n{draft.abstract}")
        if len(parts) < 2:
            return "Need at least 2 valid drafts to compare."
        prompt = COMPARE_PROMPT.format(
            drafts_section="\n\n---\n\n".join(parts)
        )
        try:
            text, _, _ = self._call_claude(prompt, max_tokens=2048)
            return text
        except anthropic.APIError as e:
            return f"Error: {e}"

405
src/ietf_analyzer/cli.py Normal file
View File

@@ -0,0 +1,405 @@
"""CLI entry point — all user-facing commands."""
from __future__ import annotations
import click
from rich.console import Console
from rich.table import Table
from .config import Config
from .db import Database
console = Console()
def _get_config() -> Config:
    """Load the persisted configuration (or defaults when none is saved)."""
    return Config.load()
# Root Click group; every subcommand below registers itself via
# @main.command() / @main.group().
@click.group()
@click.version_option(version="0.1.0")
def main():
    """IETF Draft Analyzer — track, categorize, and rate AI/agent Internet-Drafts."""
    pass
# ── fetch ────────────────────────────────────────────────────────────────────
@main.command()
@click.option("--keywords", "-k", multiple=True, help="Extra keywords to search for")
@click.option("--since", "-s", help="Only fetch drafts newer than this date (YYYY-MM-DD)")
@click.option("--download-text/--no-download-text", default=True, help="Download full text of drafts")
def fetch(keywords: tuple[str, ...], since: str | None, download_text: bool):
    """Fetch AI/agent drafts from IETF Datatracker.

    Searches with the configured keyword list plus any extra -k terms,
    upserts metadata into SQLite, then (optionally) downloads full text
    for any draft that doesn't have it yet.
    """
    from .fetcher import Fetcher  # imported lazily inside the command
    cfg = _get_config()
    db = Database(cfg)
    fetcher = Fetcher(cfg)
    # Configured keywords plus any extra -k terms from the command line.
    kw_list = list(cfg.search_keywords)
    if keywords:
        kw_list.extend(keywords)
    try:
        drafts = fetcher.search_drafts(keywords=kw_list, since=since)
        for draft in drafts:
            db.upsert_draft(draft)
        console.print(f"Stored [bold green]{len(drafts)}[/] drafts in database")
        if download_text:
            # Only fetch text for drafts we don't already hold text for.
            missing = db.drafts_without_text()
            if missing:
                console.print(f"Downloading text for [bold]{len(missing)}[/] drafts...")
                texts = fetcher.download_texts(missing)
                for name, text in texts.items():
                    draft = db.get_draft(name)
                    if draft:
                        draft.full_text = text
                        db.upsert_draft(draft)
    finally:
        fetcher.close()
        db.close()
# ── list ─────────────────────────────────────────────────────────────────────
@main.command("list")
@click.option("--limit", "-n", default=30, help="Number of drafts to show")
@click.option("--sort", "-s", default="time DESC", help="Sort order (e.g. 'time DESC', 'name ASC')")
def list_drafts(limit: int, sort: str):
"""List tracked drafts."""
cfg = _get_config()
db = Database(cfg)
try:
drafts = db.list_drafts(limit=limit, order_by=sort)
total = db.count_drafts()
table = Table(title=f"Tracked Drafts ({total} total, showing {len(drafts)})")
table.add_column("Date", style="dim", width=10)
table.add_column("Name", style="cyan", max_width=55)
table.add_column("Title", max_width=50)
table.add_column("Pg", justify="right", width=4)
table.add_column("Text", justify="center", width=4)
table.add_column("Rated", justify="center", width=5)
for d in drafts:
has_text = "\u2713" if d.full_text else ""
rated = "\u2713" if db.get_rating(d.name) else ""
table.add_row(d.date, d.name, d.title[:50], str(d.pages or ""), has_text, rated)
console.print(table)
finally:
db.close()
# ── search ───────────────────────────────────────────────────────────────────
@main.command()
@click.argument("query")
@click.option("--limit", "-n", default=20, help="Max results")
def search(query: str, limit: int):
    """Full-text search across stored drafts."""
    cfg = _get_config()
    db = Database(cfg)
    try:
        hits = db.search_drafts(query, limit=limit)
        if not hits:
            console.print(f"No results for [bold]{query}[/]")
            return
        table = Table(title=f"Search: {query} ({len(hits)} results)")
        table.add_column("Date", style="dim", width=10)
        table.add_column("Name", style="cyan")
        table.add_column("Title")
        for hit in hits:
            table.add_row(hit.date, hit.name, hit.title[:60])
        console.print(table)
    finally:
        db.close()
# ── show ─────────────────────────────────────────────────────────────────────
@main.command()
@click.argument("name")
def show(name: str):
    """Show detailed info for a draft, including its AI rating if present.

    Also writes a per-draft markdown report to disk via Reporter.
    """
    from .reports import Reporter
    cfg = _get_config()
    db = Database(cfg)
    reporter = Reporter(cfg, db)
    try:
        draft = db.get_draft(name)
        if draft is None:
            console.print(f"[red]Draft not found: {name}[/]")
            return
        rating = db.get_rating(name)
        console.print(f"\n[bold]{draft.title}[/]")
        console.print(f"[dim]{draft.name}[/] rev {draft.rev} | {draft.date} | {draft.pages or '?'} pages")
        console.print(f"Group: {draft.group or 'individual'} | {draft.datatracker_url}")
        console.print(f"\n[italic]{draft.abstract}[/]\n")
        if rating:
            console.print("[bold]AI Assessment[/]")
            console.print(f" Score: [bold green]{rating.composite_score:.1f}[/]")
            console.print(f" Summary: {rating.summary}\n")
            table = Table(show_header=True)
            table.add_column("Dimension", width=12)
            table.add_column("Score", justify="center", width=7)
            table.add_column("Notes")
            table.add_row("Novelty", f"{rating.novelty}/5", rating.novelty_note)
            table.add_row("Maturity", f"{rating.maturity}/5", rating.maturity_note)
            table.add_row("Overlap", f"{rating.overlap}/5", rating.overlap_note)
            table.add_row("Momentum", f"{rating.momentum}/5", rating.momentum_note)
            table.add_row("Relevance", f"{rating.relevance}/5", rating.relevance_note)
            console.print(table)
            if rating.categories:
                console.print(f"\nCategories: {', '.join(rating.categories)}")
        else:
            # BUGFIX: this hint was a plain string (no f prefix), so the
            # literal text "{name}" was printed instead of the draft name.
            console.print(f"[dim]Not yet rated — run: ietf analyze {name}[/]")
        # Save detailed report too
        path = reporter.draft_detail(name)
        if path:
            console.print(f"\n[dim]Report saved: {path}[/]")
    finally:
        db.close()
# ── analyze ──────────────────────────────────────────────────────────────────
@main.command()
@click.argument("name", required=False)
@click.option("--all", "analyze_all", is_flag=True, help="Analyze all unrated drafts")
@click.option("--limit", "-n", default=50, help="Max drafts to analyze (with --all)")
def analyze(name: str | None, analyze_all: bool, limit: int):
    """Analyze and rate drafts using Claude.

    With --all, rates up to --limit unrated drafts in batches; with a NAME,
    rates that single draft and prints its per-dimension scores.
    """
    from .analyzer import Analyzer  # imported lazily inside the command
    cfg = _get_config()
    db = Database(cfg)
    analyzer = Analyzer(cfg, db)
    try:
        if analyze_all:
            count = analyzer.rate_all_unrated(limit=limit)
            console.print(f"Analyzed [bold green]{count}[/] drafts")
        elif name:
            rating = analyzer.rate_draft(name)
            if rating:
                console.print(f"\n[bold green]Rating for {name}:[/]")
                console.print(f" Score: {rating.composite_score:.1f}")
                console.print(f" Summary: {rating.summary}")
                console.print(f" Novelty={rating.novelty} Maturity={rating.maturity} "
                              f"Overlap={rating.overlap} Momentum={rating.momentum} "
                              f"Relevance={rating.relevance}")
            else:
                console.print("[red]Analysis failed[/]")
        else:
            # Neither a NAME argument nor --all was given.
            console.print("Provide a draft name or use --all")
    finally:
        db.close()
# ── compare ──────────────────────────────────────────────────────────────────
@main.command()
@click.argument("names", nargs=-1, required=True)
def compare(names: tuple[str, ...]):
    """Compare multiple drafts for overlap and unique contributions."""
    from .analyzer import Analyzer
    cfg = _get_config()
    db = Database(cfg)
    analyzer = Analyzer(cfg, db)
    try:
        console.print(analyzer.compare_drafts(list(names)))
    finally:
        db.close()
# ── embed ────────────────────────────────────────────────────────────────────
@main.command()
def embed():
    """Generate embeddings for all drafts (requires Ollama)."""
    from .embeddings import Embedder
    cfg = _get_config()
    db = Database(cfg)
    try:
        done = Embedder(cfg, db).embed_all_missing()
        console.print(f"Embedded [bold green]{done}[/] drafts")
    finally:
        db.close()
# ── similar ──────────────────────────────────────────────────────────────────
@main.command()
@click.argument("name")
@click.option("--top", "-n", default=10, help="Number of similar drafts to show")
def similar(name: str, top: int):
    """Find drafts most similar to a given draft (embedding similarity)."""
    from .embeddings import Embedder
    cfg = _get_config()
    db = Database(cfg)
    embedder = Embedder(cfg, db)
    try:
        results = embedder.find_similar(name, top_n=top)
        if not results:
            # FIX: was an f-string with no placeholders (lint F541).
            console.print("[yellow]No similar drafts found (need embeddings — run `ietf embed` first)[/]")
            return
        table = Table(title=f"Drafts similar to {name}")
        table.add_column("Similarity", justify="right", width=10)
        table.add_column("Draft", style="cyan")
        table.add_column("Title")
        # results is a sequence of (draft_name, score) pairs.
        for sim_name, score in results:
            draft = db.get_draft(sim_name)
            title = draft.title[:60] if draft else ""
            table.add_row(f"{score:.3f}", sim_name, title)
        console.print(table)
    finally:
        db.close()
# ── clusters ─────────────────────────────────────────────────────────────────
@main.command()
@click.option("--threshold", "-t", default=0.85, help="Similarity threshold for clustering")
def clusters(threshold: float):
    """Find clusters of highly similar (potentially overlapping) drafts."""
    from .embeddings import Embedder
    cfg = _get_config()
    db = Database(cfg)
    embedder = Embedder(cfg, db)
    try:
        found = embedder.find_clusters(threshold=threshold)
        if not found:
            console.print("No clusters found at this threshold.")
            return
        console.print(f"\n[bold]Found {len(found)} clusters[/] (threshold={threshold})\n")
        for idx, members in enumerate(found, 1):
            console.print(f"[bold cyan]Cluster {idx}[/] ({len(members)} drafts):")
            for member in members:
                draft = db.get_draft(member)
                title = draft.title[:60] if draft else ""
                console.print(f" - {member} [dim]{title}[/]")
            console.print()
    finally:
        db.close()
# ── report ───────────────────────────────────────────────────────────────────
# `ietf report ...` subgroup — each subcommand writes one markdown file.
@main.group()
def report():
    """Generate markdown reports."""
    pass
@report.command()
def overview():
    """Overview table of all rated drafts."""
    from .reports import Reporter
    cfg = _get_config()
    db = Database(cfg)
    try:
        saved = Reporter(cfg, db).overview()
        console.print(f"Report saved: [bold]{saved}[/]")
    finally:
        db.close()
@report.command()
def landscape():
    """Category-grouped landscape view."""
    from .reports import Reporter
    cfg = _get_config()
    db = Database(cfg)
    try:
        saved = Reporter(cfg, db).landscape()
        console.print(f"Report saved: [bold]{saved}[/]")
    finally:
        db.close()
@report.command()
@click.option("--days", "-d", default=7, help="Look back N days")
def digest(days: int):
    """What's new digest."""
    from .reports import Reporter
    cfg = _get_config()
    db = Database(cfg)
    try:
        saved = Reporter(cfg, db).digest(since_days=days)
        console.print(f"Report saved: [bold]{saved}[/]")
    finally:
        db.close()
# ── config ───────────────────────────────────────────────────────────────────
@main.command("config")
@click.option("--set", "set_key", nargs=2, help="Set a config key (e.g. --set claude_model claude-opus-4-20250514)")
def config_cmd(set_key: tuple[str, str] | None):
"""Show or modify configuration."""
from dataclasses import asdict
cfg = _get_config()
if set_key:
key, value = set_key
if hasattr(cfg, key):
# Coerce types
current = getattr(cfg, key)
if isinstance(current, float):
value = float(value)
elif isinstance(current, int):
value = int(value)
elif isinstance(current, list):
import json
value = json.loads(value)
setattr(cfg, key, value)
cfg.save()
console.print(f"Set [bold]{key}[/] = {value}")
else:
console.print(f"[red]Unknown config key: {key}[/]")
else:
from dataclasses import asdict
for key, val in asdict(cfg).items():
console.print(f" [bold]{key}:[/] {val}")

View File

@@ -0,0 +1,44 @@
"""Configuration management."""
from __future__ import annotations
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
# Data lives in <repo-root>/data by default (three levels up from this file).
DEFAULT_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "data"
CONFIG_FILE = DEFAULT_DATA_DIR / "config.json"
# Datatracker search terms used when `fetch` runs without extra -k keywords.
DEFAULT_KEYWORDS = [
    "agent",
    "ai-agent",
    "llm",
    "autonomous",
    "machine-learning",
    "artificial-intelligence",
]
@dataclass
class Config:
    """User-tunable settings, persisted as JSON at CONFIG_FILE."""

    data_dir: str = str(DEFAULT_DATA_DIR)
    db_path: str = str(DEFAULT_DATA_DIR / "drafts.db")
    ollama_url: str = "http://localhost:11434"
    ollama_embed_model: str = "nomic-embed-text"
    claude_model: str = "claude-sonnet-4-20250514"
    search_keywords: list[str] = field(default_factory=lambda: list(DEFAULT_KEYWORDS))
    # Only fetch drafts newer than this (ISO date string)
    fetch_since: str = "2024-01-01"
    # Polite delay between API requests (seconds)
    fetch_delay: float = 0.5

    def save(self) -> None:
        """Write the current settings to CONFIG_FILE, creating data_dir first."""
        Path(self.data_dir).mkdir(parents=True, exist_ok=True)
        CONFIG_FILE.write_text(json.dumps(asdict(self), indent=2))

    @classmethod
    def load(cls) -> Config:
        """Load saved settings (ignoring unknown keys); defaults when no file exists."""
        if not CONFIG_FILE.exists():
            return cls()
        raw = json.loads(CONFIG_FILE.read_text())
        known = {k: v for k, v in raw.items() if k in cls.__dataclass_fields__}
        return cls(**known)

375
src/ietf_analyzer/db.py Normal file
View File

@@ -0,0 +1,375 @@
"""SQLite database layer with FTS5 full-text search."""
from __future__ import annotations
import json
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from .config import Config
from .models import Draft, Rating
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
name TEXT PRIMARY KEY,
rev TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
time TEXT,
dt_id INTEGER,
pages INTEGER,
words INTEGER,
"group" TEXT,
group_uri TEXT,
expires TEXT,
ad TEXT,
shepherd TEXT,
states TEXT DEFAULT '[]', -- JSON array
full_text TEXT,
categories TEXT DEFAULT '[]', -- JSON array
tags TEXT DEFAULT '[]', -- JSON array
fetched_at TEXT
);
CREATE TABLE IF NOT EXISTS ratings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
novelty INTEGER NOT NULL,
maturity INTEGER NOT NULL,
overlap INTEGER NOT NULL,
momentum INTEGER NOT NULL,
relevance INTEGER NOT NULL,
summary TEXT NOT NULL DEFAULT '',
novelty_note TEXT DEFAULT '',
maturity_note TEXT DEFAULT '',
overlap_note TEXT DEFAULT '',
momentum_note TEXT DEFAULT '',
relevance_note TEXT DEFAULT '',
categories TEXT DEFAULT '[]', -- JSON array
rated_at TEXT
);
CREATE TABLE IF NOT EXISTS embeddings (
draft_name TEXT PRIMARY KEY REFERENCES drafts(name),
model TEXT NOT NULL,
vector BLOB NOT NULL, -- numpy float32 array as bytes
created_at TEXT
);
CREATE TABLE IF NOT EXISTS llm_cache (
draft_name TEXT NOT NULL,
prompt_hash TEXT NOT NULL,
model TEXT NOT NULL,
request_json TEXT NOT NULL, -- full prompt sent
response_json TEXT NOT NULL, -- raw Claude response
input_tokens INTEGER,
output_tokens INTEGER,
created_at TEXT,
PRIMARY KEY (draft_name, prompt_hash)
);
CREATE VIRTUAL TABLE IF NOT EXISTS drafts_fts USING fts5(
name, title, abstract, full_text,
content='drafts',
content_rowid='rowid'
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_ad AFTER DELETE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
END;
CREATE TRIGGER IF NOT EXISTS drafts_au AFTER UPDATE ON drafts BEGIN
INSERT INTO drafts_fts(drafts_fts, rowid, name, title, abstract, full_text)
VALUES ('delete', old.rowid, old.name, old.title, old.abstract, old.full_text);
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
VALUES (new.rowid, new.name, new.title, new.abstract, new.full_text);
END;
"""
class Database:
    """SQLite persistence layer: drafts, ratings, embeddings, and LLM cache."""

    def __init__(self, config: Config | None = None):
        # The connection is opened lazily by the `conn` property; constructing
        # a Database only resolves the path and ensures its directory exists.
        self.config = config or Config.load()
        self.db_path = self.config.db_path
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._conn: sqlite3.Connection | None = None
    @property
    def conn(self) -> sqlite3.Connection:
        """Lazily open the SQLite connection (WAL mode, FKs on, schema applied)."""
        if self._conn is None:
            self._conn = sqlite3.connect(self.db_path)
            self._conn.row_factory = sqlite3.Row  # name-based column access
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._conn.executescript(SCHEMA)  # idempotent (IF NOT EXISTS)
        return self._conn
def close(self) -> None:
if self._conn:
self._conn.close()
self._conn = None
# --- Drafts ---
    def upsert_draft(self, draft: Draft) -> None:
        """Insert or update a draft row keyed by name.

        List-valued fields are serialized as JSON arrays. full_text uses
        COALESCE so a metadata-only refresh (full_text=None) never clobbers
        previously downloaded text.
        """
        self.conn.execute(
            """INSERT INTO drafts (name, rev, title, abstract, time, dt_id, pages, words,
            "group", group_uri, expires, ad, shepherd, states, full_text, categories, tags, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
            rev=excluded.rev, title=excluded.title, abstract=excluded.abstract,
            time=excluded.time, dt_id=excluded.dt_id, pages=excluded.pages,
            words=excluded.words, "group"=excluded."group", group_uri=excluded.group_uri,
            expires=excluded.expires, ad=excluded.ad, shepherd=excluded.shepherd,
            states=excluded.states,
            full_text=COALESCE(excluded.full_text, full_text),
            categories=excluded.categories, tags=excluded.tags,
            fetched_at=excluded.fetched_at
            """,
            (
                draft.name, draft.rev, draft.title, draft.abstract, draft.time,
                draft.dt_id, draft.pages, draft.words, draft.group, draft.group_uri,
                draft.expires, draft.ad, draft.shepherd,
                json.dumps(draft.states), draft.full_text,
                json.dumps(draft.categories), json.dumps(draft.tags),
                draft.fetched_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()
def get_draft(self, name: str) -> Draft | None:
row = self.conn.execute("SELECT * FROM drafts WHERE name = ?", (name,)).fetchone()
if row is None:
return None
return self._row_to_draft(row)
def list_drafts(
self,
limit: int = 100,
offset: int = 0,
order_by: str = "time DESC",
) -> list[Draft]:
# Sanitize order_by to prevent injection
allowed = {"time", "name", "title", "pages", "words", "fetched_at"}
parts = order_by.split()
col = parts[0] if parts else "time"
direction = parts[1].upper() if len(parts) > 1 else "DESC"
if col not in allowed:
col = "time"
if direction not in ("ASC", "DESC"):
direction = "DESC"
safe_order = f'"{col}" {direction}' if col == "group" else f"{col} {direction}"
rows = self.conn.execute(
f"SELECT * FROM drafts ORDER BY {safe_order} LIMIT ? OFFSET ?",
(limit, offset),
).fetchall()
return [self._row_to_draft(r) for r in rows]
def count_drafts(self) -> int:
return self.conn.execute("SELECT COUNT(*) FROM drafts").fetchone()[0]
def search_drafts(self, query: str, limit: int = 50) -> list[Draft]:
rows = self.conn.execute(
"""SELECT d.* FROM drafts d
JOIN drafts_fts f ON d.rowid = f.rowid
WHERE drafts_fts MATCH ?
ORDER BY rank
LIMIT ?""",
(query, limit),
).fetchall()
return [self._row_to_draft(r) for r in rows]
def drafts_without_text(self, limit: int = 100) -> list[Draft]:
rows = self.conn.execute(
"SELECT * FROM drafts WHERE full_text IS NULL LIMIT ?", (limit,)
).fetchall()
return [self._row_to_draft(r) for r in rows]
# --- Ratings ---
    def upsert_rating(self, rating: Rating) -> None:
        """Insert or replace the rating for a draft (one rating per draft).

        categories is serialized as a JSON array; rated_at defaults to the
        current UTC timestamp when not set on the Rating.
        """
        self.conn.execute(
            """INSERT INTO ratings (draft_name, novelty, maturity, overlap, momentum, relevance,
            summary, novelty_note, maturity_note, overlap_note, momentum_note, relevance_note,
            categories, rated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            novelty=excluded.novelty, maturity=excluded.maturity, overlap=excluded.overlap,
            momentum=excluded.momentum, relevance=excluded.relevance, summary=excluded.summary,
            novelty_note=excluded.novelty_note, maturity_note=excluded.maturity_note,
            overlap_note=excluded.overlap_note, momentum_note=excluded.momentum_note,
            relevance_note=excluded.relevance_note, categories=excluded.categories,
            rated_at=excluded.rated_at
            """,
            (
                rating.draft_name, rating.novelty, rating.maturity, rating.overlap,
                rating.momentum, rating.relevance, rating.summary,
                rating.novelty_note, rating.maturity_note, rating.overlap_note,
                rating.momentum_note, rating.relevance_note,
                json.dumps(rating.categories),
                rating.rated_at or datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()
def get_rating(self, draft_name: str) -> Rating | None:
row = self.conn.execute(
"SELECT * FROM ratings WHERE draft_name = ?", (draft_name,)
).fetchone()
if row is None:
return None
return self._row_to_rating(row)
def unrated_drafts(self, limit: int = 100) -> list[Draft]:
rows = self.conn.execute(
"""SELECT d.* FROM drafts d
LEFT JOIN ratings r ON d.name = r.draft_name
WHERE r.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
return [self._row_to_draft(r) for r in rows]
    def drafts_with_ratings(self, limit: int = 200) -> list[tuple[Draft, Rating]]:
        """Return (draft, rating) pairs, best composite score first.

        NOTE(review): the weight formula in ORDER BY presumably mirrors
        Rating.composite_score — keep the two in sync if either changes.
        Overlap is inverted (6 - overlap) so that low overlap scores higher.
        """
        rows = self.conn.execute(
            """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance,
            r.summary, r.novelty_note, r.maturity_note, r.overlap_note,
            r.momentum_note, r.relevance_note, r.categories as r_categories, r.rated_at
            FROM drafts d
            JOIN ratings r ON d.name = r.draft_name
            ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20
            + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC
            LIMIT ?""",
            (limit,),
        ).fetchall()
        results = []
        for r in rows:
            draft = self._row_to_draft(r)
            # r.categories is aliased to r_categories to avoid colliding with
            # the drafts.categories column in the joined row.
            rating = Rating(
                draft_name=r["draft_name"] if "draft_name" in r.keys() else draft.name,
                novelty=r["novelty"], maturity=r["maturity"], overlap=r["overlap"],
                momentum=r["momentum"], relevance=r["relevance"], summary=r["summary"],
                novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
                overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
                relevance_note=r["relevance_note"],
                categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
                rated_at=r["rated_at"],
            )
            results.append((draft, rating))
        return results
# --- Embeddings ---
    def store_embedding(self, draft_name: str, model: str, vector: np.ndarray) -> None:
        """Insert or replace the embedding for a draft.

        The vector is stored as raw float32 bytes; readers rebuild it with
        np.frombuffer(..., dtype=np.float32).
        """
        self.conn.execute(
            """INSERT INTO embeddings (draft_name, model, vector, created_at)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(draft_name) DO UPDATE SET
            model=excluded.model, vector=excluded.vector, created_at=excluded.created_at
            """,
            (draft_name, model, vector.astype(np.float32).tobytes(),
             datetime.now(timezone.utc).isoformat()),
        )
        self.conn.commit()
def get_embedding(self, draft_name: str) -> np.ndarray | None:
row = self.conn.execute(
"SELECT vector FROM embeddings WHERE draft_name = ?", (draft_name,)
).fetchone()
if row is None:
return None
return np.frombuffer(row["vector"], dtype=np.float32)
def all_embeddings(self) -> dict[str, np.ndarray]:
rows = self.conn.execute("SELECT draft_name, vector FROM embeddings").fetchall()
return {
r["draft_name"]: np.frombuffer(r["vector"], dtype=np.float32)
for r in rows
}
def drafts_without_embeddings(self, limit: int = 500) -> list[str]:
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN embeddings e ON d.name = e.draft_name
WHERE e.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
return [r["name"] for r in rows]
# --- LLM Cache ---
def cache_response(
self, draft_name: str, prompt_hash: str, model: str,
request_json: str, response_json: str,
input_tokens: int = 0, output_tokens: int = 0,
) -> None:
self.conn.execute(
"""INSERT INTO llm_cache (draft_name, prompt_hash, model, request_json,
response_json, input_tokens, output_tokens, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(draft_name, prompt_hash) DO UPDATE SET
model=excluded.model, response_json=excluded.response_json,
input_tokens=excluded.input_tokens, output_tokens=excluded.output_tokens,
created_at=excluded.created_at
""",
(draft_name, prompt_hash, model, request_json, response_json,
input_tokens, output_tokens, datetime.now(timezone.utc).isoformat()),
)
self.conn.commit()
def get_cached_response(self, draft_name: str, prompt_hash: str) -> str | None:
row = self.conn.execute(
"SELECT response_json FROM llm_cache WHERE draft_name = ? AND prompt_hash = ?",
(draft_name, prompt_hash),
).fetchone()
return row["response_json"] if row else None
def total_tokens_used(self) -> tuple[int, int]:
row = self.conn.execute(
"SELECT COALESCE(SUM(input_tokens),0), COALESCE(SUM(output_tokens),0) FROM llm_cache"
).fetchone()
return (row[0], row[1])
# --- Helpers ---
@staticmethod
def _row_to_draft(row: sqlite3.Row) -> Draft:
d = dict(row)
return Draft(
name=d["name"], rev=d["rev"], title=d["title"], abstract=d["abstract"],
time=d["time"], dt_id=d.get("dt_id"), pages=d.get("pages"),
words=d.get("words"), group=d.get("group"), group_uri=d.get("group_uri"),
expires=d.get("expires"), ad=d.get("ad"), shepherd=d.get("shepherd"),
states=json.loads(d.get("states") or "[]"),
full_text=d.get("full_text"),
categories=json.loads(d.get("categories") or "[]"),
tags=json.loads(d.get("tags") or "[]"),
fetched_at=d.get("fetched_at"),
)
@staticmethod
def _row_to_rating(row: sqlite3.Row) -> Rating:
d = dict(row)
return Rating(
draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
summary=d["summary"],
novelty_note=d.get("novelty_note", ""),
maturity_note=d.get("maturity_note", ""),
overlap_note=d.get("overlap_note", ""),
momentum_note=d.get("momentum_note", ""),
relevance_note=d.get("relevance_note", ""),
categories=json.loads(d.get("categories") or "[]"),
rated_at=d.get("rated_at"),
)

View File

@@ -0,0 +1,136 @@
"""Embedding generation via Ollama and similarity computation."""
from __future__ import annotations
import numpy as np
import ollama as ollama_lib
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
from .config import Config
from .db import Database
console = Console()
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
dot = np.dot(a, b)
norm = np.linalg.norm(a) * np.linalg.norm(b)
if norm == 0:
return 0.0
return float(dot / norm)
class Embedder:
    """Generates draft embeddings via Ollama and provides similarity/clustering utilities."""

    def __init__(self, config: Config | None = None, db: Database | None = None):
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        self.client = ollama_lib.Client(host=self.config.ollama_url)

    def embed_text(self, text: str) -> np.ndarray:
        """Generate an embedding for a single text string as a float32 vector."""
        # Truncate to ~8k tokens worth of text (roughly 32k chars)
        truncated = text[:32000]
        resp = self.client.embed(model=self.config.ollama_embed_model, input=truncated)
        return np.array(resp["embeddings"][0], dtype=np.float32)

    def embed_draft(self, draft_name: str) -> np.ndarray | None:
        """Generate and store an embedding for a draft using its abstract + title.

        Returns the stored vector, or None when the draft is not in the database.
        """
        draft = self.db.get_draft(draft_name)
        if draft is None:
            console.print(f"[red]Draft not found: {draft_name}[/]")
            return None
        # Combine title + abstract + beginning of full text for richer embedding
        parts = [draft.title, draft.abstract]
        if draft.full_text:
            # Include first ~4k chars of body
            parts.append(draft.full_text[:4000])
        text = "\n\n".join(p for p in parts if p)
        vec = self.embed_text(text)
        self.db.store_embedding(draft_name, self.config.ollama_embed_model, vec)
        return vec

    def embed_all_missing(self) -> int:
        """Generate embeddings for all drafts that don't have one yet.

        Returns the number successfully generated; individual failures are
        logged and skipped so one bad draft cannot abort the batch.
        """
        missing = self.db.drafts_without_embeddings()
        if not missing:
            console.print("All drafts already have embeddings.")
            return 0
        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Generating embeddings...", total=len(missing))
            for name in missing:
                try:
                    self.embed_draft(name)
                    count += 1
                except Exception as e:
                    # Deliberate best-effort: log the failure and continue the batch.
                    console.print(f"[red]Failed to embed {name}: {e}[/]")
                progress.advance(task)
        console.print(f"Generated [bold green]{count}[/] embeddings")
        return count

    def find_similar(self, draft_name: str, top_n: int = 10) -> list[tuple[str, float]]:
        """Find the *top_n* most similar drafts to a given draft by cosine similarity."""
        target_vec = self.db.get_embedding(draft_name)
        if target_vec is None:
            # Try generating it on the fly
            target_vec = self.embed_draft(draft_name)
            if target_vec is None:
                return []
        all_embeddings = self.db.all_embeddings()
        similarities: list[tuple[str, float]] = []
        for name, vec in all_embeddings.items():
            if name == draft_name:
                continue
            sim = _cosine_similarity(target_vec, vec)
            similarities.append((name, sim))
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_n]

    def similarity_matrix(self) -> tuple[list[str], np.ndarray]:
        """Compute the pairwise cosine-similarity matrix for all embedded drafts.

        Vectorized: rows are L2-normalized once, then the whole matrix is a
        single dot product — replacing the previous O(n^2) Python-level loop.
        Zero-norm vectors normalize to all-zero rows, matching the 0.0
        convention of `_cosine_similarity`.
        """
        all_embeddings = self.db.all_embeddings()
        names = sorted(all_embeddings.keys())
        if not names:
            return names, np.zeros((0, 0), dtype=np.float32)
        stacked = np.stack([all_embeddings[n] for n in names]).astype(np.float32)
        norms = np.linalg.norm(stacked, axis=1, keepdims=True)
        # Guarded division: rows with zero norm become zero rather than NaN.
        unit = np.divide(stacked, norms, out=np.zeros_like(stacked), where=norms != 0)
        matrix = (unit @ unit.T).astype(np.float32)
        return names, matrix

    def find_clusters(self, threshold: float = 0.85) -> list[list[str]]:
        """Find clusters of highly similar drafts using simple greedy clustering.

        Each draft seeds at most one cluster; singleton clusters are dropped.
        """
        names, matrix = self.similarity_matrix()
        if len(names) == 0:
            return []
        visited = set()
        clusters: list[list[str]] = []
        for i, name in enumerate(names):
            if name in visited:
                continue
            cluster = [name]
            visited.add(name)
            for j in range(len(names)):
                if names[j] not in visited and matrix[i, j] >= threshold:
                    cluster.append(names[j])
                    visited.add(names[j])
            if len(cluster) > 1:
                clusters.append(cluster)
        return clusters

View File

@@ -0,0 +1,204 @@
"""Datatracker API client — search, fetch metadata, download full text."""
from __future__ import annotations
import time as time_mod
from datetime import datetime, timezone
import httpx
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
from .config import Config
from .models import Draft
API_BASE = "https://datatracker.ietf.org/api/v1"
TEXT_BASE = "https://www.ietf.org/archive/id"
SEARCH_FIELDS = ("name__contains", "abstract__contains")
console = Console()
class Fetcher:
    """Datatracker API client — keyword search, metadata fetch, full-text download."""

    def __init__(self, config: Config | None = None):
        self.config = config or Config.load()
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        # group URI -> acronym cache so repeated drafts in one group cost one request
        self._group_cache: dict[str, str] = {}

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self.client.close()

    # --- Search & fetch metadata ---
    def search_drafts(
        self,
        keywords: list[str] | None = None,
        since: str | None = None,
        limit_per_keyword: int = 200,
    ) -> list[Draft]:
        """Search for drafts matching keywords. Deduplicates by name.

        *keywords* and *since* default to the configured values. Each keyword
        is searched against both the draft name and the abstract.
        """
        keywords = keywords or self.config.search_keywords
        since = since or self.config.fetch_since
        seen: dict[str, Draft] = {}
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            # Search both name and abstract for each keyword
            searches = [(kw, field) for kw in keywords for field in SEARCH_FIELDS]
            task = progress.add_task("Searching Datatracker...", total=len(searches))
            for kw, search_field in searches:
                progress.update(task, description=f"Searching {search_field.split('__')[0]}: {kw}")
                drafts = self._paginated_search(search_field, kw, since, limit_per_keyword)
                for d in drafts:
                    if d.name not in seen:
                        seen[d.name] = d
                progress.advance(task)
        console.print(f"Found [bold green]{len(seen)}[/] unique drafts")
        return list(seen.values())

    def _paginated_search(
        self,
        search_field: str,
        keyword: str,
        since: str,
        max_results: int,
    ) -> list[Draft]:
        """Page through the document endpoint until *max_results* or the last page.

        Stops early (returning what was collected so far) on any HTTP error.
        """
        results: list[Draft] = []
        offset = 0
        page_size = 100
        while offset < max_results:
            params = {
                "format": "json",
                search_field: keyword,
                "time__gte": since,
                "type__slug": "draft",
                "limit": min(page_size, max_results - offset),
                "offset": offset,
            }
            try:
                resp = self.client.get(f"{API_BASE}/doc/document/", params=params)
                resp.raise_for_status()
            except httpx.HTTPError as e:
                console.print(f"[red]API error: {e}[/]")
                break
            data = resp.json()
            objects = data.get("objects", [])
            if not objects:
                break
            for obj in objects:
                results.append(self._api_obj_to_draft(obj))
            offset += len(objects)
            if not data.get("meta", {}).get("next"):
                break
            # Be polite between pages
            time_mod.sleep(self.config.fetch_delay)
        return results

    def fetch_draft(self, name: str) -> Draft | None:
        """Fetch a single draft by name. Returns None on any HTTP error."""
        try:
            resp = self.client.get(
                f"{API_BASE}/doc/document/{name}/", params={"format": "json"}
            )
            resp.raise_for_status()
            return self._api_obj_to_draft(resp.json())
        except httpx.HTTPError as e:
            console.print(f"[red]Error fetching {name}: {e}[/]")
            return None

    # --- Full text ---
    def download_full_text(self, draft: Draft) -> str | None:
        """Download the plain text of a draft.

        Falls back to the revision-less archive URL when the revisioned URL
        fails; returns None when both attempts fail.
        """
        url = draft.text_url
        try:
            resp = self.client.get(url)
            resp.raise_for_status()
            return resp.text
        except httpx.HTTPError:
            # Try without revision if it fails
            try:
                alt_url = f"{TEXT_BASE}/{draft.name}.txt"
                resp = self.client.get(alt_url)
                resp.raise_for_status()
                return resp.text
            except httpx.HTTPError as e:
                console.print(f"[dim]Could not download text for {draft.name}: {e}[/]")
                return None

    def download_texts(self, drafts: list[Draft]) -> dict[str, str]:
        """Download full text for multiple drafts. Returns {name: text}."""
        results: dict[str, str] = {}
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Downloading draft texts...", total=len(drafts))
            for draft in drafts:
                text = self.download_full_text(draft)
                if text:
                    results[draft.name] = text
                progress.advance(task)
                time_mod.sleep(self.config.fetch_delay)
        console.print(f"Downloaded [bold green]{len(results)}[/] / {len(drafts)} texts")
        return results

    # --- Group resolution ---
    def resolve_group(self, group_uri: str) -> str:
        """Resolve a group API URI to a group acronym/name (cached).

        Returns the empty string for a blank URI or on HTTP failure.
        """
        if not group_uri:
            return ""
        if group_uri in self._group_cache:
            return self._group_cache[group_uri]
        try:
            resp = self.client.get(
                f"https://datatracker.ietf.org{group_uri}", params={"format": "json"}
            )
            resp.raise_for_status()
            # Parse the body once (previously resp.json() was called twice).
            data = resp.json()
            name = data.get("acronym", data.get("name", ""))
            self._group_cache[group_uri] = name
            time_mod.sleep(self.config.fetch_delay)
            return name
        except httpx.HTTPError:
            return ""

    # --- Helpers ---
    def _api_obj_to_draft(self, obj: dict) -> Draft:
        """Map a raw Datatracker document JSON object to a Draft.

        The API may return null for string fields, so they are coerced with
        `or` before any string method is called — previously a null abstract
        raised AttributeError on .strip().
        """
        return Draft(
            name=obj.get("name") or "",
            rev=obj.get("rev") or "00",
            title=obj.get("title") or "",
            abstract=(obj.get("abstract") or "").strip(),
            time=obj.get("time") or "",
            dt_id=obj.get("id"),
            pages=obj.get("pages"),
            words=obj.get("words"),
            group=None,  # Resolved lazily
            group_uri=obj.get("group", ""),
            expires=obj.get("expires"),
            ad=obj.get("ad"),
            shepherd=obj.get("shepherd"),
            states=[s for s in (obj.get("states") or []) if isinstance(s, str)],
            fetched_at=datetime.now(timezone.utc).isoformat(),
        )

View File

@@ -0,0 +1,72 @@
"""Data models for drafts, ratings, and categories."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class Draft:
    """A tracked Internet-Draft plus locally attached analysis metadata."""

    name: str  # e.g. "draft-zheng-dispatch-agent-identity-management"
    rev: str  # e.g. "00"
    title: str
    abstract: str
    time: str  # ISO datetime from API
    dt_id: int | None = None  # Datatracker document ID
    pages: int | None = None
    words: int | None = None
    group: str | None = None  # Working group name (resolved)
    group_uri: str | None = None  # Raw API URI
    expires: str | None = None
    ad: str | None = None  # Area director URI
    shepherd: str | None = None
    states: list[str] = field(default_factory=list)
    full_text: str | None = None
    categories: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    fetched_at: str | None = None

    @property
    def text_url(self) -> str:
        """Archive URL for this revision's plain-text rendering."""
        return f"https://www.ietf.org/archive/id/{self.name}-{self.rev}.txt"

    @property
    def datatracker_url(self) -> str:
        """Landing page for this draft on the IETF Datatracker."""
        return f"https://datatracker.ietf.org/doc/{self.name}/"

    @property
    def date(self) -> str:
        """Return just the date portion of time."""
        return self.time[:10] if self.time else ""
@dataclass
class Rating:
    """Claude-produced assessment of one draft across five 1-5 dimensions."""

    draft_name: str
    novelty: int  # 1-5
    maturity: int  # 1-5
    overlap: int  # 1-5 (5 = highly overlapping with others)
    momentum: int  # 1-5
    relevance: int  # 1-5
    summary: str  # 2-4 sentence AI summary
    novelty_note: str = ""
    maturity_note: str = ""
    overlap_note: str = ""
    momentum_note: str = ""
    relevance_note: str = ""
    categories: list[str] = field(default_factory=list)
    rated_at: str | None = None

    @property
    def composite_score(self) -> float:
        """Weighted composite: novelty and relevance matter most."""
        weighted = (
            (self.novelty, 0.30),
            (self.relevance, 0.25),
            (self.maturity, 0.20),
            (self.momentum, 0.15),
            (6 - self.overlap, 0.10),  # Invert: less overlap = better
        )
        return sum(score * weight for score, weight in weighted)

View File

@@ -0,0 +1,177 @@
"""Markdown report generation."""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from .config import Config
from .db import Database
from .models import Draft, Rating
STAR = {1: "\u2581", 2: "\u2583", 3: "\u2585", 4: "\u2587", 5: "\u2588"}
def _bar(score: int) -> str:
return STAR.get(score, "?")
def _score_str(rating: Rating) -> str:
return f"{rating.composite_score:.1f}"
class Reporter:
    """Renders markdown reports (overview, landscape, draft detail, digest).

    Every report is written as UTF-8 into <data_dir>/reports and each method
    returns the path of the written file.
    """

    def __init__(self, config: Config | None = None, db: Database | None = None):
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        self.output_dir = Path(self.config.data_dir) / "reports"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _write(self, filename: str, report: str) -> str:
        """Write *report* under the reports dir and return its path.

        Explicit UTF-8 matters: reports contain non-ASCII glyphs (block chars,
        em-dashes) that can break write_text's locale-default encoding.
        """
        path = self.output_dir / filename
        path.write_text(report, encoding="utf-8")
        return str(path)

    def overview(self) -> str:
        """Generate a sortable overview table of all rated drafts."""
        pairs = self.db.drafts_with_ratings()
        total = self.db.count_drafts()
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        lines = [
            "# IETF AI/Agent Draft Overview",
            # " — " separator restored to match the other report headers.
            f"*Generated {now} — {len(pairs)} rated / {total} tracked drafts*\n",
            "| Score | Draft | Date | N | M | O | Mom | R | Summary |",
            "|------:|-------|------|:-:|:-:|:-:|:---:|:-:|---------|",
        ]
        for draft, rating in pairs:
            name_link = f"[{draft.name}]({draft.datatracker_url})"
            lines.append(
                f"| {_score_str(rating)} | {name_link} | {draft.date} "
                f"| {_bar(rating.novelty)} | {_bar(rating.maturity)} "
                f"| {_bar(rating.overlap)} | {_bar(rating.momentum)} "
                f"| {_bar(rating.relevance)} | {rating.summary[:80]}... |"
            )
        lines.append("\n*N=Novelty, M=Maturity, O=Overlap, Mom=Momentum, R=Relevance (block height = score 1-5)*")
        return self._write("overview.md", "\n".join(lines))

    def landscape(self) -> str:
        """Generate a category-grouped landscape report."""
        pairs = self.db.drafts_with_ratings()
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        # Group by category; a draft appears once per category it belongs to
        by_cat: dict[str, list[tuple[Draft, Rating]]] = {}
        for draft, rating in pairs:
            cats = rating.categories or ["Uncategorized"]
            for cat in cats:
                by_cat.setdefault(cat, []).append((draft, rating))
        lines = [
            "# IETF AI/Agent Draft Landscape",
            f"*Generated {now}*\n",
        ]
        for cat in sorted(by_cat.keys()):
            items = by_cat[cat]
            items.sort(key=lambda x: x[1].composite_score, reverse=True)
            lines.append(f"\n## {cat} ({len(items)} drafts)\n")
            for draft, rating in items:
                lines.append(
                    f"- **[{draft.name}]({draft.datatracker_url})** "
                    f"(score: {_score_str(rating)}) — {rating.summary[:100]}"
                )
        return self._write("landscape.md", "\n".join(lines))

    def draft_detail(self, draft_name: str) -> str:
        """Generate a detailed report for a single draft.

        Returns the empty string (and writes nothing) for an unknown draft.
        """
        draft = self.db.get_draft(draft_name)
        if draft is None:
            return ""
        rating = self.db.get_rating(draft_name)
        lines = [
            f"# {draft.title}",
            f"**{draft.name}** rev {draft.rev}\n",
            f"- **Date:** {draft.date}",
            f"- **Pages:** {draft.pages or '?'}",
            f"- **Group:** {draft.group or 'individual'}",
            f"- **Datatracker:** {draft.datatracker_url}",
            f"- **Text:** {draft.text_url}\n",
            f"## Abstract\n{draft.abstract}\n",
        ]
        if rating:
            lines.extend([
                f"## AI Assessment (score: {_score_str(rating)})\n",
                f"**Summary:** {rating.summary}\n",
                "| Dimension | Score | Notes |",
                "|-----------|:-----:|-------|",
                f"| Novelty | {rating.novelty}/5 | {rating.novelty_note} |",
                f"| Maturity | {rating.maturity}/5 | {rating.maturity_note} |",
                f"| Overlap | {rating.overlap}/5 | {rating.overlap_note} |",
                f"| Momentum | {rating.momentum}/5 | {rating.momentum_note} |",
                f"| Relevance | {rating.relevance}/5 | {rating.relevance_note} |",
                f"\n**Categories:** {', '.join(rating.categories) if rating.categories else 'none'}",
            ])
        else:
            lines.append("*Not yet rated — run `ietf analyze` to generate a rating.*")
        return self._write(f"{draft_name}.md", "\n".join(lines))

    def digest(self, since_days: int = 7) -> str:
        """Generate a digest of drafts fetched within the last *since_days* days."""
        from datetime import timedelta
        cutoff = (datetime.now(timezone.utc) - timedelta(days=since_days)).isoformat()
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        # Get recent drafts by fetched_at
        all_drafts = self.db.list_drafts(limit=500, order_by="fetched_at DESC")
        recent = [d for d in all_drafts if d.fetched_at and d.fetched_at >= cutoff]
        lines = [
            "# Weekly Digest — IETF AI/Agent Drafts",
            f"*Generated {now} — showing drafts fetched in last {since_days} days*\n",
            f"**{len(recent)} drafts** in this period\n",
        ]
        if not recent:
            lines.append("No new drafts found. Run `ietf fetch` to update.")
        else:
            # Split into rated and unrated
            rated = []
            unrated = []
            for d in recent:
                r = self.db.get_rating(d.name)
                if r:
                    rated.append((d, r))
                else:
                    unrated.append(d)
            if rated:
                rated.sort(key=lambda x: x[1].composite_score, reverse=True)
                lines.append("## Top Rated New Drafts\n")
                for draft, rating in rated[:10]:
                    lines.append(
                        f"1. **[{draft.name}]({draft.datatracker_url})** "
                        f"(score: {_score_str(rating)}) — {rating.summary[:120]}"
                    )
            if unrated:
                lines.append(f"\n## Awaiting Analysis ({len(unrated)} drafts)\n")
                for d in unrated[:20]:
                    lines.append(f"- [{d.name}]({d.datatracker_url}) — {d.title}")
        return self._write("digest.md", "\n".join(lines))