v0.3.0: Gap-to-Draft pipeline, Living Standards Observatory, blog series

Gap-to-Draft Pipeline (ietf pipeline):
- Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision
- Generator produces outlines + sections using rich context with Claude
- Quality gates: novelty (embedding similarity), references, format, self-rating
- Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE)
- I-D formatter with proper headers, references, 72-char wrapping

Living Standards Observatory (ietf observatory):
- Source abstraction with IETF + W3C fetchers
- 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record
- Static GitHub Pages dashboard (explorer, gap tracker, timeline)
- Weekly CI/CD automation via GitHub Actions

Also includes:
- 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps
- Blog series (8 posts planned), reports, arXiv paper figures
- Agent team infrastructure (CLAUDE.md, scripts, dev journal)
- 5 new DB tables, schema migration, ~15 new query methods

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-04 00:48:57 +01:00
parent be9cf9c5d9
commit d6beb9c0a0
87 changed files with 24471 additions and 401 deletions

View File

@@ -10,7 +10,7 @@ from pathlib import Path
import numpy as np
from .config import Config
from .models import Author, Draft, Rating
from .models import Author, Draft, Rating, normalize_category
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
@@ -117,6 +117,73 @@ CREATE TABLE IF NOT EXISTS gaps (
analyzed_at TEXT
);
-- Cross-references (RFC, draft, BCP references found in draft text)
CREATE TABLE IF NOT EXISTS draft_refs (
draft_name TEXT NOT NULL REFERENCES drafts(name),
ref_type TEXT NOT NULL, -- 'rfc', 'draft', 'bcp'
ref_id TEXT NOT NULL, -- e.g. '8259', 'draft-ietf-httpbis-semantics', 'BCP14'
UNIQUE(draft_name, ref_type, ref_id)
);
CREATE INDEX IF NOT EXISTS idx_draft_refs_ref ON draft_refs(ref_type, ref_id);
-- Generated drafts from gap-to-draft pipeline
CREATE TABLE IF NOT EXISTS generated_drafts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
gap_topic TEXT NOT NULL,
draft_name TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
outline_json TEXT DEFAULT '{}',
sections_json TEXT DEFAULT '[]',
full_text TEXT,
family_name TEXT DEFAULT '',
family_role TEXT DEFAULT '',
version INTEGER DEFAULT 0,
rating_json TEXT DEFAULT '{}',
novelty_score REAL DEFAULT 0.0,
quality_score REAL DEFAULT 0.0,
status TEXT DEFAULT 'draft',
created_at TEXT
);
CREATE TABLE IF NOT EXISTS generation_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
family_name TEXT DEFAULT '',
gap_ids TEXT DEFAULT '[]',
total_input_tokens INTEGER DEFAULT 0,
total_output_tokens INTEGER DEFAULT 0,
model_used TEXT DEFAULT '',
status TEXT DEFAULT 'running',
started_at TEXT,
completed_at TEXT
);
-- Observatory tables
CREATE TABLE IF NOT EXISTS sources (
name TEXT PRIMARY KEY,
last_fetch TEXT,
doc_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS observatory_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_at TEXT NOT NULL,
total_docs INTEGER DEFAULT 0,
new_since_last INTEGER DEFAULT 0,
changed_gaps INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS gap_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id INTEGER REFERENCES observatory_snapshots(id),
gap_topic TEXT NOT NULL,
gap_description TEXT NOT NULL,
severity TEXT DEFAULT 'medium',
status TEXT DEFAULT 'open',
recorded_at TEXT
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
@@ -152,8 +219,23 @@ class Database:
self._conn.execute("PRAGMA journal_mode=WAL")
self._conn.execute("PRAGMA foreign_keys=ON")
self._conn.executescript(SCHEMA)
self._migrate_schema()
return self._conn
def _migrate_schema(self) -> None:
"""Additive migration — add columns if missing."""
cols = {r[1] for r in self._conn.execute("PRAGMA table_info(drafts)").fetchall()}
migrations = [
("source", "TEXT DEFAULT 'ietf'"),
("source_id", "TEXT DEFAULT ''"),
("source_url", "TEXT DEFAULT ''"),
("doc_status", "TEXT DEFAULT ''"),
]
for col, typedef in migrations:
if col not in cols:
self._conn.execute(f"ALTER TABLE drafts ADD COLUMN {col} {typedef}")
self._conn.commit()
def close(self) -> None:
if self._conn:
self._conn.close()
@@ -303,7 +385,7 @@ class Database:
novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
relevance_note=r["relevance_note"],
categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
categories=[normalize_category(c) for c in json.loads(r["r_categories"])] if r["r_categories"] else [],
rated_at=r["rated_at"],
)
results.append((draft, rating))
@@ -503,6 +585,30 @@ class Database:
).fetchall()
return [(r["org_a"], r["org_b"], r["shared"]) for r in rows]
def org_data_raw(self) -> list[tuple[str, int, str]]:
"""Return (affiliation, person_id, draft_name) for all draft_authors with affiliation."""
rows = self.conn.execute(
"SELECT affiliation, person_id, draft_name FROM draft_authors WHERE affiliation != ''"
).fetchall()
return [(r[0], r[1], r[2]) for r in rows]
def author_draft_counts(self) -> dict[int, int]:
"""Return {person_id: draft_count} for all authors."""
rows = self.conn.execute(
"SELECT person_id, COUNT(*) FROM draft_authors GROUP BY person_id"
).fetchall()
return {r[0]: r[1] for r in rows}
def author_draft_sets(self) -> dict[int, set[str]]:
"""Return {person_id: set(draft_names)} for all authors."""
rows = self.conn.execute(
"SELECT person_id, draft_name FROM draft_authors"
).fetchall()
result: dict[int, set[str]] = {}
for r in rows:
result.setdefault(r[0], set()).add(r[1])
return result
# --- Ideas ---
def insert_ideas(self, draft_name: str, ideas: list[dict]) -> None:
@@ -529,7 +635,9 @@ class Database:
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN ideas i ON d.name = i.draft_name
WHERE i.draft_name IS NULL
LEFT JOIN llm_cache lc ON d.name = lc.draft_name
AND lc.request_json LIKE 'batch-ideas[%'
WHERE i.draft_name IS NULL AND lc.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
@@ -565,6 +673,314 @@ class Database:
"category": r["category"], "evidence": r["evidence"],
"severity": r["severity"]} for r in rows]
# --- Refs ---
def insert_refs(self, draft_name: str, refs: list[tuple[str, str]]) -> None:
"""Insert cross-references for a draft. refs = [(ref_type, ref_id), ...]."""
for ref_type, ref_id in refs:
self.conn.execute(
"""INSERT OR IGNORE INTO draft_refs (draft_name, ref_type, ref_id)
VALUES (?, ?, ?)""",
(draft_name, ref_type, ref_id),
)
self.conn.commit()
def get_refs_for_draft(self, draft_name: str) -> list[tuple[str, str]]:
"""Return [(ref_type, ref_id)] for a draft."""
rows = self.conn.execute(
"SELECT ref_type, ref_id FROM draft_refs WHERE draft_name = ?",
(draft_name,),
).fetchall()
return [(r["ref_type"], r["ref_id"]) for r in rows]
def top_referenced(self, ref_type: str = "rfc", limit: int = 30) -> list[tuple[str, int, list[str]]]:
"""Return (ref_id, count, [draft_names]) for most-referenced items."""
rows = self.conn.execute(
"""SELECT ref_id, COUNT(*) as cnt,
GROUP_CONCAT(draft_name, '||') as drafts
FROM draft_refs
WHERE ref_type = ?
GROUP BY ref_id
ORDER BY cnt DESC
LIMIT ?""",
(ref_type, limit),
).fetchall()
return [
(r["ref_id"], r["cnt"], r["drafts"].split("||") if r["drafts"] else [])
for r in rows
]
def drafts_referencing(self, ref_type: str, ref_id: str) -> list[str]:
"""Return draft names that reference a specific RFC/draft/BCP."""
rows = self.conn.execute(
"SELECT draft_name FROM draft_refs WHERE ref_type = ? AND ref_id = ?",
(ref_type, ref_id),
).fetchall()
return [r["draft_name"] for r in rows]
def ref_counts_by_draft(self) -> list[tuple[str, int, int, int]]:
"""Return (draft_name, rfc_count, draft_count, bcp_count) for all drafts with refs."""
rows = self.conn.execute(
"""SELECT draft_name,
SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfcs,
SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as drafts,
SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcps
FROM draft_refs
GROUP BY draft_name
ORDER BY rfcs DESC"""
).fetchall()
return [(r["draft_name"], r["rfcs"], r["drafts"], r["bcps"]) for r in rows]
def drafts_without_refs(self, limit: int = 500) -> list[str]:
"""Return draft names that have full_text but no refs extracted yet."""
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN draft_refs dr ON d.name = dr.draft_name
WHERE d.full_text IS NOT NULL AND dr.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
return [r["name"] for r in rows]
def ref_stats(self) -> dict:
"""Return summary stats for refs table."""
row = self.conn.execute(
"""SELECT COUNT(DISTINCT draft_name) as drafts_with_refs,
COUNT(*) as total_refs,
SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfc_refs,
SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as draft_refs,
SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcp_refs,
COUNT(DISTINCT ref_id) as unique_refs
FROM draft_refs"""
).fetchone()
return dict(row)
# --- Generated Drafts ---
def upsert_generated_draft(self, data: dict) -> int:
    """Insert or update a generated draft. Returns row id.

    A row is identified by the (draft_name, version) pair: if one exists
    it is updated in place (version and created_at are preserved);
    otherwise a new row is inserted with created_at set to now (UTC).

    Required keys in *data*: gap_topic, draft_name, title. Optional keys
    (defaults shown by the .get() calls below): abstract, outline,
    sections, full_text, family_name, family_role, version, rating,
    novelty_score, quality_score, status. The outline/sections/rating
    values are JSON-serialized into their *_json columns.
    """
    now = datetime.now(timezone.utc).isoformat()
    # Upsert key is (draft_name, version); only the row id is needed.
    existing = self.conn.execute(
        "SELECT id FROM generated_drafts WHERE draft_name = ? AND version = ?",
        (data["draft_name"], data.get("version", 0)),
    ).fetchone()
    if existing:
        # Update path: created_at and version deliberately left untouched.
        self.conn.execute(
            """UPDATE generated_drafts SET
            gap_topic=?, title=?, abstract=?, outline_json=?,
            sections_json=?, full_text=?, family_name=?, family_role=?,
            rating_json=?, novelty_score=?, quality_score=?, status=?
            WHERE id=?""",
            (data["gap_topic"], data["title"], data.get("abstract", ""),
             json.dumps(data.get("outline", {})), json.dumps(data.get("sections", [])),
             data.get("full_text"), data.get("family_name", ""),
             data.get("family_role", ""), json.dumps(data.get("rating", {})),
             data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
             data.get("status", "draft"), existing["id"]),
        )
        self.conn.commit()
        return existing["id"]
    else:
        cur = self.conn.execute(
            """INSERT INTO generated_drafts
            (gap_topic, draft_name, title, abstract, outline_json, sections_json,
            full_text, family_name, family_role, version, rating_json,
            novelty_score, quality_score, status, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (data["gap_topic"], data["draft_name"], data["title"],
             data.get("abstract", ""), json.dumps(data.get("outline", {})),
             json.dumps(data.get("sections", [])), data.get("full_text"),
             data.get("family_name", ""), data.get("family_role", ""),
             data.get("version", 0), json.dumps(data.get("rating", {})),
             data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
             data.get("status", "draft"), now),
        )
        self.conn.commit()
        return cur.lastrowid
def get_generated_drafts(self, status: str | None = None) -> list[dict]:
query = "SELECT * FROM generated_drafts"
params: list = []
if status:
query += " WHERE status = ?"
params.append(status)
query += " ORDER BY created_at DESC"
rows = self.conn.execute(query, params).fetchall()
return [dict(r) for r in rows]
def get_generated_draft(self, draft_id: int) -> dict | None:
row = self.conn.execute(
"SELECT * FROM generated_drafts WHERE id = ?", (draft_id,)
).fetchone()
return dict(row) if row else None
def get_family_drafts(self, family_name: str) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM generated_drafts WHERE family_name = ? ORDER BY family_role",
(family_name,),
).fetchall()
return [dict(r) for r in rows]
def log_generation_run(self, data: dict) -> int:
now = datetime.now(timezone.utc).isoformat()
cur = self.conn.execute(
"""INSERT INTO generation_runs
(family_name, gap_ids, total_input_tokens, total_output_tokens,
model_used, status, started_at)
VALUES (?, ?, ?, ?, ?, ?, ?)""",
(data.get("family_name", ""), json.dumps(data.get("gap_ids", [])),
data.get("total_input_tokens", 0), data.get("total_output_tokens", 0),
data.get("model_used", ""), data.get("status", "running"), now),
)
self.conn.commit()
return cur.lastrowid
def update_generation_run(self, run_id: int, **kwargs) -> None:
sets = []
params = []
for k, v in kwargs.items():
sets.append(f"{k} = ?")
params.append(v)
if not sets:
return
params.append(run_id)
self.conn.execute(
f"UPDATE generation_runs SET {', '.join(sets)} WHERE id = ?", params
)
self.conn.commit()
# --- Observatory ---
def upsert_source(self, name: str, doc_count: int = 0) -> None:
now = datetime.now(timezone.utc).isoformat()
self.conn.execute(
"""INSERT INTO sources (name, last_fetch, doc_count)
VALUES (?, ?, ?)
ON CONFLICT(name) DO UPDATE SET last_fetch=excluded.last_fetch, doc_count=excluded.doc_count""",
(name, now, doc_count),
)
self.conn.commit()
def get_source(self, name: str) -> dict | None:
row = self.conn.execute("SELECT * FROM sources WHERE name = ?", (name,)).fetchone()
return dict(row) if row else None
def all_sources(self) -> list[dict]:
rows = self.conn.execute("SELECT * FROM sources ORDER BY name").fetchall()
return [dict(r) for r in rows]
def create_snapshot(self) -> int:
now = datetime.now(timezone.utc).isoformat()
total = self.count_drafts()
# Count new since last snapshot
last = self.conn.execute(
"SELECT snapshot_at FROM observatory_snapshots ORDER BY id DESC LIMIT 1"
).fetchone()
new_count = 0
if last:
new_count = self.conn.execute(
"SELECT COUNT(*) FROM drafts WHERE fetched_at > ?", (last["snapshot_at"],)
).fetchone()[0]
else:
new_count = total
cur = self.conn.execute(
"""INSERT INTO observatory_snapshots (snapshot_at, total_docs, new_since_last, changed_gaps)
VALUES (?, ?, ?, 0)""",
(now, total, new_count),
)
self.conn.commit()
return cur.lastrowid
def record_gap_history(self, snapshot_id: int, gaps: list[dict]) -> None:
now = datetime.now(timezone.utc).isoformat()
for g in gaps:
self.conn.execute(
"""INSERT INTO gap_history (snapshot_id, gap_topic, gap_description, severity, status, recorded_at)
VALUES (?, ?, ?, ?, ?, ?)""",
(snapshot_id, g["topic"], g["description"],
g.get("severity", "medium"), g.get("status", "open"), now),
)
self.conn.commit()
def gap_history_timeline(self) -> list[dict]:
rows = self.conn.execute(
"""SELECT gh.*, os.snapshot_at FROM gap_history gh
JOIN observatory_snapshots os ON gh.snapshot_id = os.id
ORDER BY os.snapshot_at, gh.gap_topic"""
).fetchall()
return [dict(r) for r in rows]
def get_snapshots(self, limit: int = 20) -> list[dict]:
rows = self.conn.execute(
"SELECT * FROM observatory_snapshots ORDER BY id DESC LIMIT ?", (limit,)
).fetchall()
return [dict(r) for r in rows]
def drafts_by_source(self, source: str, limit: int = 500) -> list[Draft]:
    """Drafts originating from *source* (e.g. 'ietf', 'w3c'), newest
    first, capped at *limit*."""
    cursor = self.conn.execute(
        "SELECT * FROM drafts WHERE source = ? ORDER BY time DESC LIMIT ?",
        (source, limit),
    )
    return [self._row_to_draft(row) for row in cursor]
# --- WG/Status ---
def draft_adoption_status(self) -> list[dict]:
"""Return adoption status for all drafts based on naming convention.
Returns list of dicts: {name, title, time, wg_adopted, wg_name, stream}
"""
import re
rows = self.conn.execute(
'SELECT name, title, time FROM drafts'
).fetchall()
results = []
for r in rows:
name = r["name"]
wg_adopted = False
wg_name = ""
stream = "individual"
# Primary signal: draft-ietf-{wg}-* naming convention
m = re.match(r'^draft-ietf-(\w+)-', name)
if m:
wg_adopted = True
wg_name = m.group(1)
stream = "ietf"
elif name.startswith("draft-irtf-"):
m2 = re.match(r'^draft-irtf-(\w+)-', name)
wg_name = m2.group(1) if m2 else ""
stream = "irtf"
results.append({
"name": name,
"title": r["title"],
"time": r["time"],
"wg_adopted": wg_adopted,
"wg_name": wg_name,
"stream": stream,
})
return results
def revision_velocity(self) -> list[dict]:
"""Return revision data for all drafts.
Returns list of dicts: {name, title, time, rev, rev_int}
"""
rows = self.conn.execute(
"SELECT name, title, time, rev FROM drafts"
).fetchall()
return [
{
"name": r["name"],
"title": r["title"],
"time": r["time"],
"rev": r["rev"],
"rev_int": int(r["rev"]) if r["rev"].isdigit() else 0,
}
for r in rows
]
# --- Helpers ---
@staticmethod
@@ -580,11 +996,16 @@ class Database:
categories=json.loads(d.get("categories") or "[]"),
tags=json.loads(d.get("tags") or "[]"),
fetched_at=d.get("fetched_at"),
source=d.get("source", "ietf"),
source_id=d.get("source_id", ""),
source_url=d.get("source_url", ""),
doc_status=d.get("doc_status", ""),
)
@staticmethod
def _row_to_rating(row: sqlite3.Row) -> Rating:
d = dict(row)
raw_cats = json.loads(d.get("categories") or "[]")
return Rating(
draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
@@ -594,6 +1015,6 @@ class Database:
overlap_note=d.get("overlap_note", ""),
momentum_note=d.get("momentum_note", ""),
relevance_note=d.get("relevance_note", ""),
categories=json.loads(d.get("categories") or "[]"),
categories=[normalize_category(c) for c in raw_cats],
rated_at=d.get("rated_at"),
)