v0.3.0: Gap-to-Draft pipeline, Living Standards Observatory, blog series

Gap-to-Draft Pipeline (ietf pipeline):
- Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision
- Generator produces outlines + sections using rich context with Claude
- Quality gates: novelty (embedding similarity), references, format, self-rating
- Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE)
- I-D formatter with proper headers, references, 72-char wrapping

Living Standards Observatory (ietf observatory):
- Source abstraction with IETF + W3C fetchers
- 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record
- Static GitHub Pages dashboard (explorer, gap tracker, timeline)
- Weekly CI/CD automation via GitHub Actions

Also includes:
- 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps
- Blog series (8 posts planned), reports, arXiv paper figures
- Agent team infrastructure (CLAUDE.md, scripts, dev journal)
- 5 new DB tables, schema migration, ~15 new query methods

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-04 00:48:57 +01:00
parent be9cf9c5d9
commit d6beb9c0a0
87 changed files with 24471 additions and 401 deletions

View File

@@ -77,7 +77,7 @@ Abstract: {abstract}
{text_excerpt}
Return 3-8 ideas. Focus on CONCRETE technical contributions, not general statements.
Return 0-8 ideas. Only include CONCRETE, NOVEL technical contributions not restatements of the abstract or general goals. If the draft has no substantive technical ideas (e.g. it is a problem statement, administrative document, or off-topic), return an empty array [].
JSON array only, no fences."""
BATCH_IDEAS_PROMPT = """\
@@ -86,7 +86,7 @@ Per idea: {{"title":"short name","description":"1 sentence","type":"mechanism|pr
{drafts_block}
3-8 ideas per draft. CONCRETE technical contributions only.
0-8 ideas per draft. Only include CONCRETE, NOVEL technical contributions. If a draft has no substantive ideas, map it to an empty array. Do not pad with restatements of the abstract.
Return ONLY a JSON object like {{"draft-name":[...], ...}}, no fences."""
GAP_ANALYSIS_PROMPT = """\
@@ -397,16 +397,16 @@ class Analyzer:
count = 0
for d in drafts:
ideas = results.get(d.name, [])
if not isinstance(ideas, list):
ideas = [ideas] if ideas else []
self.db.cache_response(
d.name, _prompt_hash(f"batch-ideas-{phash}-{d.name}"),
self.config.claude_model_cheap if cheap else self.config.claude_model,
f"batch-ideas[{d.name}]", json.dumps(ideas),
in_tok // len(drafts), out_tok // len(drafts),
)
self.db.insert_ideas(d.name, ideas)
if ideas:
if not isinstance(ideas, list):
ideas = [ideas]
self.db.cache_response(
d.name, _prompt_hash(f"batch-ideas-{phash}-{d.name}"),
self.config.claude_model_cheap if cheap else self.config.claude_model,
f"batch-ideas[{d.name}]", json.dumps(ideas),
in_tok // len(drafts), out_tok // len(drafts),
)
self.db.insert_ideas(d.name, ideas)
count += 1
return count
except (json.JSONDecodeError, anthropic.APIError) as e:

File diff suppressed because it is too large Load Diff

View File

@@ -16,6 +16,12 @@ DEFAULT_KEYWORDS = [
"autonomous",
"machine-learning",
"artificial-intelligence",
"mcp",
"agentic",
"inference",
"generative",
"intelligent",
"aipref",
]
@@ -32,6 +38,15 @@ class Config:
fetch_since: str = "2024-01-01"
# Polite delay between API requests (seconds)
fetch_delay: float = 0.5
# Pipeline
generation_max_tokens: int = 4096
generation_model: str = "" # defaults to claude_model
# Observatory
observatory_sources: list[str] = field(default_factory=lambda: ["ietf"])
dashboard_dir: str = str(DEFAULT_DATA_DIR.parent / "docs")
w3c_groups: list[str] = field(default_factory=lambda: [
"webmachinelearning", "wot", "credentials", "did", "vc"
])
def save(self) -> None:
Path(self.data_dir).mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,981 @@
"""Static dashboard generator for GitHub Pages — Living Standards Observatory."""
from __future__ import annotations
import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from .config import Config
from .db import Database
from .models import Rating
console = None


def _get_console():
    """Return the module-wide rich Console, creating it lazily on first use.

    The ``rich`` import is deferred into the function body so that importing
    this module stays cheap when no console output is ever produced.
    """
    global console
    if console is not None:
        return console
    from rich.console import Console
    console = Console()
    return console
class DashboardGenerator:
"""Generate a static GitHub Pages site under docs/."""
def __init__(self, config: Config | None = None, db: Database | None = None):
self.config = config or Config.load()
self.db = db or Database(self.config)
self.output_dir = Path(self.config.dashboard_dir)
def generate(self) -> str:
"""Generate full static site. Returns path to docs/."""
con = _get_console()
self.output_dir.mkdir(parents=True, exist_ok=True)
(self.output_dir / "observatory").mkdir(exist_ok=True)
(self.output_dir / "data").mkdir(exist_ok=True)
(self.output_dir / "assets").mkdir(exist_ok=True)
con.print("[bold]Generating dashboard...[/]")
self._generate_data_files()
con.print(" [green]OK[/] Data files")
self._generate_style()
con.print(" [green]OK[/] Styles")
self._generate_index()
con.print(" [green]OK[/] Index page")
self._generate_explorer()
con.print(" [green]OK[/] Explorer page")
self._generate_gaps_page()
con.print(" [green]OK[/] Gaps page")
self._generate_timeline_page()
con.print(" [green]OK[/] Timeline page")
con.print(f"\n[bold green]Dashboard generated at {self.output_dir}/[/]")
return str(self.output_dir)
# ── Data files ──────────────────────────────────────────────────────────
    def _generate_data_files(self) -> None:
        """Write JSON data files to docs/data/.

        Emits four files consumed by the static pages' JavaScript:
        observatory.json (headline metrics), drafts.json (rated documents),
        gaps.json (current gaps + history) and timeline.json (monthly counts),
        plus meta.json with generation provenance. Keys in these dicts are a
        contract with the page scripts — do not rename them casually.
        """
        data_dir = self.output_dir / "data"
        # observatory.json — key metrics for the landing-page cards
        total = self.db.count_drafts()
        sources = self.db.all_sources()
        gaps = self.db.all_gaps()
        snapshots = self.db.get_snapshots(limit=1)  # most recent snapshot only
        unrated = len(self.db.unrated_drafts(limit=10000))
        idea_count = self.db.idea_count()
        author_count = self.db.author_count()
        observatory_data = {
            "total_docs": total,
            "sources": {s["name"]: s["doc_count"] for s in sources},
            "gaps_count": len(gaps),
            "unrated": unrated,
            "ideas": idea_count,
            "authors": author_count,
            # None until the first snapshot has been recorded.
            "last_update": snapshots[0]["snapshot_at"] if snapshots else None,
        }
        (data_dir / "observatory.json").write_text(json.dumps(observatory_data, indent=2))
        # drafts.json — all docs with ratings, flattened for the explorer table
        pairs = self.db.drafts_with_ratings(limit=1000)
        drafts_data = []
        for d, r in pairs:
            drafts_data.append({
                "name": d.name,
                "title": d.title,
                "date": d.date,
                "source": d.source or "ietf",
                "url": d.source_url or d.datatracker_url,
                "pages": d.pages or 0,
                "group": d.group or "individual",
                "score": round(r.composite_score, 2),
                "novelty": r.novelty,
                "maturity": r.maturity,
                "overlap": r.overlap,
                "momentum": r.momentum,
                "relevance": r.relevance,
                "categories": r.categories,
                "summary": r.summary,
                "novelty_note": r.novelty_note,
                "maturity_note": r.maturity_note,
                "overlap_note": r.overlap_note,
                "momentum_note": r.momentum_note,
                "relevance_note": r.relevance_note,
                "doc_status": d.doc_status or "",
            })
        (data_dir / "drafts.json").write_text(json.dumps(drafts_data, indent=2))
        # gaps.json — current gaps + history
        gap_history = self.db.gap_history_timeline()
        gaps_data = {
            "current": gaps,
            "history": gap_history,
        }
        (data_dir / "gaps.json").write_text(json.dumps(gaps_data, indent=2))
        # timeline.json — monthly counts by source and category
        all_drafts = self.db.list_drafts(limit=2000, order_by="time ASC")
        # NOTE(review): rating_map comes from the 1000-row `pairs` query while
        # all_drafts may hold up to 2000 rows, so drafts outside the rated set
        # contribute no category counts — confirm this asymmetry is intended.
        rating_map = {d.name: r for d, r in pairs}
        monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
        monthly_source: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for d in all_drafts:
            # YYYY-MM bucket; assumes d.time is ISO-like — TODO confirm.
            month = d.time[:7] if d.time else "unknown"
            src = d.source or "ietf"
            monthly_source[month][src] += 1
            r = rating_map.get(d.name)
            if r:
                for c in r.categories:
                    monthly[month][c] += 1
        months = sorted(set(list(monthly.keys()) + list(monthly_source.keys())))
        all_cats: set[str] = set()
        for mc in monthly.values():
            all_cats.update(mc.keys())
        all_sources_set: set[str] = set()
        for ms in monthly_source.values():
            all_sources_set.update(ms.keys())
        timeline_data = {
            "months": months,
            # dict(...) strips the defaultdict wrapper for clean JSON dumps.
            "by_category": {m: dict(monthly.get(m, {})) for m in months},
            "by_source": {m: dict(monthly_source.get(m, {})) for m in months},
            "categories": sorted(all_cats),
            "sources": sorted(all_sources_set),
        }
        (data_dir / "timeline.json").write_text(json.dumps(timeline_data, indent=2))
        # meta.json — generation provenance (timestamp, version, project name)
        meta = {
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "version": "0.3.0",
            "project": "IETF Living Standards Observatory",
        }
        (data_dir / "meta.json").write_text(json.dumps(meta, indent=2))
# ── Style ───────────────────────────────────────────────────────────────
    def _generate_style(self) -> None:
        """Write the shared stylesheet used by every page to docs/assets/style.css.

        The CSS below is a plain (non-f) string, so braces need no escaping.
        Class names are a contract with the HTML emitted by the page
        generators in this module.
        """
        css = """\
:root {
--bg: #f5f7fa;
--card-bg: #ffffff;
--text: #1a1a2e;
--text-dim: #666;
--accent: #4a6cf7;
--accent-light: rgba(74,108,247,0.1);
--green: #10b981;
--orange: #f59e0b;
--red: #ef4444;
--border: #e5e7eb;
--shadow: 0 1px 4px rgba(0,0,0,0.08);
--radius: 10px;
}
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: var(--bg); color: var(--text);
line-height: 1.5;
}
a { color: var(--accent); text-decoration: none; }
a:hover { text-decoration: underline; }
/* Layout */
.container { max-width: 1200px; margin: 0 auto; padding: 20px; }
.header {
background: var(--card-bg); border-bottom: 1px solid var(--border);
padding: 16px 0; margin-bottom: 24px;
}
.header .container { display: flex; align-items: center; justify-content: space-between; }
.header h1 { font-size: 1.3rem; }
.header nav { display: flex; gap: 20px; font-size: 0.9rem; }
.header nav a { color: var(--text-dim); font-weight: 500; }
.header nav a:hover, .header nav a.active { color: var(--accent); text-decoration: none; }
/* Cards */
.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px; margin-bottom: 24px; }
.card {
background: var(--card-bg); border-radius: var(--radius);
padding: 20px; box-shadow: var(--shadow);
}
.card .label { font-size: 0.8rem; color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.5px; }
.card .value { font-size: 2rem; font-weight: 700; margin-top: 4px; }
.card .sub { font-size: 0.8rem; color: var(--text-dim); margin-top: 4px; }
/* Tables */
.panel {
background: var(--card-bg); border-radius: var(--radius);
box-shadow: var(--shadow); overflow: hidden; margin-bottom: 24px;
}
.panel-header { padding: 16px 20px; border-bottom: 1px solid var(--border); font-weight: 600; }
table { width: 100%; border-collapse: collapse; }
th {
background: #f8f9fb; padding: 10px 12px; text-align: left;
font-size: 0.78rem; color: var(--text-dim); cursor: pointer; user-select: none;
white-space: nowrap; border-bottom: 2px solid var(--border);
}
th:hover { color: var(--accent); }
td { padding: 10px 12px; border-bottom: 1px solid #f0f0f0; font-size: 0.83rem; vertical-align: top; }
tr:hover { background: #fafbff; }
/* Controls */
.controls {
background: var(--card-bg); border-radius: var(--radius);
padding: 16px 20px; margin-bottom: 16px; box-shadow: var(--shadow);
}
.controls-row { display: flex; gap: 16px; align-items: center; flex-wrap: wrap; margin-bottom: 10px; }
.controls-row:last-child { margin-bottom: 0; }
.search-box {
flex: 1; min-width: 250px; padding: 8px 14px;
border: 1px solid var(--border); border-radius: 6px;
font-size: 0.9rem; outline: none;
}
.search-box:focus { border-color: var(--accent); box-shadow: 0 0 0 2px var(--accent-light); }
.slider-group { display: flex; align-items: center; gap: 6px; font-size: 0.8rem; color: var(--text-dim); }
.slider-group input[type=range] { width: 100px; cursor: pointer; }
.slider-val { font-weight: 600; min-width: 24px; text-align: center; }
/* Chips */
.chip-row { display: flex; flex-wrap: wrap; gap: 6px; }
.chip {
display: inline-block; padding: 3px 10px; border-radius: 12px;
font-size: 0.75rem; cursor: pointer; border: 1px solid var(--border);
background: var(--card-bg); transition: all 0.15s; user-select: none;
}
.chip.active { background: var(--accent); color: #fff; border-color: var(--accent); }
.chip:hover { border-color: var(--accent); }
/* Badges */
.score-badge {
display: inline-block; padding: 2px 8px; border-radius: 10px;
font-weight: 600; font-size: 0.8rem;
}
.score-high { background: #d4edda; color: #155724; }
.score-mid { background: #fff3cd; color: #856404; }
.score-low { background: #f8d7da; color: #721c24; }
.cat-badge {
display: inline-block; padding: 1px 7px; border-radius: 8px;
font-size: 0.68rem; margin: 1px 2px; background: #e8eaf6; color: #3949ab;
}
.source-badge {
display: inline-block; padding: 1px 7px; border-radius: 8px;
font-size: 0.68rem; margin: 1px 2px;
}
.source-ietf { background: #e3f2fd; color: #1565c0; }
.source-w3c { background: #fce4ec; color: #c62828; }
/* Severity */
.sev-critical { color: var(--red); font-weight: 600; }
.sev-high { color: var(--orange); font-weight: 600; }
.sev-medium { color: var(--text); }
.sev-low { color: var(--text-dim); }
/* Bar */
.bar { display: inline-block; height: 10px; border-radius: 3px; background: var(--accent); vertical-align: middle; }
/* Detail */
.detail-row td { padding: 12px 20px; background: #f8faff; }
.detail-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; max-width: 800px; }
.detail-item { font-size: 0.82rem; }
.detail-item strong { color: #333; }
.detail-item .note { color: var(--text-dim); font-size: 0.78rem; }
.summary-text { font-size: 0.82rem; color: #444; margin-top: 6px; line-height: 1.4; }
/* Chart container */
.chart-container {
background: var(--card-bg); border-radius: var(--radius);
box-shadow: var(--shadow); padding: 20px; margin-bottom: 24px;
}
/* Gap cards */
.gap-card {
background: var(--card-bg); border-radius: var(--radius);
border-left: 4px solid var(--accent); padding: 16px 20px;
box-shadow: var(--shadow); margin-bottom: 12px;
}
.gap-card h3 { font-size: 0.95rem; margin-bottom: 4px; }
.gap-card p { font-size: 0.83rem; color: var(--text-dim); margin-bottom: 4px; }
.gap-card .meta { font-size: 0.75rem; color: var(--text-dim); }
.gap-card.critical { border-left-color: var(--red); }
.gap-card.high { border-left-color: var(--orange); }
.dim { font-size: 0.75rem; color: var(--text-dim); }
.clickable { cursor: pointer; }
.reset-btn {
padding: 4px 12px; border: 1px solid var(--border); border-radius: 6px;
background: var(--card-bg); cursor: pointer; font-size: 0.78rem; color: var(--text-dim);
}
.reset-btn:hover { border-color: var(--accent); color: var(--accent); }
.result-count { font-size: 0.85rem; color: var(--text-dim); margin: 10px 0 8px; }
/* Timeline bars */
.tl-bar {
display: inline-block; height: 16px; border-radius: 3px;
vertical-align: middle; min-width: 2px;
}
@media (max-width: 768px) {
.cards { grid-template-columns: 1fr 1fr; }
.controls-row { flex-direction: column; align-items: stretch; }
.detail-grid { grid-template-columns: 1fr; }
}
"""
        # Single shared stylesheet; all pages link to it relatively.
        (self.output_dir / "assets" / "style.css").write_text(css)
# ── Shared HTML pieces ──────────────────────────────────────────────────
def _header_html(self, active: str = "") -> str:
def active_cls(page: str) -> str:
return ' class="active"' if page == active else ""
return f"""\
<div class="header">
<div class="container">
<h1>Living Standards Observatory</h1>
<nav>
<a href="../index.html"{active_cls("index")}>Dashboard</a>
<a href="explorer.html"{active_cls("explorer")}>Explorer</a>
<a href="gaps.html"{active_cls("gaps")}>Gaps</a>
<a href="timeline.html"{active_cls("timeline")}>Timeline</a>
</nav>
</div>
</div>"""
    def _index_header_html(self) -> str:
        """Return the page header for the root landing page (docs/index.html).

        Same markup as _header_html but with links relative to the site root
        and "Dashboard" fixed as the active entry.
        """
        return """\
<div class="header">
<div class="container">
<h1>Living Standards Observatory</h1>
<nav>
<a href="index.html" class="active">Dashboard</a>
<a href="observatory/explorer.html">Explorer</a>
<a href="observatory/gaps.html">Gaps</a>
<a href="observatory/timeline.html">Timeline</a>
</nav>
</div>
</div>"""
# ── Index page ──────────────────────────────────────────────────────────
    def _generate_index(self) -> None:
        """Write the landing page (docs/index.html) with the metrics dashboard.

        The page fetches data/observatory.json, data/drafts.json and
        data/gaps.json at view time; the doubled braces in the template
        escape literal braces inside this f-string — only
        {self._index_header_html()} is interpolated.
        """
        html = f"""\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Living Standards Observatory</title>
<link rel="stylesheet" href="assets/style.css">
</head>
<body>
{self._index_header_html()}
<div class="container">
<div class="cards" id="metricsCards">
<div class="card"><div class="label">Total Documents</div><div class="value" id="totalDocs">--</div><div class="sub" id="sourceSub"></div></div>
<div class="card"><div class="label">Standards Bodies</div><div class="value" id="sourceCount">--</div><div class="sub">Active sources</div></div>
<div class="card"><div class="label">Open Gaps</div><div class="value" id="gapCount">--</div><div class="sub">Identified coverage gaps</div></div>
<div class="card"><div class="label">Ideas Extracted</div><div class="value" id="ideaCount">--</div><div class="sub">Technical contributions</div></div>
<div class="card"><div class="label">Authors Tracked</div><div class="value" id="authorCount">--</div><div class="sub">Individual contributors</div></div>
<div class="card"><div class="label">Last Update</div><div class="value" id="lastUpdate" style="font-size:1rem">--</div><div class="sub" id="updateSub"></div></div>
</div>
<div class="panel">
<div class="panel-header">Top Rated Documents</div>
<table>
<thead>
<tr><th>Score</th><th>Document</th><th>Source</th><th>Date</th><th>Categories</th></tr>
</thead>
<tbody id="topDrafts"></tbody>
</table>
</div>
<div class="panel">
<div class="panel-header">Critical &amp; High Severity Gaps</div>
<div id="gapsList" style="padding: 16px;"></div>
</div>
</div>
<script>
function escHtml(s) {{ const d = document.createElement('div'); d.textContent = s || ''; return d.innerHTML; }}
function scoreBadge(s) {{
const cls = s >= 4.0 ? 'score-high' : s >= 3.0 ? 'score-mid' : 'score-low';
return '<span class="score-badge ' + cls + '">' + s.toFixed(1) + '</span>';
}}
async function init() {{
const [obs, drafts, gaps] = await Promise.all([
fetch('data/observatory.json').then(r => r.json()),
fetch('data/drafts.json').then(r => r.json()),
fetch('data/gaps.json').then(r => r.json()),
]);
// Metrics
document.getElementById('totalDocs').textContent = obs.total_docs;
const srcNames = Object.keys(obs.sources || {{}});
document.getElementById('sourceCount').textContent = srcNames.length || 1;
document.getElementById('sourceSub').textContent = srcNames.map(s => s.toUpperCase() + ': ' + (obs.sources[s] || 0)).join(' | ') || '';
document.getElementById('gapCount').textContent = obs.gaps_count;
document.getElementById('ideaCount').textContent = obs.ideas;
document.getElementById('authorCount').textContent = obs.authors;
if (obs.last_update) {{
document.getElementById('lastUpdate').textContent = obs.last_update.substring(0, 10);
}}
// Top drafts
const top = drafts.sort((a, b) => b.score - a.score).slice(0, 15);
const tbody = document.getElementById('topDrafts');
top.forEach(d => {{
const tr = document.createElement('tr');
const srcClass = 'source-' + (d.source || 'ietf');
tr.innerHTML =
'<td>' + scoreBadge(d.score) + '</td>' +
'<td><a href="' + escHtml(d.url) + '" target="_blank">' + escHtml(d.name) + '</a><br><span class="dim">' + escHtml(d.title.substring(0,80)) + '</span></td>' +
'<td><span class="source-badge ' + srcClass + '">' + (d.source || 'ietf').toUpperCase() + '</span></td>' +
'<td class="dim">' + d.date + '</td>' +
'<td>' + d.categories.map(c => '<span class="cat-badge">' + escHtml(c) + '</span>').join('') + '</td>';
tbody.appendChild(tr);
}});
// Gaps
const gapsList = document.getElementById('gapsList');
const critical = (gaps.current || []).filter(g => g.severity === 'critical' || g.severity === 'high');
if (critical.length === 0) {{
gapsList.innerHTML = '<p class="dim">No critical or high severity gaps found.</p>';
}} else {{
critical.forEach(g => {{
const cls = g.severity === 'critical' ? 'critical' : 'high';
gapsList.innerHTML +=
'<div class="gap-card ' + cls + '">' +
'<h3>' + escHtml(g.topic) + '</h3>' +
'<p>' + escHtml(g.description) + '</p>' +
'<div class="meta"><span class="sev-' + g.severity + '">' + g.severity.toUpperCase() + '</span> &middot; ' + escHtml(g.category || '') + '</div>' +
'</div>';
}});
}}
}}
init();
</script>
</body>
</html>"""
        # Landing page lives at the site root, next to assets/ and data/.
        (self.output_dir / "index.html").write_text(html)
# ── Explorer page ───────────────────────────────────────────────────────
    def _generate_explorer(self) -> None:
        """Write the explorer page: a multi-source draft browser with search,
        source/category filters, score sliders, sortable columns and an
        expandable per-draft detail row.

        Reads ../data/drafts.json at view time. Doubled braces escape literal
        braces in this f-string; only {self._header_html("explorer")} is
        interpolated.
        """
        html = f"""\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Explorer - Living Standards Observatory</title>
<link rel="stylesheet" href="../assets/style.css">
</head>
<body>
{self._header_html("explorer")}
<div class="container">
<div class="controls">
<div class="controls-row">
<input type="text" class="search-box" id="searchBox" placeholder="Search by name, title, summary, or keyword...">
<select id="sourceFilter" style="padding:8px;border:1px solid var(--border);border-radius:6px;font-size:0.85rem">
<option value="">All sources</option>
</select>
<div class="slider-group">Min score: <input type="range" id="minScore" min="1" max="5" step="0.1" value="1"><span class="slider-val" id="minScoreVal">1.0</span></div>
<div class="slider-group">Min novelty: <input type="range" id="minNovelty" min="1" max="5" step="1" value="1"><span class="slider-val" id="minNoveltyVal">1</span></div>
<div class="slider-group">Max overlap: <input type="range" id="maxOverlap" min="1" max="5" step="1" value="5"><span class="slider-val" id="maxOverlapVal">5</span></div>
<button class="reset-btn" onclick="resetFilters()">Reset</button>
</div>
<div class="controls-row">
<div class="chip-row" id="catChips"></div>
</div>
</div>
<div class="result-count" id="resultCount"></div>
<table>
<thead>
<tr>
<th onclick="sortBy('score')" width="60">Score <span class="sort-arrow" id="sort-score"></span></th>
<th onclick="sortBy('name')">Draft <span class="sort-arrow" id="sort-name"></span></th>
<th onclick="sortBy('source')" width="60">Src <span class="sort-arrow" id="sort-source"></span></th>
<th onclick="sortBy('date')" width="90">Date <span class="sort-arrow" id="sort-date"></span></th>
<th onclick="sortBy('novelty')" width="30">N</th>
<th onclick="sortBy('maturity')" width="30">M</th>
<th onclick="sortBy('overlap')" width="30">O</th>
<th onclick="sortBy('momentum')" width="30">Mom</th>
<th onclick="sortBy('relevance')" width="30">R</th>
<th>Categories</th>
</tr>
</thead>
<tbody id="tableBody"></tbody>
</table>
</div>
<script>
let DRAFTS = [];
let ALL_CATS = [];
let activeCats = new Set();
let sortField = 'score';
let sortAsc = false;
let expandedRow = null;
function escHtml(s) {{ const d = document.createElement('div'); d.textContent = s || ''; return d.innerHTML; }}
function scoreBadge(s) {{
const cls = s >= 4.0 ? 'score-high' : s >= 3.0 ? 'score-mid' : 'score-low';
return '<span class="score-badge ' + cls + '">' + s.toFixed(1) + '</span>';
}}
function dimBar(v) {{ return '<span class="bar" style="width:' + (v * 12) + 'px"></span> ' + v; }}
const searchBox = document.getElementById('searchBox');
const sourceFilter = document.getElementById('sourceFilter');
const minScore = document.getElementById('minScore');
const minNovelty = document.getElementById('minNovelty');
const maxOverlap = document.getElementById('maxOverlap');
searchBox.oninput = render;
sourceFilter.onchange = render;
minScore.oninput = () => {{ document.getElementById('minScoreVal').textContent = parseFloat(minScore.value).toFixed(1); render(); }};
minNovelty.oninput = () => {{ document.getElementById('minNoveltyVal').textContent = minNovelty.value; render(); }};
maxOverlap.oninput = () => {{ document.getElementById('maxOverlapVal').textContent = maxOverlap.value; render(); }};
function resetFilters() {{
searchBox.value = '';
sourceFilter.value = '';
minScore.value = 1; document.getElementById('minScoreVal').textContent = '1.0';
minNovelty.value = 1; document.getElementById('minNoveltyVal').textContent = '1';
maxOverlap.value = 5; document.getElementById('maxOverlapVal').textContent = '5';
activeCats.clear();
document.querySelectorAll('.chip').forEach(c => c.classList.remove('active'));
sortField = 'score'; sortAsc = false;
render();
}}
function sortBy(field) {{
if (sortField === field) sortAsc = !sortAsc;
else {{ sortField = field; sortAsc = field === 'name' || field === 'date'; }}
render();
}}
function cmp(a, b) {{
let va = a[sortField], vb = b[sortField];
if (typeof va === 'string') return sortAsc ? va.localeCompare(vb) : vb.localeCompare(va);
return sortAsc ? va - vb : vb - va;
}}
function render() {{
const q = searchBox.value.toLowerCase().trim();
const src = sourceFilter.value;
const ms = parseFloat(minScore.value);
const mn = parseInt(minNovelty.value);
const mo = parseInt(maxOverlap.value);
let filtered = DRAFTS.filter(d => {{
if (d.score < ms) return false;
if (d.novelty < mn) return false;
if (d.overlap > mo) return false;
if (src && (d.source || 'ietf') !== src) return false;
if (activeCats.size > 0 && !d.categories.some(c => activeCats.has(c))) return false;
if (q) {{
const hay = (d.name + ' ' + d.title + ' ' + d.summary + ' ' + d.categories.join(' ')).toLowerCase();
const words = q.split(/\\s+/);
if (!words.every(w => hay.includes(w))) return false;
}}
return true;
}});
filtered.sort(cmp);
document.querySelectorAll('.sort-arrow').forEach(el => el.textContent = '');
const arrow = document.getElementById('sort-' + sortField);
if (arrow) arrow.textContent = sortAsc ? '\\u25B2' : '\\u25BC';
const tbody = document.getElementById('tableBody');
tbody.innerHTML = '';
expandedRow = null;
filtered.forEach(d => {{
const tr = document.createElement('tr');
tr.className = 'clickable';
const srcClass = 'source-' + (d.source || 'ietf');
tr.innerHTML =
'<td>' + scoreBadge(d.score) + '</td>' +
'<td style="max-width:300px"><a href="' + escHtml(d.url) + '" target="_blank" onclick="event.stopPropagation()" style="color:var(--accent);font-weight:500">' + escHtml(d.name) + '</a>' +
'<br><span class="dim">' + escHtml(d.title.substring(0, 80)) + '</span></td>' +
'<td><span class="source-badge ' + srcClass + '">' + (d.source || 'ietf').toUpperCase() + '</span></td>' +
'<td class="dim">' + d.date + '</td>' +
'<td>' + dimBar(d.novelty) + '</td>' +
'<td>' + dimBar(d.maturity) + '</td>' +
'<td>' + dimBar(d.overlap) + '</td>' +
'<td>' + dimBar(d.momentum) + '</td>' +
'<td>' + dimBar(d.relevance) + '</td>' +
'<td>' + d.categories.map(c => '<span class="cat-badge">' + escHtml(c) + '</span>').join('') + '</td>';
tr.onclick = () => toggleDetail(tr, d);
tbody.appendChild(tr);
}});
document.getElementById('resultCount').textContent =
'Showing ' + filtered.length + ' of ' + DRAFTS.length + ' drafts';
}}
function toggleDetail(tr, d) {{
if (expandedRow) {{
expandedRow.previousElementSibling?.classList.remove('expanded');
expandedRow.remove();
if (expandedRow._draftName === d.name) {{ expandedRow = null; return; }}
}}
tr.classList.add('expanded');
const detail = document.createElement('tr');
detail.className = 'detail-row';
detail._draftName = d.name;
function detailItem(label, score, note) {{
return '<div class="detail-item"><strong>' + label + ':</strong> ' + score + '/5 ' +
'<span class="bar" style="width:' + (score * 16) + 'px"></span>' +
(note ? '<div class="note">' + escHtml(note) + '</div>' : '') + '</div>';
}}
detail.innerHTML = '<td colspan="10">' +
'<div class="summary-text"><strong>Summary:</strong> ' + escHtml(d.summary) + '</div>' +
'<div class="detail-grid" style="margin-top:10px">' +
detailItem('Novelty', d.novelty, d.novelty_note) +
detailItem('Maturity', d.maturity, d.maturity_note) +
detailItem('Overlap', d.overlap, d.overlap_note) +
detailItem('Momentum', d.momentum, d.momentum_note) +
detailItem('Relevance', d.relevance, d.relevance_note) +
'<div class="detail-item"><strong>Source:</strong> ' + (d.source || 'ietf').toUpperCase() + ' &middot; <strong>Pages:</strong> ' + d.pages + '</div>' +
'</div>' +
'<div style="margin-top:8px"><a href="' + escHtml(d.url) + '" target="_blank" style="color:var(--accent)">Open document \\u2192</a></div>' +
'</td>';
tr.after(detail);
expandedRow = detail;
}}
async function init() {{
DRAFTS = await fetch('../data/drafts.json').then(r => r.json());
// Build categories
const catSet = new Set();
const sources = new Set();
DRAFTS.forEach(d => {{
d.categories.forEach(c => catSet.add(c));
sources.add(d.source || 'ietf');
}});
ALL_CATS = [...catSet].sort();
// Source filter options
sources.forEach(s => {{
const opt = document.createElement('option');
opt.value = s;
opt.textContent = s.toUpperCase();
sourceFilter.appendChild(opt);
}});
// Category chips
const chipBox = document.getElementById('catChips');
ALL_CATS.forEach(cat => {{
const el = document.createElement('span');
el.className = 'chip';
const count = DRAFTS.filter(d => d.categories.includes(cat)).length;
el.innerHTML = escHtml(cat) + '<span style="font-size:0.65rem;opacity:0.7;margin-left:2px">(' + count + ')</span>';
el.onclick = () => {{
if (activeCats.has(cat)) {{ activeCats.delete(cat); el.classList.remove('active'); }}
else {{ activeCats.add(cat); el.classList.add('active'); }}
render();
}};
chipBox.appendChild(el);
}});
render();
}}
init();
</script>
</body>
</html>"""
        # Explorer lives one level down, under docs/observatory/.
        (self.output_dir / "observatory" / "explorer.html").write_text(html)
# ── Gaps page ───────────────────────────────────────────────────────────
    def _generate_gaps_page(self) -> None:
        """Write the gap-tracker page: filterable current gaps plus a history
        table of how gaps evolved across observatory snapshots.

        Reads ../data/gaps.json at view time. Doubled braces escape literal
        braces in this f-string; only {self._header_html("gaps")} is
        interpolated.
        """
        html = f"""\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Gaps - Living Standards Observatory</title>
<link rel="stylesheet" href="../assets/style.css">
</head>
<body>
{self._header_html("gaps")}
<div class="container">
<h2 style="margin-bottom:16px">Coverage Gaps</h2>
<p class="dim" style="margin-bottom:20px">Areas, problems, or technical challenges not adequately addressed by existing standards documents.</p>
<div class="controls">
<div class="controls-row">
<select id="sevFilter" style="padding:8px;border:1px solid var(--border);border-radius:6px;font-size:0.85rem">
<option value="">All severities</option>
<option value="critical">Critical</option>
<option value="high">High</option>
<option value="medium">Medium</option>
<option value="low">Low</option>
</select>
<input type="text" class="search-box" id="gapSearch" placeholder="Filter gaps..." style="max-width:400px">
</div>
</div>
<div id="gapsList"></div>
<h2 style="margin:32px 0 16px">Gap History</h2>
<p class="dim" style="margin-bottom:20px">How gaps have evolved across observatory snapshots.</p>
<div class="panel">
<table>
<thead>
<tr><th>Snapshot</th><th>Topic</th><th>Severity</th><th>Status</th></tr>
</thead>
<tbody id="historyBody"></tbody>
</table>
</div>
</div>
<script>
function escHtml(s) {{ const d = document.createElement('div'); d.textContent = s || ''; return d.innerHTML; }}
let GAPS_DATA = null;
function renderGaps() {{
const sev = document.getElementById('sevFilter').value;
const q = document.getElementById('gapSearch').value.toLowerCase().trim();
const list = document.getElementById('gapsList');
list.innerHTML = '';
let current = GAPS_DATA.current || [];
if (sev) current = current.filter(g => g.severity === sev);
if (q) current = current.filter(g => (g.topic + ' ' + g.description + ' ' + (g.category || '')).toLowerCase().includes(q));
if (current.length === 0) {{
list.innerHTML = '<p class="dim" style="padding:16px">No gaps match the current filters.</p>';
return;
}}
const order = {{'critical': 0, 'high': 1, 'medium': 2, 'low': 3}};
current.sort((a, b) => (order[a.severity] || 2) - (order[b.severity] || 2));
current.forEach(g => {{
const cls = (g.severity === 'critical' || g.severity === 'high') ? g.severity : '';
list.innerHTML +=
'<div class="gap-card ' + cls + '">' +
'<h3>' + escHtml(g.topic) + '</h3>' +
'<p>' + escHtml(g.description) + '</p>' +
'<div class="meta">' +
'<span class="sev-' + g.severity + '">' + (g.severity || 'medium').toUpperCase() + '</span>' +
(g.category ? ' &middot; ' + escHtml(g.category) : '') +
(g.evidence ? '<br><em>' + escHtml(g.evidence) + '</em>' : '') +
'</div></div>';
}});
}}
async function init() {{
GAPS_DATA = await fetch('../data/gaps.json').then(r => r.json());
document.getElementById('sevFilter').onchange = renderGaps;
document.getElementById('gapSearch').oninput = renderGaps;
renderGaps();
// History table
const history = GAPS_DATA.history || [];
const tbody = document.getElementById('historyBody');
if (history.length === 0) {{
tbody.innerHTML = '<tr><td colspan="4" class="dim">No history recorded yet.</td></tr>';
}} else {{
history.slice(-50).reverse().forEach(h => {{
const tr = document.createElement('tr');
tr.innerHTML =
'<td class="dim">' + (h.snapshot_at || h.recorded_at || '').substring(0, 10) + '</td>' +
'<td>' + escHtml(h.gap_topic) + '</td>' +
'<td><span class="sev-' + (h.severity || 'medium') + '">' + (h.severity || 'medium').toUpperCase() + '</span></td>' +
'<td>' + escHtml(h.status || 'open') + '</td>';
tbody.appendChild(tr);
}});
}}
}}
init();
</script>
</body>
</html>"""
        # Gaps page lives one level down, under docs/observatory/.
        (self.output_dir / "observatory" / "gaps.html").write_text(html)
# ── Timeline page ───────────────────────────────────────────────────────
def _generate_timeline_page(self) -> None:
    """Submission timeline across sources.

    Renders a fully static HTML page (no server-side data baked in) and
    writes it to <output_dir>/observatory/timeline.html. At view time the
    page fetches ../data/timeline.json and draws monthly stacked bars,
    toggleable between a by-source and a by-category breakdown.
    """
    # NOTE: inside this f-string, doubled braces {{ }} are literal CSS/JS
    # braces; the only Python interpolation is {self._header_html("timeline")}.
    html = f"""\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Timeline - Living Standards Observatory</title>
<link rel="stylesheet" href="../assets/style.css">
<style>
.tl-row {{ display: flex; align-items: center; gap: 8px; padding: 6px 0; border-bottom: 1px solid #f0f0f0; }}
.tl-month {{ min-width: 80px; font-size: 0.82rem; color: var(--text-dim); font-family: monospace; }}
.tl-bars {{ flex: 1; display: flex; gap: 1px; align-items: center; }}
.tl-count {{ min-width: 30px; text-align: right; font-size: 0.78rem; color: var(--text-dim); }}
.legend {{ display: flex; gap: 16px; flex-wrap: wrap; margin-bottom: 16px; }}
.legend-item {{ display: flex; align-items: center; gap: 4px; font-size: 0.8rem; }}
.legend-swatch {{ width: 14px; height: 14px; border-radius: 3px; }}
.view-toggle {{ display: flex; gap: 8px; margin-bottom: 16px; }}
.view-btn {{ padding: 6px 16px; border: 1px solid var(--border); border-radius: 6px; background: var(--card-bg); cursor: pointer; font-size: 0.82rem; }}
.view-btn.active {{ background: var(--accent); color: #fff; border-color: var(--accent); }}
</style>
</head>
<body>
{self._header_html("timeline")}
<div class="container">
<h2 style="margin-bottom:8px">Submission Timeline</h2>
<p class="dim" style="margin-bottom:20px">Monthly document submissions across standards bodies and categories.</p>
<div class="view-toggle">
<button class="view-btn active" id="btnSource" onclick="setView('source')">By Source</button>
<button class="view-btn" id="btnCategory" onclick="setView('category')">By Category</button>
</div>
<div class="legend" id="legend"></div>
<div class="chart-container" id="timeline"></div>
<div class="panel">
<div class="panel-header">Monthly Totals</div>
<table>
<thead><tr><th>Month</th><th>Total</th><th id="breakdownHeader">By Source</th></tr></thead>
<tbody id="monthTable"></tbody>
</table>
</div>
</div>
<script>
function escHtml(s) {{ const d = document.createElement('div'); d.textContent = s || ''; return d.innerHTML; }}
const COLORS_SOURCE = {{'ietf': '#4a6cf7', 'w3c': '#ef4444', 'ieee': '#10b981', 'other': '#9ca3af'}};
const COLORS_CAT = [
'#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
'#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
'#7C8CF5', '#FF8C69', '#66CDAA', '#BA55D3', '#FFD700',
];
let TL_DATA = null;
let currentView = 'source';
function setView(view) {{
currentView = view;
document.getElementById('btnSource').className = 'view-btn' + (view === 'source' ? ' active' : '');
document.getElementById('btnCategory').className = 'view-btn' + (view === 'category' ? ' active' : '');
document.getElementById('breakdownHeader').textContent = view === 'source' ? 'By Source' : 'By Category';
renderTimeline();
}}
function renderTimeline() {{
if (!TL_DATA) return;
const months = TL_DATA.months;
const isSource = currentView === 'source';
const dataMap = isSource ? TL_DATA.by_source : TL_DATA.by_category;
const keys = isSource ? TL_DATA.sources : TL_DATA.categories;
// Assign colors
const colorMap = {{}};
if (isSource) {{
keys.forEach(k => {{ colorMap[k] = COLORS_SOURCE[k] || '#9ca3af'; }});
}} else {{
keys.forEach((k, i) => {{ colorMap[k] = COLORS_CAT[i % COLORS_CAT.length]; }});
}}
// Max for scaling
let maxTotal = 0;
months.forEach(m => {{
const d = dataMap[m] || {{}};
let t = 0;
keys.forEach(k => {{ t += d[k] || 0; }});
if (t > maxTotal) maxTotal = t;
}});
const scale = maxTotal > 0 ? 500 / maxTotal : 1;
// Legend
const legendEl = document.getElementById('legend');
legendEl.innerHTML = '';
keys.forEach(k => {{
legendEl.innerHTML += '<div class="legend-item"><div class="legend-swatch" style="background:' + colorMap[k] + '"></div>' + escHtml(k) + '</div>';
}});
// Chart
const container = document.getElementById('timeline');
container.innerHTML = '';
months.forEach(m => {{
const d = dataMap[m] || {{}};
let total = 0;
keys.forEach(k => {{ total += d[k] || 0; }});
let barsHtml = '';
keys.forEach(k => {{
const v = d[k] || 0;
if (v > 0) {{
const w = Math.max(v * scale, 2);
barsHtml += '<div class="tl-bar" style="width:' + w + 'px;background:' + colorMap[k] + '" title="' + escHtml(k) + ': ' + v + '"></div>';
}}
}});
container.innerHTML += '<div class="tl-row"><span class="tl-month">' + m + '</span><div class="tl-bars">' + barsHtml + '</div><span class="tl-count">' + total + '</span></div>';
}});
// Table
const tbody = document.getElementById('monthTable');
tbody.innerHTML = '';
[...months].reverse().forEach(m => {{
const d = dataMap[m] || {{}};
let total = 0;
const parts = [];
keys.forEach(k => {{
const v = d[k] || 0;
total += v;
if (v > 0) parts.push(k + ': ' + v);
}});
if (total > 0) {{
const tr = document.createElement('tr');
tr.innerHTML = '<td class="dim">' + m + '</td><td>' + total + '</td><td class="dim">' + parts.join(', ') + '</td>';
tbody.appendChild(tr);
}}
}});
}}
async function init() {{
TL_DATA = await fetch('../data/timeline.json').then(r => r.json());
renderTimeline();
}}
init();
</script>
</body>
</html>"""
    (self.output_dir / "observatory" / "timeline.html").write_text(html)

View File

@@ -10,7 +10,7 @@ from pathlib import Path
import numpy as np
from .config import Config
from .models import Author, Draft, Rating
from .models import Author, Draft, Rating, normalize_category
SCHEMA = """
CREATE TABLE IF NOT EXISTS drafts (
@@ -117,6 +117,73 @@ CREATE TABLE IF NOT EXISTS gaps (
analyzed_at TEXT
);
-- Cross-references (RFC, draft, BCP references found in draft text)
CREATE TABLE IF NOT EXISTS draft_refs (
draft_name TEXT NOT NULL REFERENCES drafts(name),
ref_type TEXT NOT NULL, -- 'rfc', 'draft', 'bcp'
ref_id TEXT NOT NULL, -- e.g. '8259', 'draft-ietf-httpbis-semantics', 'BCP14'
UNIQUE(draft_name, ref_type, ref_id)
);
CREATE INDEX IF NOT EXISTS idx_draft_refs_ref ON draft_refs(ref_type, ref_id);
-- Generated drafts from gap-to-draft pipeline
CREATE TABLE IF NOT EXISTS generated_drafts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
gap_topic TEXT NOT NULL,
draft_name TEXT NOT NULL,
title TEXT NOT NULL,
abstract TEXT NOT NULL DEFAULT '',
outline_json TEXT DEFAULT '{}',
sections_json TEXT DEFAULT '[]',
full_text TEXT,
family_name TEXT DEFAULT '',
family_role TEXT DEFAULT '',
version INTEGER DEFAULT 0,
rating_json TEXT DEFAULT '{}',
novelty_score REAL DEFAULT 0.0,
quality_score REAL DEFAULT 0.0,
status TEXT DEFAULT 'draft',
created_at TEXT
);
CREATE TABLE IF NOT EXISTS generation_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
family_name TEXT DEFAULT '',
gap_ids TEXT DEFAULT '[]',
total_input_tokens INTEGER DEFAULT 0,
total_output_tokens INTEGER DEFAULT 0,
model_used TEXT DEFAULT '',
status TEXT DEFAULT 'running',
started_at TEXT,
completed_at TEXT
);
-- Observatory tables
CREATE TABLE IF NOT EXISTS sources (
name TEXT PRIMARY KEY,
last_fetch TEXT,
doc_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS observatory_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_at TEXT NOT NULL,
total_docs INTEGER DEFAULT 0,
new_since_last INTEGER DEFAULT 0,
changed_gaps INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS gap_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id INTEGER REFERENCES observatory_snapshots(id),
gap_topic TEXT NOT NULL,
gap_description TEXT NOT NULL,
severity TEXT DEFAULT 'medium',
status TEXT DEFAULT 'open',
recorded_at TEXT
);
-- Triggers to keep FTS index in sync
CREATE TRIGGER IF NOT EXISTS drafts_ai AFTER INSERT ON drafts BEGIN
INSERT INTO drafts_fts(rowid, name, title, abstract, full_text)
@@ -152,8 +219,23 @@ class Database:
self._conn.execute("PRAGMA journal_mode=WAL")
self._conn.execute("PRAGMA foreign_keys=ON")
self._conn.executescript(SCHEMA)
self._migrate_schema()
return self._conn
def _migrate_schema(self) -> None:
"""Additive migration — add columns if missing."""
cols = {r[1] for r in self._conn.execute("PRAGMA table_info(drafts)").fetchall()}
migrations = [
("source", "TEXT DEFAULT 'ietf'"),
("source_id", "TEXT DEFAULT ''"),
("source_url", "TEXT DEFAULT ''"),
("doc_status", "TEXT DEFAULT ''"),
]
for col, typedef in migrations:
if col not in cols:
self._conn.execute(f"ALTER TABLE drafts ADD COLUMN {col} {typedef}")
self._conn.commit()
def close(self) -> None:
    """Close the underlying SQLite connection, if one was opened."""
    conn = self._conn
    if not conn:
        return
    conn.close()
@@ -303,7 +385,7 @@ class Database:
novelty_note=r["novelty_note"], maturity_note=r["maturity_note"],
overlap_note=r["overlap_note"], momentum_note=r["momentum_note"],
relevance_note=r["relevance_note"],
categories=json.loads(r["r_categories"]) if r["r_categories"] else [],
categories=[normalize_category(c) for c in json.loads(r["r_categories"])] if r["r_categories"] else [],
rated_at=r["rated_at"],
)
results.append((draft, rating))
@@ -503,6 +585,30 @@ class Database:
).fetchall()
return [(r["org_a"], r["org_b"], r["shared"]) for r in rows]
def org_data_raw(self) -> list[tuple[str, int, str]]:
    """Return (affiliation, person_id, draft_name) for all draft_authors with affiliation."""
    cur = self.conn.execute(
        "SELECT affiliation, person_id, draft_name FROM draft_authors WHERE affiliation != ''"
    )
    return [(affiliation, pid, draft) for affiliation, pid, draft in cur.fetchall()]

def author_draft_counts(self) -> dict[int, int]:
    """Return {person_id: draft_count} for all authors."""
    cur = self.conn.execute(
        "SELECT person_id, COUNT(*) FROM draft_authors GROUP BY person_id"
    )
    return {pid: count for pid, count in cur.fetchall()}

def author_draft_sets(self) -> dict[int, set[str]]:
    """Return {person_id: set(draft_names)} for all authors."""
    cur = self.conn.execute(
        "SELECT person_id, draft_name FROM draft_authors"
    )
    by_author: dict[int, set[str]] = {}
    for pid, draft_name in cur.fetchall():
        bucket = by_author.get(pid)
        if bucket is None:
            by_author[pid] = bucket = set()
        bucket.add(draft_name)
    return by_author
# --- Ideas ---
def insert_ideas(self, draft_name: str, ideas: list[dict]) -> None:
@@ -529,7 +635,9 @@ class Database:
rows = self.conn.execute(
"""SELECT d.name FROM drafts d
LEFT JOIN ideas i ON d.name = i.draft_name
WHERE i.draft_name IS NULL
LEFT JOIN llm_cache lc ON d.name = lc.draft_name
AND lc.request_json LIKE 'batch-ideas[%'
WHERE i.draft_name IS NULL AND lc.draft_name IS NULL
LIMIT ?""",
(limit,),
).fetchall()
@@ -565,6 +673,314 @@ class Database:
"category": r["category"], "evidence": r["evidence"],
"severity": r["severity"]} for r in rows]
# --- Refs ---
def insert_refs(self, draft_name: str, refs: list[tuple[str, str]]) -> None:
    """Insert cross-references for a draft. refs = [(ref_type, ref_id), ...].

    Duplicate (draft, type, id) triples are silently ignored.
    """
    self.conn.executemany(
        """INSERT OR IGNORE INTO draft_refs (draft_name, ref_type, ref_id)
        VALUES (?, ?, ?)""",
        [(draft_name, ref_type, ref_id) for ref_type, ref_id in refs],
    )
    self.conn.commit()

def get_refs_for_draft(self, draft_name: str) -> list[tuple[str, str]]:
    """Return [(ref_type, ref_id)] for a draft."""
    cur = self.conn.execute(
        "SELECT ref_type, ref_id FROM draft_refs WHERE draft_name = ?",
        (draft_name,),
    )
    return [(row["ref_type"], row["ref_id"]) for row in cur.fetchall()]
def top_referenced(self, ref_type: str = "rfc", limit: int = 30) -> list[tuple[str, int, list[str]]]:
    """Return (ref_id, count, [draft_names]) for most-referenced items."""
    rows = self.conn.execute(
        """SELECT ref_id, COUNT(*) as cnt,
        GROUP_CONCAT(draft_name, '||') as drafts
        FROM draft_refs
        WHERE ref_type = ?
        GROUP BY ref_id
        ORDER BY cnt DESC
        LIMIT ?""",
        (ref_type, limit),
    ).fetchall()
    results: list[tuple[str, int, list[str]]] = []
    for row in rows:
        # GROUP_CONCAT yields NULL for empty groups; map that to [].
        names = row["drafts"].split("||") if row["drafts"] else []
        results.append((row["ref_id"], row["cnt"], names))
    return results

def drafts_referencing(self, ref_type: str, ref_id: str) -> list[str]:
    """Return draft names that reference a specific RFC/draft/BCP."""
    cur = self.conn.execute(
        "SELECT draft_name FROM draft_refs WHERE ref_type = ? AND ref_id = ?",
        (ref_type, ref_id),
    )
    return [row["draft_name"] for row in cur.fetchall()]

def ref_counts_by_draft(self) -> list[tuple[str, int, int, int]]:
    """Return (draft_name, rfc_count, draft_count, bcp_count) for all drafts with refs."""
    cur = self.conn.execute(
        """SELECT draft_name,
        SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfcs,
        SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as drafts,
        SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcps
        FROM draft_refs
        GROUP BY draft_name
        ORDER BY rfcs DESC"""
    )
    return [(row["draft_name"], row["rfcs"], row["drafts"], row["bcps"]) for row in cur.fetchall()]
def drafts_without_refs(self, limit: int = 500) -> list[str]:
    """Return draft names that have full_text but no refs extracted yet."""
    cur = self.conn.execute(
        """SELECT d.name FROM drafts d
        LEFT JOIN draft_refs dr ON d.name = dr.draft_name
        WHERE d.full_text IS NOT NULL AND dr.draft_name IS NULL
        LIMIT ?""",
        (limit,),
    )
    return [row["name"] for row in cur.fetchall()]

def ref_stats(self) -> dict:
    """Return summary stats for refs table."""
    summary = self.conn.execute(
        """SELECT COUNT(DISTINCT draft_name) as drafts_with_refs,
        COUNT(*) as total_refs,
        SUM(CASE WHEN ref_type = 'rfc' THEN 1 ELSE 0 END) as rfc_refs,
        SUM(CASE WHEN ref_type = 'draft' THEN 1 ELSE 0 END) as draft_refs,
        SUM(CASE WHEN ref_type = 'bcp' THEN 1 ELSE 0 END) as bcp_refs,
        COUNT(DISTINCT ref_id) as unique_refs
        FROM draft_refs"""
    ).fetchone()
    return dict(summary)
# --- Generated Drafts ---
def upsert_generated_draft(self, data: dict) -> int:
    """Insert or update a generated draft. Returns row id.

    Identity key is (draft_name, version): if a row already exists for that
    pair it is updated in place (created_at and version are deliberately
    left unchanged); otherwise a new row is inserted with created_at = now
    (UTC). JSON-valued inputs (outline, sections, rating) are serialized
    before storage.
    """
    now = datetime.now(timezone.utc).isoformat()
    existing = self.conn.execute(
        "SELECT id FROM generated_drafts WHERE draft_name = ? AND version = ?",
        (data["draft_name"], data.get("version", 0)),
    ).fetchone()
    if existing:
        # Update path: refresh every content field of the existing row.
        self.conn.execute(
            """UPDATE generated_drafts SET
            gap_topic=?, title=?, abstract=?, outline_json=?,
            sections_json=?, full_text=?, family_name=?, family_role=?,
            rating_json=?, novelty_score=?, quality_score=?, status=?
            WHERE id=?""",
            (data["gap_topic"], data["title"], data.get("abstract", ""),
            json.dumps(data.get("outline", {})), json.dumps(data.get("sections", [])),
            data.get("full_text"), data.get("family_name", ""),
            data.get("family_role", ""), json.dumps(data.get("rating", {})),
            data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
            data.get("status", "draft"), existing["id"]),
        )
        self.conn.commit()
        return existing["id"]
    else:
        # Insert path: gap_topic, draft_name and title are required keys.
        cur = self.conn.execute(
            """INSERT INTO generated_drafts
            (gap_topic, draft_name, title, abstract, outline_json, sections_json,
            full_text, family_name, family_role, version, rating_json,
            novelty_score, quality_score, status, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (data["gap_topic"], data["draft_name"], data["title"],
            data.get("abstract", ""), json.dumps(data.get("outline", {})),
            json.dumps(data.get("sections", [])), data.get("full_text"),
            data.get("family_name", ""), data.get("family_role", ""),
            data.get("version", 0), json.dumps(data.get("rating", {})),
            data.get("novelty_score", 0.0), data.get("quality_score", 0.0),
            data.get("status", "draft"), now),
        )
        self.conn.commit()
        return cur.lastrowid
def get_generated_drafts(self, status: str | None = None) -> list[dict]:
    """Return all generated drafts, newest first, optionally filtered by status."""
    sql = "SELECT * FROM generated_drafts"
    args: list = []
    if status:
        sql += " WHERE status = ?"
        args.append(status)
    sql += " ORDER BY created_at DESC"
    return [dict(row) for row in self.conn.execute(sql, args).fetchall()]

def get_generated_draft(self, draft_id: int) -> dict | None:
    """Return a single generated draft by id, or None when absent."""
    row = self.conn.execute(
        "SELECT * FROM generated_drafts WHERE id = ?", (draft_id,)
    ).fetchone()
    if row is None:
        return None
    return dict(row)

def get_family_drafts(self, family_name: str) -> list[dict]:
    """Return all generated drafts of a family, ordered by family_role."""
    cur = self.conn.execute(
        "SELECT * FROM generated_drafts WHERE family_name = ? ORDER BY family_role",
        (family_name,),
    )
    return [dict(row) for row in cur.fetchall()]
def log_generation_run(self, data: dict) -> int:
    """Insert a new generation-run record; returns its row id.

    Missing keys fall back to defaults; started_at is stamped now (UTC)
    and gap_ids is stored as JSON.
    """
    now = datetime.now(timezone.utc).isoformat()
    cur = self.conn.execute(
        """INSERT INTO generation_runs
        (family_name, gap_ids, total_input_tokens, total_output_tokens,
        model_used, status, started_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)""",
        (data.get("family_name", ""), json.dumps(data.get("gap_ids", [])),
        data.get("total_input_tokens", 0), data.get("total_output_tokens", 0),
        data.get("model_used", ""), data.get("status", "running"), now),
    )
    self.conn.commit()
    return cur.lastrowid

def update_generation_run(self, run_id: int, **kwargs) -> None:
    """Update arbitrary columns of a generation run.

    Column names come from the keyword-argument names. Because they are
    interpolated into the SQL text, each name is validated as a plain
    Python identifier first — this closes the SQL-injection/typo hole the
    values-only placeholders cannot cover.

    Raises:
        ValueError: if a keyword name is not a valid identifier.
    """
    if not kwargs:
        return
    sets = []
    params: list = []
    for col, value in kwargs.items():
        if not col.isidentifier():
            raise ValueError(f"invalid column name: {col!r}")
        sets.append(f"{col} = ?")
        params.append(value)
    params.append(run_id)
    self.conn.execute(
        f"UPDATE generation_runs SET {', '.join(sets)} WHERE id = ?", params
    )
    self.conn.commit()
# --- Observatory ---
def upsert_source(self, name: str, doc_count: int = 0) -> None:
    """Create or refresh a source record, stamping last_fetch with now (UTC)."""
    stamp = datetime.now(timezone.utc).isoformat()
    self.conn.execute(
        """INSERT INTO sources (name, last_fetch, doc_count)
        VALUES (?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET last_fetch=excluded.last_fetch, doc_count=excluded.doc_count""",
        (name, stamp, doc_count),
    )
    self.conn.commit()

def get_source(self, name: str) -> dict | None:
    """Return one source record as a dict, or None if unknown."""
    row = self.conn.execute("SELECT * FROM sources WHERE name = ?", (name,)).fetchone()
    if row is None:
        return None
    return dict(row)

def all_sources(self) -> list[dict]:
    """Return every source record, ordered by name."""
    cur = self.conn.execute("SELECT * FROM sources ORDER BY name")
    return [dict(row) for row in cur.fetchall()]
def create_snapshot(self) -> int:
    """Record an observatory snapshot row and return its id.

    new_since_last counts drafts fetched after the previous snapshot's
    timestamp; on the very first snapshot it equals the total doc count.
    """
    now = datetime.now(timezone.utc).isoformat()
    total = self.count_drafts()
    prev = self.conn.execute(
        "SELECT snapshot_at FROM observatory_snapshots ORDER BY id DESC LIMIT 1"
    ).fetchone()
    if prev:
        new_count = self.conn.execute(
            "SELECT COUNT(*) FROM drafts WHERE fetched_at > ?", (prev["snapshot_at"],)
        ).fetchone()[0]
    else:
        new_count = total
    cur = self.conn.execute(
        """INSERT INTO observatory_snapshots (snapshot_at, total_docs, new_since_last, changed_gaps)
        VALUES (?, ?, ?, 0)""",
        (now, total, new_count),
    )
    self.conn.commit()
    return cur.lastrowid
def record_gap_history(self, snapshot_id: int, gaps: list[dict]) -> None:
    """Append one gap_history row per gap, tied to the given snapshot."""
    now = datetime.now(timezone.utc).isoformat()
    self.conn.executemany(
        """INSERT INTO gap_history (snapshot_id, gap_topic, gap_description, severity, status, recorded_at)
        VALUES (?, ?, ?, ?, ?, ?)""",
        [
            (snapshot_id, gap["topic"], gap["description"],
             gap.get("severity", "medium"), gap.get("status", "open"), now)
            for gap in gaps
        ],
    )
    self.conn.commit()

def gap_history_timeline(self) -> list[dict]:
    """Return all gap-history rows joined with their snapshot timestamps."""
    cur = self.conn.execute(
        """SELECT gh.*, os.snapshot_at FROM gap_history gh
        JOIN observatory_snapshots os ON gh.snapshot_id = os.id
        ORDER BY os.snapshot_at, gh.gap_topic"""
    )
    return [dict(row) for row in cur.fetchall()]

def get_snapshots(self, limit: int = 20) -> list[dict]:
    """Return the most recent snapshots, newest first."""
    cur = self.conn.execute(
        "SELECT * FROM observatory_snapshots ORDER BY id DESC LIMIT ?", (limit,)
    )
    return [dict(row) for row in cur.fetchall()]

def drafts_by_source(self, source: str, limit: int = 500) -> list[Draft]:
    """Return drafts from one source, newest first."""
    cur = self.conn.execute(
        "SELECT * FROM drafts WHERE source = ? ORDER BY time DESC LIMIT ?",
        (source, limit),
    )
    return [self._row_to_draft(row) for row in cur.fetchall()]
# --- WG/Status ---
def draft_adoption_status(self) -> list[dict]:
    """Return adoption status for all drafts based on naming convention.

    Returns list of dicts: {name, title, time, wg_adopted, wg_name, stream}.

    Heuristic: draft-ietf-{wg}-* means WG-adopted on the IETF stream;
    draft-irtf-{rg}-* is an IRTF research-group document (not flagged as
    WG-adopted here); anything else is treated as an individual submission.
    """
    import re
    # Compile once, outside the per-draft loop (the original recompiled
    # the same patterns for every row).
    ietf_re = re.compile(r'^draft-ietf-(\w+)-')
    irtf_re = re.compile(r'^draft-irtf-(\w+)-')
    rows = self.conn.execute(
        'SELECT name, title, time FROM drafts'
    ).fetchall()
    results = []
    for r in rows:
        name = r["name"]
        wg_adopted = False
        wg_name = ""
        stream = "individual"
        # Primary signal: draft-ietf-{wg}-* naming convention
        m = ietf_re.match(name)
        if m:
            wg_adopted = True
            wg_name = m.group(1)
            stream = "ietf"
        elif name.startswith("draft-irtf-"):
            m2 = irtf_re.match(name)
            wg_name = m2.group(1) if m2 else ""
            stream = "irtf"
        results.append({
            "name": name,
            "title": r["title"],
            "time": r["time"],
            "wg_adopted": wg_adopted,
            "wg_name": wg_name,
            "stream": stream,
        })
    return results
def revision_velocity(self) -> list[dict]:
    """Return revision data for all drafts.

    Returns list of dicts: {name, title, time, rev, rev_int}.
    rev_int is the revision parsed as an int; non-numeric or NULL revs
    map to 0 (the original raised AttributeError on a NULL rev).
    """
    rows = self.conn.execute(
        "SELECT name, title, time, rev FROM drafts"
    ).fetchall()
    results = []
    for r in rows:
        rev = r["rev"]
        results.append({
            "name": r["name"],
            "title": r["title"],
            "time": r["time"],
            "rev": rev,
            # Guard: rev may be NULL in the DB; treat it like non-numeric.
            "rev_int": int(rev) if rev and rev.isdigit() else 0,
        })
    return results
# --- Helpers ---
@staticmethod
@@ -580,11 +996,16 @@ class Database:
categories=json.loads(d.get("categories") or "[]"),
tags=json.loads(d.get("tags") or "[]"),
fetched_at=d.get("fetched_at"),
source=d.get("source", "ietf"),
source_id=d.get("source_id", ""),
source_url=d.get("source_url", ""),
doc_status=d.get("doc_status", ""),
)
@staticmethod
def _row_to_rating(row: sqlite3.Row) -> Rating:
d = dict(row)
raw_cats = json.loads(d.get("categories") or "[]")
return Rating(
draft_name=d["draft_name"], novelty=d["novelty"], maturity=d["maturity"],
overlap=d["overlap"], momentum=d["momentum"], relevance=d["relevance"],
@@ -594,6 +1015,6 @@ class Database:
overlap_note=d.get("overlap_note", ""),
momentum_note=d.get("momentum_note", ""),
relevance_note=d.get("relevance_note", ""),
categories=json.loads(d.get("categories") or "[]"),
categories=[normalize_category(c) for c in raw_cats],
rated_at=d.get("rated_at"),
)

View File

@@ -5,6 +5,24 @@ from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
# Map old verbose category names to canonical short names
CATEGORY_NORMALIZE: dict[str, str] = {
    "Agent-to-agent communication protocols": "A2A protocols",
    "AI safety / guardrails / alignment": "AI safety/alignment",
    "ML-based traffic management / optimization": "ML traffic mgmt",
    "Autonomous network operations": "Autonomous netops",
    "Identity / authentication for AI agents": "Agent identity/auth",
    "Data formats / semantics for AI interop": "Data formats/interop",
    "Policy / governance / ethical frameworks": "Policy/governance",
    "AI model serving / inference protocols": "Model serving/inference",
    "Agent discovery / registration": "Agent discovery/reg",
}


def normalize_category(cat: str) -> str:
    """Normalize a category name to its canonical short form.

    Categories without a mapping pass through unchanged.
    """
    try:
        return CATEGORY_NORMALIZE[cat]
    except KeyError:
        return cat
@dataclass
class Author:
@@ -36,6 +54,10 @@ class Draft:
categories: list[str] = field(default_factory=list)
tags: list[str] = field(default_factory=list)
fetched_at: str | None = None
source: str = "ietf"
source_id: str = ""
source_url: str = ""
doc_status: str = ""
@property
def text_url(self) -> str:

View File

@@ -0,0 +1,286 @@
"""Observatory — orchestrates periodic update cycles across sources."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from rich.console import Console
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
SpinnerColumn,
TextColumn,
)
from .config import Config
from .db import Database
from .models import Draft
from .sources import get_fetcher
from .sources.base import SourceDocument
console = Console()
def _doc_to_draft(doc: SourceDocument) -> Draft:
    """Convert a SourceDocument to a Draft for DB storage."""
    extra = doc.extra or {}
    sid = doc.source_id
    # dt_id is only meaningful when the source id is purely numeric
    # (IETF datatracker ids); otherwise leave it unset.
    dt_id = int(sid) if sid and sid.isdigit() else None
    return Draft(
        name=doc.name,
        rev=extra.get("rev", "00"),
        title=doc.title,
        abstract=doc.abstract,
        time=doc.time,
        dt_id=dt_id,
        pages=extra.get("pages"),
        words=extra.get("words"),
        group=extra.get("group"),
        group_uri=extra.get("group_uri"),
        expires=extra.get("expires"),
        ad=extra.get("ad"),
        shepherd=extra.get("shepherd"),
        states=extra.get("states", []),
        full_text=doc.full_text,
        fetched_at=datetime.now(timezone.utc).isoformat(),
        source=doc.source,
        source_id=sid,
        source_url=doc.source_url,
        doc_status=doc.doc_status,
    )
class Observatory:
    """Orchestrates the full observatory update cycle.

    Wires together the project Database, the Claude-backed Analyzer
    (lazy-loaded), the Ollama embedder and the per-source fetchers, and
    prints progress to the shared rich console as it goes.
    """

    def __init__(
        self,
        config: Config | None = None,
        db: Database | None = None,
        analyzer=None,
    ):
        # All collaborators are injectable (tests); defaults come from the
        # project Config/Database.
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        self._analyzer = analyzer

    @property
    def analyzer(self):
        """Lazy-load analyzer to avoid Anthropic key requirement for status/diff."""
        if self._analyzer is None:
            from .analyzer import Analyzer
            self._analyzer = Analyzer(self.config, self.db)
        return self._analyzer

    def update(
        self,
        sources: list[str] | None = None,
        full: bool = False,
    ) -> dict:
        """Full update cycle.

        1. Snapshot current state
        2. Fetch from enabled sources (delta by default)
        3. Analyze unrated docs (Claude, with caching)
        4. Embed missing docs (Ollama)
        5. Extract ideas from new docs
        6. Re-run gap analysis if >= 5 new docs
        7. Record gap changes in gap_history
        8. Return summary stats
        """
        sources = sources or self.config.observatory_sources
        stats: dict = {"sources": {}, "new_docs": 0, "analyzed": 0, "embedded": 0, "ideas": 0, "gaps_changed": False}
        # 1. Snapshot current state — done first so "new since last" in the
        # NEXT snapshot is measured against this run.
        console.print("[bold]1/7[/] Creating snapshot...")
        snapshot_id = self.db.create_snapshot()
        # 2. Fetch from enabled sources
        console.print("[bold]2/7[/] Fetching from sources...")
        total_new = 0
        for src_name in sources:
            new_count = self._fetch_source(src_name, full=full)
            stats["sources"][src_name] = new_count
            total_new += new_count
        stats["new_docs"] = total_new
        console.print(f" Fetched [bold green]{total_new}[/] new documents total")
        # 3. Analyze unrated docs
        console.print("[bold]3/7[/] Analyzing unrated documents...")
        analyzed = self.analyzer.rate_all_unrated(limit=200, batch_size=5)
        stats["analyzed"] = analyzed
        # 4. Embed missing docs
        console.print("[bold]4/7[/] Embedding missing documents...")
        embedded = self._embed_missing()
        stats["embedded"] = embedded
        # 5. Extract ideas from new docs
        console.print("[bold]5/7[/] Extracting ideas...")
        ideas = self.analyzer.extract_all_ideas(limit=200, batch_size=5, cheap=True)
        stats["ideas"] = ideas
        # 6. Re-run gap analysis if enough new docs
        if total_new >= 5:
            console.print("[bold]6/7[/] Re-running gap analysis...")
            gaps = self.analyzer.gap_analysis()
            if gaps:
                self.db.record_gap_history(snapshot_id, gaps)
                stats["gaps_changed"] = True
                console.print(f" Found [bold]{len(gaps)}[/] gaps")
        else:
            console.print(f"[bold]6/7[/] Skipping gap analysis ({total_new} < 5 new docs)")
            # Record current gaps unchanged so the history stays continuous.
            current_gaps = self.db.all_gaps()
            if current_gaps:
                self.db.record_gap_history(snapshot_id, current_gaps)
        # 7. Update source records
        console.print("[bold]7/7[/] Updating source records...")
        for src_name in sources:
            count = len(self.db.drafts_by_source(src_name, limit=10000))
            self.db.upsert_source(src_name, doc_count=count)
        console.print("\n[bold green]Observatory update complete![/]")
        console.print(f" New docs: {total_new} | Analyzed: {analyzed} | Embedded: {embedded} | Ideas: {ideas}")
        return stats

    def _fetch_source(self, source_name: str, full: bool = False) -> int:
        """Fetch documents from a single source. Returns count of new docs."""
        fetcher = get_fetcher(source_name, self.config)
        try:
            # Delta fetch: only since last fetch unless full=True
            since = None
            if not full:
                src = self.db.get_source(source_name)
                if src and src.get("last_fetch"):
                    since = src["last_fetch"][:10]  # Date portion only
            docs = fetcher.search(self.config.search_keywords, since=since)
            new_count = 0
            for doc in docs:
                existing = self.db.get_draft(doc.name)
                if existing is None:
                    # NOTE(review): only previously-unseen docs are stored
                    # here so that a metadata-only refetch cannot clobber a
                    # stored draft's full_text — confirm against the
                    # original file's indentation.
                    new_count += 1
                    draft = _doc_to_draft(doc)
                    self.db.upsert_draft(draft)
            # Download text for docs missing it
            missing_text = [
                d for d in docs
                if self.db.get_draft(d.name) and self.db.get_draft(d.name).full_text is None
            ]
            if missing_text:
                console.print(f" Downloading text for {len(missing_text)} {source_name} docs...")
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    MofNCompleteColumn(),
                    console=console,
                ) as progress:
                    task = progress.add_task(f"Downloading {source_name} texts...", total=len(missing_text))
                    for doc in missing_text:
                        text = fetcher.download_text(doc)
                        if text:
                            draft = self.db.get_draft(doc.name)
                            if draft:
                                draft.full_text = text
                                self.db.upsert_draft(draft)
                        progress.advance(task)
            return new_count
        finally:
            # Always release the fetcher's network resources.
            fetcher.close()

    def _embed_missing(self) -> int:
        """Embed documents that don't have embeddings yet."""
        missing = self.db.drafts_without_embeddings(limit=500)
        if not missing:
            console.print(" All documents already embedded.")
            return 0
        try:
            from .embeddings import Embedder
            embedder = Embedder(self.config, self.db)
        except Exception as e:
            # Embeddings are best-effort: a missing Ollama backend degrades
            # gracefully instead of failing the whole update.
            console.print(f" [yellow]Skipping embeddings (Ollama unavailable): {e}[/]")
            return 0
        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Embedding...", total=len(missing))
            for name in missing:
                try:
                    vec = embedder.embed_draft(name)
                    if vec is not None:
                        count += 1
                except Exception:
                    # Per-document failures are swallowed so one bad doc
                    # cannot abort the batch.
                    pass
                progress.advance(task)
        console.print(f" Embedded [bold green]{count}[/] documents")
        return count

    def status(self) -> dict:
        """Current observatory state -- doc counts, sources, last update."""
        total = self.db.count_drafts()
        sources = self.db.all_sources()
        snapshots = self.db.get_snapshots(limit=1)
        gaps = self.db.all_gaps()
        # Count by source
        source_counts = {}
        for src in sources:
            source_counts[src["name"]] = src["doc_count"]
        # Unrated / unembedded
        unrated = len(self.db.unrated_drafts(limit=10000))
        unembedded = len(self.db.drafts_without_embeddings(limit=10000))
        last_update = snapshots[0]["snapshot_at"] if snapshots else None
        return {
            "total_docs": total,
            "sources": source_counts,
            "unrated": unrated,
            "unembedded": unembedded,
            "gaps": len(gaps),
            "last_update": last_update,
            "snapshots": len(self.db.get_snapshots(limit=100)),
        }

    def diff(self, since: str | None = None) -> dict:
        """What changed since a date -- new docs, gap changes."""
        if since is None:
            # Default to last snapshot
            snapshots = self.db.get_snapshots(limit=2)
            if len(snapshots) >= 2:
                since = snapshots[1]["snapshot_at"]
            else:
                # No previous snapshot to diff against: use a floor date so
                # everything counts as new.
                since = "2000-01-01"
        # New docs since date — comparison relies on ISO-8601 strings
        # sorting lexicographically.
        new_docs = self.db.conn.execute(
            "SELECT name, title, source, time FROM drafts WHERE fetched_at > ? ORDER BY time DESC",
            (since,),
        ).fetchall()
        # Gap changes
        gap_timeline = self.db.gap_history_timeline()
        recent_gaps = [g for g in gap_timeline if g.get("recorded_at", "") > since]
        return {
            "since": since,
            "new_docs": [dict(r) for r in new_docs],
            "new_doc_count": len(new_docs),
            "gap_changes": recent_gaps,
        }

291
src/ietf_analyzer/orgs.py Normal file
View File

@@ -0,0 +1,291 @@
"""Organization normalization and team bloc detection."""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass, field
from .db import Database
# Maps raw affiliation strings to canonical org names.
# Built from SELECT DISTINCT affiliation FROM draft_authors.
ORG_ALIASES: dict[str, str] = {
    # Huawei
    "Huawei Technologies": "Huawei",
    "Huawei Technologies Co., Ltd.": "Huawei",
    "Huawei Technologies, Co., Ltd": "Huawei",
    "Huawei Tech": "Huawei",
    "Huawei Canada": "Huawei",
    "Huawei R&D": "Huawei",
    "Huawei Singapore": "Huawei",
    # Cisco
    "Cisco Systems": "Cisco",
    "Cisco Systems, Inc.": "Cisco",
    # Ericsson
    "Ericsson AB": "Ericsson",
    # RISE
    "RISE AB": "RISE",
    "RISE": "RISE",
    # Independent
    "Independent Researcher": "Independent",
    "Unaffiliated": "Independent",
    "Individual Contributor": "Independent",
    # Inria
    "INRIA": "Inria",
    # Google
    "Google LLC": "Google",
    "Google": "Google",
    # Apple
    "Apple Inc": "Apple",
    "Apple, Inc": "Apple",
    "Apple": "Apple",
    # Amazon
    "Amazon Web Services": "Amazon",
    "AWS": "Amazon",
    "Amazon": "Amazon",
    # Siemens
    "Siemens AG": "Siemens",
    # ZTE
    "ZTE": "ZTE Corporation",
    # Telefonica
    "Telefonica I+D": "Telefonica",
    # Deutsche Telekom
    "Deutsche Telecom": "Deutsche Telekom",
    # InterDigital
    "InterDigital Europe Ltd.": "InterDigital Europe",
    # Boeing
    "Boeing Technology Innovation": "Boeing",
    "Boeing Research & Technology": "Boeing",
    # Futurewei
    "Futurewei Technologies USA": "Futurewei",
    "Futurewei": "Futurewei",
    # IBM
    "IBM Research": "IBM",
    "IBM": "IBM",
    # China Telecom
    "China Telecom Research Institute": "China Telecom",
    # Beijing University (multiline variant from Datatracker)
    "Beijing University of Posts and\n Telecommunications": "BUPT",
    "Beijing University of Posts and Telecommunications": "BUPT",
    # AsiaInfo
    "AsiaInfo Technologies (China) Inc.": "AsiaInfo",
    "AsiaInfo Technologies (China) Inc": "AsiaInfo",
    # Dept of CS
    "Department of Computer Science and Engineering": "Department of Computer Science & Engineering",
}

# Common suffixes to strip for fuzzy matching. normalize_org() always tries
# the LONGEST matching suffix first, so compound suffixes such as
# " Co., Ltd." win over their shorter tails (" Ltd.", " Ltd").
_SUFFIXES = [
    ", Inc.", ", Inc", " Inc.", " Inc",
    " LLC", " Ltd.", " Ltd",
    " AB", " GmbH", " Corp",
    " Co., Ltd.", " Co., Ltd",
    " Technologies",
]


def normalize_org(raw: str) -> str:
    """Normalize an affiliation string to a canonical org name.

    Resolution order:
      1. exact match in ORG_ALIASES;
      2. strip one corporate suffix (longest match first) and retry the map;
      3. case-insensitive match of the stripped form against canonical names.

    Unknown affiliations are returned unchanged (whitespace-stripped); an
    empty/blank input yields "".
    """
    raw = raw.strip()
    if not raw:
        return ""
    # Exact match
    if raw in ORG_ALIASES:
        return ORG_ALIASES[raw]
    # Fuzzy: strip one suffix and check again. Longest suffix first --
    # otherwise " Ltd." would fire on "Acme Co., Ltd." and leave a dangling
    # "Acme Co.," that matches nothing.
    stripped = raw
    for suffix in sorted(_SUFFIXES, key=len, reverse=True):
        if stripped.endswith(suffix):
            stripped = stripped[: -len(suffix)].strip()
            break
    if stripped in ORG_ALIASES:
        return ORG_ALIASES[stripped]
    if stripped != raw and stripped:
        # Check if the stripped form matches a canonical name directly
        for canonical in set(ORG_ALIASES.values()):
            if stripped.lower() == canonical.lower():
                return canonical
    return raw
@dataclass
class Bloc:
    """A team of authors who consistently co-author together."""

    members: list[tuple[int, str, str]]  # (person_id, name, normalized_org)
    shared_drafts: int  # drafts where >= 2 members co-author
    primary_org: str
    cohesion: float  # avg pairwise cohesion

    @property
    def member_pids(self) -> set[int]:
        """Person ids of every member, as a set."""
        return {member[0] for member in self.members}

    @property
    def label(self) -> str:
        """Human-readable tag, e.g. "Huawei team (4)"."""
        return f"{self.primary_org} team ({len(self.members)})"
def detect_blocs(
    db: Database,
    cohesion_threshold: float = 0.70,
    min_size: int = 2,
    min_shared_drafts: int = 2,
) -> list[Bloc]:
    """Detect team blocs where all member pairs share >= threshold of drafts.

    Uses connected components on a cohesion-filtered co-author graph,
    then merges overlapping groups into single blocs.

    Args:
        db: open Database with authors / draft_authors populated.
        cohesion_threshold: minimum shared/min(draft_count) ratio for a pair
            to count as an edge in the graph.
        min_size: smallest connected component kept as a bloc.
        min_shared_drafts: minimum co-authored drafts for a pair to be
            considered at all (applied in SQL via HAVING).

    Returns:
        Blocs sorted by shared-draft count desc, then member count desc.
    """
    draft_counts = db.author_draft_counts()
    draft_sets = db.author_draft_sets()
    # Get enriched pair data with person_ids. The da1.person_id < da2.person_id
    # join condition yields each unordered pair exactly once.
    rows = db.conn.execute(
        """SELECT a1.name, da1.person_id, a2.name, da2.person_id, COUNT(*) as shared
        FROM draft_authors da1
        JOIN draft_authors da2 ON da1.draft_name = da2.draft_name
        AND da1.person_id < da2.person_id
        JOIN authors a1 ON da1.person_id = a1.person_id
        JOIN authors a2 ON da2.person_id = a2.person_id
        GROUP BY da1.person_id, da2.person_id
        HAVING shared >= ?
        ORDER BY shared DESC""",
        (min_shared_drafts,),
    ).fetchall()
    # Get affiliations per person (normalized to canonical org names).
    aff_rows = db.conn.execute(
        "SELECT person_id, affiliation FROM authors"
    ).fetchall()
    person_aff = {r[0]: normalize_org(r[1]) for r in aff_rows}
    person_name: dict[int, str] = {}
    # Build cohesion-filtered adjacency: only keep edges with high overlap.
    adj: dict[int, set[int]] = defaultdict(set)
    pair_shared: dict[tuple[int, int], int] = {}
    pair_cohesion: dict[tuple[int, int], float] = {}
    for r in rows:
        name_a, pid_a, name_b, pid_b, shared = r[0], r[1], r[2], r[3], r[4]
        person_name[pid_a] = name_a
        person_name[pid_b] = name_b
        # Cohesion is relative to the LESS prolific author of the pair, so a
        # junior member who only ever co-authors with the team scores high.
        min_d = min(draft_counts.get(pid_a, 1), draft_counts.get(pid_b, 1))
        cohesion = shared / min_d
        if cohesion >= cohesion_threshold:
            adj[pid_a].add(pid_b)
            adj[pid_b].add(pid_a)
            key = (min(pid_a, pid_b), max(pid_a, pid_b))
            pair_shared[key] = shared
            pair_cohesion[key] = cohesion
    # Find connected components (each component = one merged bloc) via an
    # iterative DFS over the filtered adjacency.
    visited: set[int] = set()
    components: list[set[int]] = []
    for pid in adj:
        if pid in visited:
            continue
        component: set[int] = set()
        stack = [pid]
        while stack:
            node = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            component.add(node)
            stack.extend(adj[node] - visited)
        if len(component) >= min_size:
            components.append(component)
    # Build Bloc objects from components
    blocs = []
    for comp in components:
        members = [
            (pid, person_name.get(pid, "?"), person_aff.get(pid, ""))
            for pid in comp
        ]
        # Shared drafts = drafts where >= 2 bloc members appear
        all_drafts: dict[str, int] = defaultdict(int)
        for pid in comp:
            for d in draft_sets.get(pid, set()):
                all_drafts[d] += 1
        shared_count = sum(1 for cnt in all_drafts.values() if cnt >= 2)
        # Primary org = most common among members (ties break arbitrarily)
        org_counts: dict[str, int] = defaultdict(int)
        for _, _, org in members:
            if org:
                org_counts[org] += 1
        primary = max(org_counts, key=org_counts.get) if org_counts else ""
        # Average pairwise cohesion -- only pairs that passed the threshold
        # are in pair_cohesion, so this averages over connected pairs only.
        edges = [
            pair_cohesion[key]
            for a in comp for b in comp if a < b
            for key in [(a, b)] if key in pair_cohesion
        ]
        avg_coh = sum(edges) / len(edges) if edges else 0
        blocs.append(Bloc(
            # Most prolific members first within each bloc.
            members=sorted(members, key=lambda m: -len(draft_sets.get(m[0], set()))),
            shared_drafts=shared_count,
            primary_org=primary,
            cohesion=avg_coh,
        ))
    # Sort: most shared drafts first (the interesting ones)
    blocs.sort(key=lambda b: (-b.shared_drafts, -len(b.members)))
    return blocs
def top_orgs_normalized(
    db: Database, limit: int = 20
) -> list[tuple[str, int, int]]:
    """Return (canonical_org, unique_authors, unique_drafts) with merged orgs."""
    people_by_org: dict[str, set[int]] = defaultdict(set)
    drafts_by_org: dict[str, set[str]] = defaultdict(set)
    for affiliation, person_id, draft_name in db.org_data_raw():
        org = normalize_org(affiliation)
        if not org:
            continue
        people_by_org[org].add(person_id)
        drafts_by_org[org].add(draft_name)
    # Rank by unique draft count, descending (stable for ties).
    ranked = sorted(
        (
            (org, len(pids), len(drafts_by_org[org]))
            for org, pids in people_by_org.items()
        ),
        key=lambda row: -row[2],
    )
    return ranked[:limit]
def cross_org_normalized(
    db: Database, limit: int = 20
) -> list[tuple[str, str, int]]:
    """Return (org_a, org_b, shared_drafts) with normalized org names."""
    # All (aff_a, aff_b, draft_name) triples for co-author pairs where both
    # sides have a non-empty affiliation.
    rows = db.conn.execute(
        """SELECT da1.affiliation, da2.affiliation, da1.draft_name
        FROM draft_authors da1
        JOIN draft_authors da2 ON da1.draft_name = da2.draft_name
        AND da1.person_id < da2.person_id
        WHERE da1.affiliation != '' AND da2.affiliation != ''"""
    ).fetchall()
    shared: dict[tuple[str, str], set[str]] = defaultdict(set)
    for raw_a, raw_b, draft_name in rows:
        org_a = normalize_org(raw_a)
        org_b = normalize_org(raw_b)
        # Only genuine cross-org pairs count; sort so (A, B) == (B, A).
        if not org_a or not org_b or org_a == org_b:
            continue
        shared[tuple(sorted((org_a, org_b)))].add(draft_name)
    ranked = sorted(
        ((a, b, len(drafts)) for (a, b), drafts in shared.items()),
        key=lambda row: -row[2],
    )
    return ranked[:limit]

View File

@@ -0,0 +1,6 @@
"""Gap-to-Draft generation pipeline."""
from .context import ContextBuilder
from .generator import PipelineGenerator
from .quality import QualityGates
from .family import FamilyCoordinator
from .formatter import DraftFormatter

View File

@@ -0,0 +1,259 @@
"""Context builder — assembles rich context for draft generation from DB queries."""
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
from rich.console import Console
from ..config import Config
from ..db import Database
console = Console()
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
dot = np.dot(a, b)
norm = np.linalg.norm(a) * np.linalg.norm(b)
if norm == 0:
return 0.0
return float(dot / norm)
class ContextBuilder:
    """Assembles rich generation context for a gap topic.

    All methods are DB retrieval only, with one exception: _similar_drafts
    calls Ollama to embed the gap description before comparing against
    stored draft embeddings.
    """

    def __init__(self, config: Config, db: Database):
        self.config = config
        self.db = db

    def build_context(self, gap_topic: str) -> dict:
        """Assemble full context for a gap topic. All DB queries, zero Claude calls."""
        gap = self._find_gap(gap_topic)
        if not gap:
            console.print(f"[yellow]No gap found matching '{gap_topic}', using topic as-is[/]")
            # Synthetic gap record (id 0) so downstream formatting still works.
            gap = {
                "id": 0,
                "topic": gap_topic,
                "description": gap_topic,
                "category": "",
                "evidence": "",
                "severity": "medium",
            }
        ideas = self._convergent_ideas(gap)
        rfcs = self._rfc_foundations(gap.get("category", ""))
        similar = self._similar_drafts(gap["description"])
        top_rated = self._top_rated_in_category(gap.get("category", ""))
        wg_context = self._wg_context()
        ecosystem = self._ecosystem_vision()
        siblings = self._sibling_context(gap_topic)
        return {
            "gap": gap,
            "convergent_ideas": ideas,
            "rfc_foundations": rfcs,
            "similar_drafts": similar,
            "top_rated": top_rated,
            "wg_context": wg_context,
            "ecosystem_vision": ecosystem,
            "sibling_context": siblings,
        }

    def _find_gap(self, topic: str) -> dict | None:
        """Find a gap by topic string (fuzzy match).

        Tries, in order: exact topic match, substring match (either
        direction), then word-overlap scoring requiring >= 2 overlapping
        words. Returns None when nothing clears that bar.
        """
        gaps = self.db.all_gaps()
        topic_lower = topic.lower()
        # Exact match first
        for g in gaps:
            if g["topic"].lower() == topic_lower:
                return g
        # Substring match
        for g in gaps:
            if topic_lower in g["topic"].lower() or topic_lower in g["description"].lower():
                return g
        # Word overlap match
        topic_words = set(topic_lower.split())
        best = None
        best_score = 0
        for g in gaps:
            gap_words = set(g["topic"].lower().split()) | set(g["description"].lower().split())
            overlap = len(topic_words & gap_words)
            if overlap > best_score:
                best_score = overlap
                best = g
        return best if best_score >= 2 else None

    def _convergent_ideas(self, gap: dict, limit: int = 20) -> list[dict]:
        """Find ideas that converge on this gap topic via keyword matching."""
        all_ideas = self.db.all_ideas()
        if not all_ideas:
            return []
        # Build search terms from gap topic + description
        search_text = (gap["topic"] + " " + gap["description"]).lower()
        search_words = set(search_text.split())
        # Remove common words so matches reflect topical overlap, not grammar.
        stop_words = {"the", "a", "an", "and", "or", "in", "of", "for", "to", "is",
                      "are", "that", "this", "with", "not", "by", "on", "at", "from",
                      "as", "be", "it", "no", "but", "has", "have", "do", "does"}
        search_words -= stop_words
        scored = []
        for idea in all_ideas:
            idea_text = (idea["title"] + " " + idea["description"]).lower()
            idea_words = set(idea_text.split())
            overlap = len(search_words & idea_words)
            if overlap >= 1:
                scored.append((overlap, idea))
        # Highest-overlap ideas first.
        scored.sort(key=lambda x: x[0], reverse=True)
        return [item for _, item in scored[:limit]]

    def _rfc_foundations(self, category: str, limit: int = 10) -> list[tuple[str, int]]:
        """Get most-referenced RFCs, optionally filtered by category.

        With a category, the count returned is the number of citing drafts
        WITHIN that category, not the global citation count.
        """
        top_refs = self.db.top_referenced(ref_type="rfc", limit=limit * 2)
        if not category:
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]
        # Filter to RFCs referenced by drafts in this category
        category_lower = category.lower()
        pairs = self.db.drafts_with_ratings(limit=500)
        category_drafts = set()
        for draft, rating in pairs:
            for cat in rating.categories:
                if category_lower in cat.lower():
                    category_drafts.add(draft.name)
        if not category_drafts:
            # Unknown category: fall back to the global top list.
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]
        filtered = []
        for ref_id, count, draft_names in top_refs:
            cat_count = sum(1 for d in draft_names if d in category_drafts)
            if cat_count > 0:
                filtered.append((ref_id, cat_count))
        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered[:limit]

    def _similar_drafts(self, gap_desc: str, limit: int = 8) -> list[tuple[str, float]]:
        """Find semantically similar existing drafts via embeddings.

        Best-effort: returns [] when no embeddings are stored or the Ollama
        embed call fails (network/model unavailable).
        """
        all_embeddings = self.db.all_embeddings()
        if not all_embeddings:
            return []
        # Try to embed the gap description via Ollama
        try:
            import ollama as ollama_lib
            client = ollama_lib.Client(host=self.config.ollama_url)
            resp = client.embed(
                model=self.config.ollama_embed_model,
                input=gap_desc[:8000],
            )
            gap_vec = np.array(resp["embeddings"][0], dtype=np.float32)
        except Exception as e:
            console.print(f"[yellow]Ollama embedding failed, skipping similarity: {e}[/]")
            return []
        similarities = []
        for name, vec in all_embeddings.items():
            sim = _cosine_similarity(gap_vec, vec)
            similarities.append((name, sim))
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:limit]

    def _top_rated_in_category(self, category: str, limit: int = 5) -> list[tuple]:
        """Get top-rated drafts in a category as (name, title, composite_score).

        Relies on drafts_with_ratings returning its best-first ordering;
        with no category, the first `limit` pairs are taken as-is.
        """
        pairs = self.db.drafts_with_ratings(limit=500)
        if not category:
            return [
                (draft.name, draft.title, rating.composite_score)
                for draft, rating in pairs[:limit]
            ]
        category_lower = category.lower()
        matching = []
        for draft, rating in pairs:
            for cat in rating.categories:
                if category_lower in cat.lower():
                    matching.append((draft.name, draft.title, rating.composite_score))
                    break
        return matching[:limit]

    def _wg_context(self) -> str:
        """Summarize WG adoption status as a single prompt-ready sentence."""
        adoption = self.db.draft_adoption_status()
        wg_counts: dict[str, int] = {}
        adopted_count = 0
        for d in adoption:
            if d["wg_adopted"]:
                adopted_count += 1
                wg = d["wg_name"]
                wg_counts[wg] = wg_counts.get(wg, 0) + 1
        total = len(adoption)
        if not wg_counts:
            return f"{total} drafts, none WG-adopted yet."
        top_wgs = sorted(wg_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        wg_lines = ", ".join(f"{wg} ({n})" for wg, n in top_wgs)
        return f"{total} drafts, {adopted_count} WG-adopted. Top WGs: {wg_lines}"

    def _ecosystem_vision(self) -> str:
        """Load ecosystem vision document if it exists.

        Prefers the one-page pitch section, then the vision summary,
        then the raw head of the file -- always capped at 2000 chars.
        """
        vision_path = Path(self.config.data_dir) / "reports" / "holistic-agent-ecosystem-draft-outlines.md"
        if not vision_path.exists():
            return "(No ecosystem vision document found)"
        text = vision_path.read_text()
        # Return the pitch section (compact) rather than the full document
        if "## 8. One-Page Pitch" in text:
            pitch = text.split("## 8. One-Page Pitch")[1].strip()
            return pitch[:2000]
        # Fallback: return the vision summary
        if "## 1. Vision Summary" in text:
            parts = text.split("## 1. Vision Summary")[1]
            if "## 2." in parts:
                parts = parts.split("## 2.")[0]
            return parts.strip()[:2000]
        return text[:2000]

    def _sibling_context(self, gap_topic: str) -> list[dict]:
        """Get outlines of sibling drafts from the same family.

        Returns [] when the topic maps to no previously generated family.
        Each entry carries role/title/abstract plus the parsed outline
        (empty dict when outline_json is missing or unparseable).
        """
        # Check all family drafts
        families = self.db.get_generated_drafts()
        if not families:
            return []
        # Find which family this gap_topic belongs to
        topic_lower = gap_topic.lower()
        family_name = ""
        for gd in families:
            if topic_lower in gd.get("gap_topic", "").lower():
                family_name = gd.get("family_name", "")
                break
        if not family_name:
            return []
        siblings = self.db.get_family_drafts(family_name)
        result = []
        for s in siblings:
            if s.get("gap_topic", "").lower() == topic_lower:
                continue  # Skip self
            outline = {}
            if s.get("outline_json"):
                try:
                    outline = json.loads(s["outline_json"]) if isinstance(s["outline_json"], str) else s["outline_json"]
                except (json.JSONDecodeError, TypeError):
                    pass  # keep the empty outline rather than failing the build
            result.append({
                "role": s.get("family_role", ""),
                "title": s.get("title", ""),
                "abstract": s.get("abstract", ""),
                "outline": outline,
            })
        return result

View File

@@ -0,0 +1,219 @@
"""Family coordinator — orchestrates generation of the 5-draft ecosystem."""
from __future__ import annotations
import json
from rich.console import Console
from ..config import Config
from ..db import Database
from .generator import PipelineGenerator
from .quality import QualityGates
console = Console()
# The five-draft ecosystem, in generation order. AEM (the architecture /
# terminology foundation) is generated first so the later drafts can be
# produced with sibling context from it.
FAMILY_DRAFTS = [
    {
        "role": "AEM",
        "topic": "Agent Ecosystem Model",
        "description": (
            "Core architecture and terminology for the agent ecosystem. "
            "Defines shared concepts: DAG execution model, HITL points, "
            "assurance levels, protocol agnosticism. Foundation for all "
            "companion drafts."
        ),
    },
    {
        "role": "ATD",
        "topic": "Agent Task DAG",
        "description": (
            "Execution model using DAG structure with checkpoints and rollback. "
            "Defines node semantics (pending/running/done/failed/rolled-back), "
            "resource hints, circuit breakers, and rollback protocol. "
            "Uses ECT as token and DAG format."
        ),
    },
    {
        "role": "HITL",
        "topic": "Human-in-the-Loop",
        "description": (
            "Human oversight as first-class primitive. Approval gates, "
            "escalation paths, emergency override (PAUSE/CONSTRAIN/STOP/TAKEOVER), "
            "and explainability hooks. Integrates with DAG as HITL nodes."
        ),
    },
    {
        "role": "AEPB",
        "topic": "Agent Ecosystem Protocol Bindings",
        "description": (
            "Cross-protocol interoperability layer. Capability advertisement, "
            "protocol binding requirements, translation gateways, negotiation. "
            "Makes ecosystem semantics available over any A2A protocol."
        ),
    },
    {
        "role": "APAE",
        "topic": "Agent Provenance Assurance Ecosystem",
        "description": (
            "Trust, verification, and provenance for dual-regime operation. "
            "Assurance profiles (relaxed/standard/regulated), behavior verification, "
            "dynamic trust scoring (AIMD model), provenance chains. "
            "Same stack from K8s to fully proven."
        ),
    },
]
class FamilyCoordinator:
    """Orchestrates generation of the 5-draft ecosystem defined in FAMILY_DRAFTS."""

    def __init__(self, config: Config, db: Database, analyzer):
        self.config = config
        self.db = db
        self.analyzer = analyzer
        self.generator = PipelineGenerator(config, db, analyzer)
        self.quality = QualityGates(config, db, analyzer)

    def generate_family(self, family_name: str = "agent-ecosystem", cheap: bool = False) -> list[dict]:
        """Generate all 5 drafts in order. AEM first, then B-E with sibling context.

        Each draft is generated then run through the quality gates. A failure
        on one draft is logged and does not abort the remaining ones; the
        failed entry carries an "error" key in the returned list.
        """
        console.print(f"\n[bold cyan]Generating draft family: {family_name}[/]")
        console.print(f"Drafts: {len(FAMILY_DRAFTS)}, cheap={cheap}")
        # Log the generation run
        run_id = self.db.log_generation_run({
            "family_name": family_name,
            "gap_ids": [d["role"] for d in FAMILY_DRAFTS],
            "model_used": self.config.claude_model_cheap if cheap else self.config.claude_model,
            "status": "running",
        })
        results = []
        # NOTE(review): these token accumulators are never updated or read
        # in this method -- presumably leftover from an earlier revision.
        total_in = 0
        total_out = 0
        for i, draft_spec in enumerate(FAMILY_DRAFTS):
            console.print(
                f"\n[bold]{'='*60}[/]"
                f"\n[bold]Draft {i+1}/{len(FAMILY_DRAFTS)}: "
                f"[cyan]{draft_spec['role']}[/] — {draft_spec['topic']}[/]"
                f"\n[bold]{'='*60}[/]"
            )
            try:
                result = self.generator.generate_full(
                    gap_topic=draft_spec["topic"],
                    cheap=cheap,
                    family_name=family_name,
                    family_role=draft_spec["role"],
                )
                results.append(result)
                # Run quality gates
                draft_id = result.get("id")
                if draft_id:
                    console.print(f"\n[dim]Running quality gates for {draft_spec['role']}...[/]")
                    qr = self.quality.run_all(draft_id)
                    result["quality_results"] = qr
            except Exception as e:
                # Best-effort per draft: record the failure and keep going.
                console.print(f"[red]Failed to generate {draft_spec['role']}: {e}[/]")
                results.append({
                    "role": draft_spec["role"],
                    "topic": draft_spec["topic"],
                    "error": str(e),
                })
        # Update run (marked completed even with partial failures; per-draft
        # errors are reflected in the results list).
        self.db.update_generation_run(
            run_id,
            status="completed",
            completed_at=_now_iso(),
        )
        # Summary
        console.print(f"\n[bold cyan]{'='*60}[/]")
        console.print(f"[bold]Family generation complete: {family_name}[/]")
        successful = [r for r in results if "error" not in r]
        console.print(f"  Generated: {len(successful)}/{len(FAMILY_DRAFTS)} drafts")
        for r in results:
            if "error" in r:
                console.print(f"  [red]FAIL[/] {r['role']}: {r['error']}")
            else:
                console.print(f"  [green]OK[/] {r.get('family_role', '?')}: {r.get('title', '?')}")
        return results

    def check_consistency(self, family_name: str) -> dict:
        """Check terminology consistency across family drafts.

        Collects each draft's outline "terminology" dict and flags any term
        whose definitions differ (after lowercase/strip normalization)
        between drafts that share it.
        """
        drafts = self.db.get_family_drafts(family_name)
        if not drafts:
            return {"consistent": False, "details": "No drafts found for family"}
        # Collect terminology from all outlines
        all_terms: dict[str, dict[str, str]] = {}  # term -> {role: definition}
        for gd in drafts:
            role = gd.get("family_role", "?")
            outline_raw = gd.get("outline_json", "{}")
            try:
                outline = json.loads(outline_raw) if isinstance(outline_raw, str) else outline_raw
            except (json.JSONDecodeError, TypeError):
                continue  # unparseable outline: skip this draft's terms
            terms = outline.get("terminology", {})
            if not isinstance(terms, dict):
                continue
            for term, defn in terms.items():
                term_lower = term.lower()
                if term_lower not in all_terms:
                    all_terms[term_lower] = {}
                all_terms[term_lower][role] = defn
        # Find terms used in multiple drafts
        shared_terms = {t: roles for t, roles in all_terms.items() if len(roles) > 1}
        if not shared_terms:
            return {
                "consistent": True,
                "shared_terms": 0,
                "details": "No shared terminology found across drafts",
            }
        # Check for inconsistencies (simple: different definitions for same term)
        inconsistencies = []
        for term, roles in shared_terms.items():
            definitions = list(roles.values())
            # Rough check: normalize case/whitespace/trailing dot before comparing
            unique_defs = set(d.lower().strip().rstrip(".") for d in definitions)
            if len(unique_defs) > 1:
                inconsistencies.append({
                    "term": term,
                    "definitions": roles,
                })
        consistent = len(inconsistencies) == 0
        details_parts = [f"{len(shared_terms)} shared terms across drafts"]
        if inconsistencies:
            details_parts.append(f"{len(inconsistencies)} inconsistencies found:")
            for inc in inconsistencies:
                details_parts.append(f"  '{inc['term']}': {inc['definitions']}")
        console.print(f"\n[bold]Consistency check: {family_name}[/]")
        console.print(f"  Shared terms: {len(shared_terms)}")
        console.print(f"  Inconsistencies: {len(inconsistencies)}")
        if consistent:
            console.print("  [green]All terminology consistent[/]")
        else:
            for inc in inconsistencies:
                console.print(f"  [yellow]Inconsistent: '{inc['term']}'[/]")
                for role, defn in inc["definitions"].items():
                    console.print(f"    {role}: {defn[:80]}")
        return {
            "consistent": consistent,
            "shared_terms": len(shared_terms),
            "inconsistencies": inconsistencies,
            "details": "; ".join(details_parts),
        }
def _now_iso() -> str:
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()

View File

@@ -0,0 +1,203 @@
"""Draft formatter — assembles outline + sections into I-D text format."""
from __future__ import annotations
import textwrap
from datetime import datetime, timezone, timedelta
class DraftFormatter:
    """Assembles a generated outline + section texts into plain-text I-D format.

    All methods are static; the class is a namespace for the formatting steps
    (header, abstract, ToC, sections, references, author address).
    """

    @staticmethod
    def format_draft(outline: dict, sections: list[str], family_name: str = "") -> str:
        """Assemble outline + sections into I-D text format.

        `sections` is zipped against outline["sections"], so extra entries on
        either side are silently dropped. Raises KeyError if outline lacks
        a "title".
        """
        title = outline["title"]
        draft_name = DraftFormatter._make_draft_name(title, family_name)
        parts = []
        parts.append(DraftFormatter._header_block(outline, draft_name))
        parts.append("")
        parts.append("Abstract")
        parts.append("")
        parts.append(DraftFormatter._wrap_text(outline.get("abstract", "")))
        parts.append("")
        parts.append(DraftFormatter._status_memo(outline))
        parts.append("")
        # Terminology section (if outline has terminology)
        terms = outline.get("terminology", {})
        if terms:
            parts.append(DraftFormatter._terminology_section(outline))
            parts.append("")
        # Table of Contents (page numbers are rough placeholders: i + 2)
        parts.append("Table of Contents")
        parts.append("")
        section_list = outline.get("sections", [])
        for i, section in enumerate(section_list, 1):
            stitle = section.get("title", f"Section {i}")
            dots = "." * max(1, 60 - len(stitle))
            parts.append(f"   {i}.  {stitle} {dots} {i + 2}")
        ref_num = len(section_list) + 1
        parts.append(f"   {ref_num}.  References {'.' * (60 - len('References'))} {ref_num + 2}")
        parts.append("")
        # Sections
        for i, (section_info, section_text) in enumerate(
            zip(section_list, sections), 1
        ):
            stitle = section_info.get("title", f"Section {i}")
            parts.append(f"{i}.  {stitle}")
            parts.append("")
            parts.append(DraftFormatter._wrap_text(section_text))
            parts.append("")
        # References section
        parts.append(DraftFormatter._references_section(outline))
        parts.append("")
        # Author's Address
        parts.append("Author's Address")
        parts.append("")
        parts.append("   Generated by IETF Draft Analyzer")
        if family_name:
            parts.append(f"   Family: {family_name}")
        parts.append(f"   {datetime.now(timezone.utc).strftime('%Y-%m-%d')}")
        parts.append("")
        return "\n".join(parts)

    @staticmethod
    def _make_draft_name(title: str, family_name: str = "") -> str:
        """Generate a draft name from title.

        Slug = first 4 title words, lowercased, keeping only fully
        alphanumeric words (hyphenated/punctuated words are dropped).
        """
        words = title.lower().split()
        slug = "-".join(w for w in words[:4] if w.isalnum())
        if family_name:
            return f"draft-{family_name}-{slug}-00"
        return f"draft-ai-{slug}-00"

    @staticmethod
    def _header_block(outline: dict, draft_name: str) -> str:
        """Proper I-D header: status/date/expiry lines plus centered title.

        Expiry is ~6 months out (185 days), matching I-D convention.
        """
        now = datetime.now(timezone.utc)
        expires = now + timedelta(days=185)
        date_str = now.strftime("%B %Y")
        exp_str = expires.strftime("%B %d, %Y")
        status = outline.get("intended_status", "Informational")
        wg = outline.get("target_wg", "individual")
        title = outline["title"]
        lines = []
        lines.append(f"Internet-Draft{' ' * 45}{wg}")
        lines.append(f"Intended status: {status:<44s}{date_str}")
        lines.append(f"Expires: {exp_str}")
        lines.append("")
        lines.append("")
        lines.append(f"          {title}")
        lines.append(f"          {draft_name}")
        return "\n".join(lines)

    @staticmethod
    def _status_memo(outline: dict) -> str:
        """Status of This Memo boilerplate."""
        status = outline.get("intended_status", "Informational")
        lines = []
        lines.append("Status of This Memo")
        lines.append("")
        lines.append(DraftFormatter._wrap_text(
            "This Internet-Draft is submitted in full conformance with the "
            "provisions of BCP 78 and BCP 79."
        ))
        lines.append("")
        lines.append(DraftFormatter._wrap_text(
            f"This document is intended to have {status} status. "
            "Distribution of this memo is unlimited."
        ))
        return "\n".join(lines)

    @staticmethod
    def _references_section(outline: dict) -> str:
        """Normative + Informative References from outline data.

        Reference entries are bare identifiers (no full citation text);
        the same string is used as anchor and body.
        """
        lines = []
        norm_refs = outline.get("normative_refs", [])
        info_refs = outline.get("informative_refs", [])
        ref_num = len(outline.get("sections", [])) + 1
        lines.append(f"{ref_num}.  References")
        lines.append("")
        if norm_refs:
            lines.append(f"{ref_num}.1.  Normative References")
            lines.append("")
            for ref in norm_refs:
                lines.append(f"   [{ref}]")
                lines.append(f"              {ref}")
                lines.append("")
        if info_refs:
            # Subsection number depends on whether a normative list preceded.
            sub = "2" if norm_refs else "1"
            lines.append(f"{ref_num}.{sub}.  Informative References")
            lines.append("")
            for ref in info_refs:
                lines.append(f"   [{ref}]")
                lines.append(f"              {ref}")
                lines.append("")
        if not norm_refs and not info_refs:
            lines.append("   (No references specified)")
            lines.append("")
        return "\n".join(lines)

    @staticmethod
    def _terminology_section(outline: dict) -> str:
        """Terminology section: BCP 14 boilerplate plus term definitions."""
        terms = outline.get("terminology", {})
        if not terms:
            return ""
        lines = []
        lines.append("Terminology")
        lines.append("")
        lines.append(DraftFormatter._wrap_text(
            'The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL '
            'NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", '
            '"MAY", and "OPTIONAL" in this document are to be interpreted as '
            'described in BCP 14 [RFC2119] [RFC8174] when, and only when, they '
            'appear in all capitals, as shown here.'
        ))
        lines.append("")
        for term, definition in terms.items():
            lines.append(f"   {term}")
            lines.append(DraftFormatter._wrap_text(definition, indent=6))
            lines.append("")
        return "\n".join(lines)

    @staticmethod
    def _wrap_text(text: str, indent: int = 3, width: int = 69) -> str:
        """Wrap paragraphs for I-D layout (width includes the indent, so
        lines stay within the ~72-column convention).

        Paragraphs are split on blank lines; list items (lines starting
        with "-" or "*") are wrapped individually with a hanging indent.
        """
        prefix = " " * indent
        paragraphs = text.strip().split("\n\n")
        wrapped = []
        for para in paragraphs:
            # Preserve list items
            if para.strip().startswith("-") or para.strip().startswith("*"):
                inner_lines = para.strip().split("\n")
                for line in inner_lines:
                    line = line.strip()
                    sub_lines = textwrap.wrap(
                        line, width=width,
                        initial_indent=prefix,
                        subsequent_indent=prefix + "  ",
                    )
                    wrapped.append("\n".join(sub_lines))
            else:
                para = " ".join(para.split())  # Normalize whitespace
                lines = textwrap.wrap(
                    para, width=width,
                    initial_indent=prefix,
                    subsequent_indent=prefix,
                )
                wrapped.append("\n".join(lines))
        return "\n\n".join(wrapped)

View File

@@ -0,0 +1,269 @@
"""Pipeline generator — enhanced outline + section generation with rich context."""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
from ..config import Config
from ..db import Database
from .context import ContextBuilder
from .prompts import OUTLINE_PROMPT_V2, SECTION_PROMPT_V2
from .formatter import DraftFormatter
console = Console()
def _prompt_hash(text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()[:16]
class PipelineGenerator:
    def __init__(self, config: Config, db: Database, analyzer):
        """Store config/DB handles and build the context assembler.

        `analyzer` supplies the Claude access used later by generation
        (its _call_claude / _extract_json methods); its exact type is not
        constrained here.
        """
        self.config = config
        self.db = db
        self.analyzer = analyzer
        self.context_builder = ContextBuilder(config, db)
def _format_ideas_for_prompt(self, ideas: list[dict]) -> str:
if not ideas:
return "(none found)"
lines = []
for idea in ideas:
lines.append(
f"- [{idea.get('type', '?')}] {idea['title']}: "
f"{idea['description']} (from {idea.get('draft_name', '?')})"
)
return "\n".join(lines)
def _format_rfcs_for_prompt(self, rfcs: list[tuple[str, int]]) -> str:
if not rfcs:
return "(none found)"
return "\n".join(f"- RFC {ref_id} (cited by {count} drafts)" for ref_id, count in rfcs)
def _format_similar_for_prompt(self, similar: list[tuple[str, float]]) -> str:
if not similar:
return "(none found)"
lines = []
for name, sim in similar:
draft = self.db.get_draft(name)
title = draft.title if draft else name
lines.append(f"- {name}: {title} (similarity: {sim:.2f})")
return "\n".join(lines)
def _format_top_rated_for_prompt(self, top_rated: list[tuple]) -> str:
if not top_rated:
return "(none found)"
return "\n".join(
f"- {name}: {title} (score: {score:.1f})"
for name, title, score in top_rated
)
def _format_siblings_for_prompt(self, siblings: list[dict]) -> str:
if not siblings:
return "(none — this is the first draft in the family)"
lines = []
for s in siblings:
role = s.get("role", "?")
title = s.get("title", "?")
abstract = s.get("abstract", "")[:200]
outline = s.get("outline", {})
sections = outline.get("sections", [])
section_titles = [sec.get("title", "") for sec in sections]
lines.append(
f"- [{role}] {title}\n"
f" Abstract: {abstract}\n"
f" Sections: {', '.join(section_titles)}"
)
return "\n".join(lines)
def _format_terminology_for_prompt(self, outline: dict) -> str:
terms = outline.get("terminology", {})
if not terms:
return "(none defined yet)"
return "\n".join(f"- **{term}**: {defn}" for term, defn in terms.items())
    def generate_outline(self, context: dict, cheap: bool = False) -> dict:
        """Generate outline from assembled context. Returns outline dict.

        Responses are cached by (cache_key, prompt hash); a valid cached JSON
        response short-circuits the Claude call. Raises json.JSONDecodeError
        if a fresh model response is not valid JSON after extraction.
        """
        gap = context["gap"]
        prompt = OUTLINE_PROMPT_V2.format(
            gap_topic=gap["topic"],
            gap_description=gap["description"],
            gap_category=gap.get("category", ""),
            gap_evidence=gap.get("evidence", ""),
            gap_severity=gap.get("severity", "medium"),
            convergent_ideas=self._format_ideas_for_prompt(context["convergent_ideas"]),
            rfc_foundations=self._format_rfcs_for_prompt(context["rfc_foundations"]),
            similar_drafts=self._format_similar_for_prompt(context["similar_drafts"]),
            top_rated=self._format_top_rated_for_prompt(context["top_rated"]),
            wg_context=context["wg_context"],
            ecosystem_vision=context["ecosystem_vision"],
            sibling_context=self._format_siblings_for_prompt(context["sibling_context"]),
        )
        # Hash includes a stage tag so outline/section prompts never collide.
        phash = _prompt_hash("pipeline-outline-" + prompt)
        cache_key = f"_pipeline_{gap['topic']}_"
        # Check cache
        cached = self.db.get_cached_response(cache_key, phash)
        if cached:
            try:
                return json.loads(cached)
            except (json.JSONDecodeError, KeyError):
                pass  # corrupt cache entry: fall through to a fresh call
        text, in_tok, out_tok = self.analyzer._call_claude(
            prompt, max_tokens=4096, cheap=cheap
        )
        text = self.analyzer._extract_json(text)
        outline = json.loads(text)
        self.db.cache_response(
            cache_key, phash,
            self.config.claude_model_cheap if cheap else self.config.claude_model,
            prompt, text, in_tok, out_tok,
        )
        return outline
def generate_section(self, outline: dict, section_idx: int, context: dict, cheap: bool = False) -> str:
    """Generate a single section with relevant ideas and refs.

    Selects convergent ideas whose title/description mention any of the
    section's key_ideas (falling back to the top 3), gathers RFC refs and
    sibling cross-refs, then calls Claude through the same response cache
    used by generate_outline(). Returns the section body as plain text.
    """
    sections = outline["sections"]
    section = sections[section_idx]
    outline_text = "\n".join(
        f"{i+1}. {s['title']}: {s.get('summary', '')}"
        for i, s in enumerate(sections)
    )
    # Find ideas relevant to this section by substring match against the
    # section's key_ideas (case-insensitive); each idea is added at most once.
    key_ideas = section.get("key_ideas", [])
    relevant_ideas = []
    if key_ideas and context["convergent_ideas"]:
        for idea in context["convergent_ideas"]:
            for key in key_ideas:
                if key.lower() in idea["title"].lower() or key.lower() in idea["description"].lower():
                    relevant_ideas.append(idea)
                    break
    if not relevant_ideas:
        # Use top 3 convergent ideas as fallback
        relevant_ideas = context["convergent_ideas"][:3]
    # Format RFC refs: prefer the refs the outline itself chose (capped at
    # 10); otherwise fall back to the top RFC foundations from context.
    rfc_refs = ""
    norm_refs = outline.get("normative_refs", [])
    info_refs = outline.get("informative_refs", [])
    all_refs = norm_refs + info_refs
    if all_refs:
        rfc_refs = "\n".join(f"- {ref}" for ref in all_refs[:10])
    else:
        rfc_refs = self._format_rfcs_for_prompt(context["rfc_foundations"][:5])
    # Format cross-references to siblings
    cross_refs = self._format_siblings_for_prompt(context["sibling_context"])
    prompt = SECTION_PROMPT_V2.format(
        draft_title=outline["title"],
        abstract=outline["abstract"],
        outline_text=outline_text,
        section_num=section_idx + 1,
        section_title=section["title"],
        section_summary=section.get("summary", ""),
        relevant_ideas=self._format_ideas_for_prompt(relevant_ideas),
        rfc_refs=rfc_refs,
        cross_refs=cross_refs,
        terminology=self._format_terminology_for_prompt(outline),
    )
    phash = _prompt_hash("pipeline-section-" + prompt)
    cache_key = f"_pipeline_{outline['title']}_s{section_idx}_"
    # Check cache (keyed by draft title + section index + prompt hash).
    cached = self.db.get_cached_response(cache_key, phash)
    if cached:
        return cached
    text, in_tok, out_tok = self.analyzer._call_claude(
        prompt, max_tokens=2048, cheap=cheap
    )
    self.db.cache_response(
        cache_key, phash,
        self.config.claude_model_cheap if cheap else self.config.claude_model,
        prompt, text, in_tok, out_tok,
    )
    return text
def generate_full(self, gap_topic: str, cheap: bool = False,
                  family_name: str = "", family_role: str = "") -> dict:
    """Full pipeline: context -> outline -> sections -> assemble -> store in DB.

    Args:
        gap_topic: Topic string identifying the gap to address.
        cheap: Use the cheaper Claude model for all generation calls.
        family_name: Optional draft-family name passed to the formatter.
        family_role: Optional role of this draft within its family.

    Returns:
        The stored draft record (dict), including its DB "id".
    """
    console.print(f"\n[bold]Pipeline: {gap_topic}[/]")
    # Step 1: assemble ideas / RFCs / similar drafts / sibling context.
    console.print("[dim]Step 1/4:[/] Building context...")
    context = self.context_builder.build_context(gap_topic)
    console.print(
        f" Ideas: {len(context['convergent_ideas'])}, "
        f"RFCs: {len(context['rfc_foundations'])}, "
        f"Similar: {len(context['similar_drafts'])}, "
        f"Siblings: {len(context['sibling_context'])}"
    )
    # Step 2: Generate outline
    console.print("[dim]Step 2/4:[/] Generating outline...")
    outline = self.generate_outline(context, cheap=cheap)
    console.print(f" Title: [cyan]{outline['title']}[/]")
    console.print(f" Sections: {len(outline['sections'])}")
    console.print(f" Status: {outline.get('intended_status', '?')}")
    # Step 3: one Claude call per section, with a progress bar.
    console.print("[dim]Step 3/4:[/] Generating sections...")
    sections = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Writing...", total=len(outline["sections"]))
        for i, s in enumerate(outline["sections"]):
            progress.update(task, description=f"Section: {s['title'][:30]}")
            text = self.generate_section(outline, i, context, cheap=cheap)
            sections.append(text)
            progress.advance(task)
    # Step 4: Assemble and store
    console.print("[dim]Step 4/4:[/] Assembling draft...")
    full_text = DraftFormatter.format_draft(outline, sections, family_name=family_name)
    # Derive a slug from the title. Fix: strip punctuation from each word
    # instead of discarding any word containing it — the previous
    # `w.isalnum()` filter dropped words like "agent-to-agent" or "HTTP/2"
    # entirely and could produce an empty slug ("draft-ai--00").
    slug_words = []
    for word in outline["title"].lower().split():
        cleaned = "".join(ch for ch in word if ch.isalnum())
        if cleaned:
            slug_words.append(cleaned)
        if len(slug_words) == 4:
            break
    slug = "-".join(slug_words) or "untitled"
    draft_name = f"draft-ai-{slug}-00"
    data = {
        "gap_topic": gap_topic,
        "draft_name": draft_name,
        "title": outline["title"],
        "abstract": outline.get("abstract", ""),
        "outline": outline,
        "sections": sections,
        "full_text": full_text,
        "family_name": family_name,
        "family_role": family_role,
        "version": 0,
        "status": "draft",
    }
    draft_id = self.db.upsert_generated_draft(data)
    console.print(f" Stored as generated_draft id={draft_id}, name={draft_name}")
    data["id"] = draft_id
    return data

View File

@@ -0,0 +1,92 @@
"""Prompt templates for the gap-to-draft generation pipeline."""
from __future__ import annotations
# Prompt for GenerationPipeline.generate_outline(). All {placeholders} are
# filled via str.format(); literal braces in the JSON schema are doubled.
# The model must reply with a single JSON object and no code fences.
OUTLINE_PROMPT_V2 = """\
You are writing an IETF Internet-Draft to address a gap in the AI/agent standardization landscape.
## Gap to Address
Topic: {gap_topic}
Description: {gap_description}
Category: {gap_category}
Evidence: {gap_evidence}
Severity: {gap_severity}
## Convergent Ideas from Existing Drafts
These ideas from the current landscape converge on this topic — build on them, don't duplicate:
{convergent_ideas}
## RFC Foundations
Most-referenced RFCs in this space — cite where relevant:
{rfc_foundations}
## Similar Existing Drafts
These drafts are closest to this gap — differentiate from them:
{similar_drafts}
## Top-Rated Drafts in Category
Drafts the community considers strong in this area:
{top_rated}
## Working Group Context
{wg_context}
## Ecosystem Vision
{ecosystem_vision}
## Sibling Drafts (same family)
{sibling_context}
Generate a detailed outline for an Internet-Draft that fills this gap.
Return JSON:
{{
"title": "full draft title",
"abstract": "150-250 word abstract",
"sections": [
{{"title": "section title", "summary": "2-3 sentence summary of content", "key_ideas": ["idea titles to incorporate"]}}
],
"normative_refs": ["RFC NNNN", "draft-name"],
"informative_refs": ["RFC NNNN", "draft-name"],
"terminology": {{"term": "definition"}},
"target_wg": "suggested IETF working group",
"intended_status": "informational|standards-track|experimental"
}}
Requirements:
- Include standard sections: Introduction, Terminology, Problem Statement, then 2-4 technical sections, Security Considerations, IANA Considerations
- Reference specific RFCs and drafts from the context above
- Use terminology consistent with sibling drafts if any
- Abstract should clearly state the problem, approach, and contribution
JSON only, no fences."""
# Prompt for GenerationPipeline.generate_section(). Filled once per section;
# the model returns one plain-text section body (no heading, no markdown).
SECTION_PROMPT_V2 = """\
Write the following section of an Internet-Draft titled "{draft_title}".
Abstract: {abstract}
Full outline:
{outline_text}
Write section {section_num}: {section_title}
Summary: {section_summary}
## Relevant Ideas to Incorporate
{relevant_ideas}
## RFC References to Cite
{rfc_refs}
## Cross-References to Sister Drafts
{cross_refs}
## Terminology
{terminology}
Follow IETF Internet-Draft conventions:
- Formal, precise technical language
- Use RFC 2119 keywords (MUST, SHOULD, MAY) where appropriate
- Reference existing RFCs and drafts where relevant (use [RFCNNNN] format)
- 3-6 paragraphs per section
- Use the terminology definitions provided above consistently
Write the section content only (no section number or title). Plain text."""

View File

@@ -0,0 +1,277 @@
"""Quality gates for generated drafts — novelty, references, format, self-rating."""
from __future__ import annotations
import json
import re
from datetime import datetime, timezone
import numpy as np
from rich.console import Console
from ..config import Config
from ..db import Database
console = Console()
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
dot = np.dot(a, b)
norm = np.linalg.norm(a) * np.linalg.norm(b)
if norm == 0:
return 0.0
return float(dot / norm)
# Section titles (lowercased) every Internet-Draft must contain;
# QualityGates.check_format() searches for each as a substring of the
# lowercased draft text.
REQUIRED_SECTIONS = ["introduction", "security considerations", "iana considerations"]
class QualityGates:
    """Quality gates for generated drafts.

    Four independent checks, each returning a dict of the shape
    {"passed": bool, "score": float, "details": str}:

    - novelty: embedding similarity against all known drafts (via Ollama)
    - references: RFC/draft citations present and cross-checked in the DB
    - format: 72-char lines, required sections, no leaked markdown
    - self_rating: Claude-based rating reusing the analyzer's rubric
    """

    def __init__(self, config: Config, db: Database, analyzer):
        # analyzer is the project's Claude wrapper; only _call_claude and
        # _extract_json are used here (see check_self_rating).
        self.config = config
        self.db = db
        self.analyzer = analyzer

    def run_all(self, draft_id: int) -> dict:
        """Run all quality gates. Returns {gate_name: {passed: bool, score: float, details: str}}"""
        results = {}
        results["novelty"] = self.check_novelty(draft_id)
        results["references"] = self.check_references(draft_id)
        results["format"] = self.check_format(draft_id)
        results["self_rating"] = self.check_self_rating(draft_id)
        # Summary line is green only when every gate passed.
        passed = sum(1 for r in results.values() if r["passed"])
        total = len(results)
        console.print(
            f"Quality gates: [{'green' if passed == total else 'yellow'}]"
            f"{passed}/{total} passed[/]"
        )
        for name, result in results.items():
            status = "[green]PASS[/]" if result["passed"] else "[red]FAIL[/]"
            console.print(f" {status} {name}: {result['details']}")
        return results

    def check_novelty(self, draft_id: int) -> dict:
        """Embed generated abstract, compare against all existing drafts.
        Flag if max_similarity > 0.90."""
        gd = self.db.get_generated_draft(draft_id)
        if not gd:
            return {"passed": False, "score": 0.0, "details": "Draft not found"}
        abstract = gd.get("abstract", "")
        title = gd.get("title", "")
        text_to_embed = f"{title}\n\n{abstract}"
        if not text_to_embed.strip():
            return {"passed": False, "score": 0.0, "details": "No abstract to check"}
        # Embed via Ollama. Best-effort: if the local Ollama server is
        # unreachable the gate passes with score 0.0 instead of blocking
        # the pipeline.
        try:
            import ollama as ollama_lib
            client = ollama_lib.Client(host=self.config.ollama_url)
            resp = client.embed(
                model=self.config.ollama_embed_model,
                input=text_to_embed[:8000],
            )
            gen_vec = np.array(resp["embeddings"][0], dtype=np.float32)
        except Exception as e:
            return {"passed": True, "score": 0.0,
                    "details": f"Ollama unavailable, skipping novelty check: {e}"}
        all_embeddings = self.db.all_embeddings()
        if not all_embeddings:
            return {"passed": True, "score": 1.0, "details": "No existing embeddings to compare"}
        # Linear scan for the nearest existing draft by cosine similarity.
        max_sim = 0.0
        most_similar = ""
        for name, vec in all_embeddings.items():
            sim = _cosine_similarity(gen_vec, vec)
            if sim > max_sim:
                max_sim = sim
                most_similar = name
        passed = max_sim < 0.90
        return {
            "passed": passed,
            "score": 1.0 - max_sim,  # higher score == more novel
            "details": (
                f"Max similarity: {max_sim:.3f} with {most_similar}"
                + ("" if passed else " — too similar, needs differentiation")
            ),
        }

    def check_references(self, draft_id: int) -> dict:
        """Extract RFC/draft refs via regex, cross-check against draft_refs table."""
        gd = self.db.get_generated_draft(draft_id)
        if not gd:
            return {"passed": False, "score": 0.0, "details": "Draft not found"}
        full_text = gd.get("full_text", "")
        if not full_text:
            return {"passed": False, "score": 0.0, "details": "No full text"}
        # Extract references from generated text. Matches both "[RFC2119]"
        # and bare "RFC 2119" forms.
        rfc_pattern = re.compile(r'\[?RFC\s*(\d{3,5})\]?', re.IGNORECASE)
        draft_pattern = re.compile(r'(draft-[a-z0-9-]+)', re.IGNORECASE)
        found_rfcs = set(rfc_pattern.findall(full_text))
        found_drafts = set(draft_pattern.findall(full_text))
        total_refs = len(found_rfcs) + len(found_drafts)
        # Cross-check: how many of these RFCs are actually in our DB?
        known_rfcs = set()
        for ref_id in found_rfcs:
            drafts = self.db.drafts_referencing("rfc", ref_id)
            if drafts:
                known_rfcs.add(ref_id)
        # Cross-check: how many referenced drafts exist in our DB?
        known_drafts = set()
        for dname in found_drafts:
            if self.db.get_draft(dname):
                known_drafts.add(dname)
        # Pass requires at least 3 refs and >= 30% verifiable in the DB.
        verified = len(known_rfcs) + len(known_drafts)
        score = verified / total_refs if total_refs > 0 else 0.0
        passed = total_refs >= 3 and score >= 0.3
        return {
            "passed": passed,
            "score": score,
            "details": (
                f"{total_refs} refs found ({len(found_rfcs)} RFCs, {len(found_drafts)} drafts), "
                f"{verified} verified in DB ({score:.0%})"
            ),
        }

    def check_format(self, draft_id: int) -> dict:
        """Check line length <= 72, required sections present, no markdown leaked."""
        gd = self.db.get_generated_draft(draft_id)
        if not gd:
            return {"passed": False, "score": 0.0, "details": "Draft not found"}
        full_text = gd.get("full_text", "")
        if not full_text:
            return {"passed": False, "score": 0.0, "details": "No full text"}
        issues = []
        # Check line length (RFC plain-text convention: 72 columns max).
        lines = full_text.split("\n")
        long_lines = [i + 1 for i, line in enumerate(lines) if len(line) > 72]
        if long_lines:
            issues.append(f"{len(long_lines)} lines exceed 72 chars")
        # Check required sections (substring match on lowercased text).
        text_lower = full_text.lower()
        for section in REQUIRED_SECTIONS:
            if section not in text_lower:
                issues.append(f"Missing required section: {section}")
        # Check for leaked markdown (the model should emit plain text only).
        markdown_patterns = [
            (r'^#{1,3}\s', "markdown headers (# )"),
            (r'\*\*[^*]+\*\*', "bold markdown (**text**)"),
            (r'```', "code fences (```)"),
            (r'\[([^\]]+)\]\(http', "markdown links"),
        ]
        for pattern, desc in markdown_patterns:
            if re.search(pattern, full_text, re.MULTILINE):
                issues.append(f"Leaked markdown: {desc}")
        if not issues:
            return {"passed": True, "score": 1.0, "details": "All format checks pass"}
        # Each issue costs 0.25; score floors at 0.
        score = max(0.0, 1.0 - len(issues) * 0.25)
        return {
            "passed": len(issues) <= 1,  # Allow one minor issue
            "score": score,
            "details": "; ".join(issues),
        }

    def check_self_rating(self, draft_id: int) -> dict:
        """Feed through existing rate_draft() pipeline. Score on same 1-5 scale."""
        gd = self.db.get_generated_draft(draft_id)
        if not gd:
            return {"passed": False, "score": 0.0, "details": "Draft not found"}
        # Create a temporary prompt matching the analyzer's rating format
        title = gd.get("title", "")
        abstract = gd.get("abstract", "")
        draft_name = gd.get("draft_name", "")
        # Deferred import avoids a module-level cycle with the analyzer.
        from ..analyzer import RATE_PROMPT_COMPACT, CATEGORIES_SHORT, _prompt_hash
        prompt = RATE_PROMPT_COMPACT.format(
            name=draft_name,
            title=title,
            time=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            pages="?",
            abstract=abstract[:2000],
            categories=", ".join(CATEGORIES_SHORT),
        )
        phash = _prompt_hash("self-rate-" + prompt)
        cache_key = f"_selfrate_{draft_id}_"
        # Check cache; corrupt entries fall through to a fresh call.
        cached = self.db.get_cached_response(cache_key, phash)
        if cached:
            try:
                data = json.loads(cached)
                return self._parse_self_rating(data, draft_id)
            except (json.JSONDecodeError, KeyError):
                pass
        try:
            text, in_tok, out_tok = self.analyzer._call_claude(prompt, max_tokens=512, cheap=True)
            text = self.analyzer._extract_json(text)
            data = json.loads(text)
            self.db.cache_response(
                cache_key, phash,
                self.config.claude_model_cheap,
                prompt, text, in_tok, out_tok,
            )
            return self._parse_self_rating(data, draft_id)
        except Exception as e:
            # Any API/parse failure fails the gate but never raises to run_all.
            return {"passed": False, "score": 0.0,
                    "details": f"Self-rating failed: {e}"}

    def _parse_self_rating(self, data: dict, draft_id: int) -> dict:
        """Parse self-rating result and update the generated draft."""
        # Accept both the compact ("n", "m", ...) and long-form keys;
        # missing dimensions default to the midpoint 3.
        novelty = int(data.get("n", data.get("novelty", 3)))
        maturity = int(data.get("m", data.get("maturity", 3)))
        relevance = int(data.get("r", data.get("relevance", 3)))
        overlap = int(data.get("o", data.get("overlap", 3)))
        momentum = int(data.get("mo", data.get("momentum", 3)))
        # Weighted composite on the 1-5 scale; weights sum to 1.0.
        # Overlap is inverted (6 - overlap): less overlap scores higher.
        composite = (
            novelty * 0.30
            + relevance * 0.25
            + maturity * 0.20
            + momentum * 0.15
            + (6 - overlap) * 0.10
        )
        # Store rating on the generated draft
        gd = self.db.get_generated_draft(draft_id)
        if gd:
            self.db.conn.execute(
                "UPDATE generated_drafts SET rating_json = ?, quality_score = ? WHERE id = ?",
                (json.dumps(data), composite, draft_id),
            )
            self.db.conn.commit()
        passed = composite >= 2.5
        return {
            "passed": passed,
            "score": composite / 5.0,  # normalized to 0..1 like other gates
            "details": (
                f"Composite: {composite:.1f}/5 "
                f"(N:{novelty} M:{maturity} O:{overlap} Mo:{momentum} R:{relevance})"
            ),
        }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
"""Multi-source document fetcher registry."""
from .base import SourceDocument, SourceFetcher
from .ietf import IETFFetcher
from .w3c import W3CFetcher
FETCHERS = {"ietf": IETFFetcher, "w3c": W3CFetcher}
def get_fetcher(source_name: str, config=None):
    """Instantiate the fetcher registered for *source_name*.

    Args:
        source_name: Registry key, e.g. "ietf" or "w3c".
        config: Optional config object passed through to the fetcher
            constructor (each fetcher falls back to its own default).

    Raises:
        ValueError: If no fetcher is registered under *source_name*.
            The message now lists the known sources to aid debugging.
    """
    fetcher_cls = FETCHERS.get(source_name)
    if fetcher_cls is None:
        known = ", ".join(sorted(FETCHERS))
        raise ValueError(f"Unknown source: {source_name} (known sources: {known})")
    return fetcher_cls(config)

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Protocol
@dataclass
class SourceDocument:
    """Generic document from any standards body.

    Source-neutral envelope so IETF drafts, W3C specs, etc. can flow
    through the same observatory pipeline.
    """

    name: str  # Unique identifier (e.g. "draft-foo-bar", "webnn-api")
    title: str
    abstract: str
    source: str  # Originating body: "ietf", "w3c", etc.
    source_id: str = ""  # Body-specific ID (e.g. Datatracker id, W3C shortname)
    source_url: str = ""  # Canonical URL
    full_text: str | None = None  # None until downloaded via a fetcher
    time: str = ""  # ISO date
    doc_status: str = ""  # "active", "published", "expired", etc.
    extra: dict = field(default_factory=dict)  # Body-specific metadata
class SourceFetcher(Protocol):
    """Protocol for standards body fetchers.

    Structural (duck-typed) interface; implementations include the IETF
    and W3C fetchers registered in this package.
    """

    def search(
        self, keywords: list[str], since: str | None = None
    ) -> list[SourceDocument]:
        """Return documents matching *keywords*, optionally dated after *since*."""
        ...

    def download_text(self, doc: SourceDocument) -> str | None:
        """Fetch the plain-text body of *doc*, or None when unavailable."""
        ...

    def close(self) -> None:
        """Release any underlying network resources."""
        ...

View File

@@ -0,0 +1,82 @@
"""IETF Datatracker adapter — delegates to existing Fetcher."""
from __future__ import annotations
from ..config import Config
from ..fetcher import Fetcher
from ..models import Draft
from .base import SourceDocument
class IETFFetcher:
    """IETF Datatracker adapter wrapping the existing Fetcher class.

    Translates between the project's IETF-specific Draft model and the
    source-neutral SourceDocument used by the observatory pipeline.
    """

    def __init__(self, config: Config | None = None):
        self.config = config or Config.load()
        self._fetcher = Fetcher(self.config)

    def search(
        self, keywords: list[str], since: str | None = None
    ) -> list[SourceDocument]:
        """Search Datatracker and convert each Draft into a SourceDocument."""
        results = []
        for draft in self._fetcher.search_drafts(keywords=keywords, since=since):
            results.append(self._draft_to_doc(draft))
        return results

    def download_text(self, doc: SourceDocument) -> str | None:
        """Download the full text for *doc* via the wrapped Fetcher."""
        return self._fetcher.download_full_text(self._doc_to_draft(doc))

    def close(self) -> None:
        """Release the wrapped Fetcher's resources."""
        self._fetcher.close()

    @staticmethod
    def _draft_to_doc(draft: Draft) -> SourceDocument:
        """Map an IETF Draft onto the source-neutral SourceDocument."""
        # IETF-only fields ride along in the extra dict so that
        # _doc_to_draft can reconstruct the Draft losslessly.
        metadata = {
            "rev": draft.rev,
            "pages": draft.pages,
            "words": draft.words,
            "group": draft.group,
            "group_uri": draft.group_uri,
            "expires": draft.expires,
            "ad": draft.ad,
            "shepherd": draft.shepherd,
            "states": draft.states,
            "fetched_at": draft.fetched_at,
        }
        return SourceDocument(
            name=draft.name,
            title=draft.title,
            abstract=draft.abstract,
            source="ietf",
            source_id=str(draft.dt_id) if draft.dt_id else "",
            source_url=draft.datatracker_url,
            full_text=draft.full_text,
            time=draft.time or "",
            # NOTE(review): every search result is labeled "active" —
            # presumably search_drafts only returns live drafts; confirm.
            doc_status="active",
            extra=metadata,
        )

    @staticmethod
    def _doc_to_draft(doc: SourceDocument) -> Draft:
        """Reconstruct a Draft from a SourceDocument (inverse of _draft_to_doc)."""
        meta = doc.extra or {}
        return Draft(
            name=doc.name,
            rev=meta.get("rev", "00"),
            title=doc.title,
            abstract=doc.abstract,
            time=doc.time,
            dt_id=int(doc.source_id) if doc.source_id else None,
            pages=meta.get("pages"),
            words=meta.get("words"),
            group=meta.get("group"),
            group_uri=meta.get("group_uri"),
            expires=meta.get("expires"),
            ad=meta.get("ad"),
            shepherd=meta.get("shepherd"),
            states=meta.get("states", []),
            full_text=doc.full_text,
            fetched_at=meta.get("fetched_at"),
            source="ietf",
            source_id=doc.source_id,
            source_url=doc.source_url,
        )

View File

@@ -0,0 +1,187 @@
"""Fetch specs from W3C public API."""
from __future__ import annotations
import re
import time as time_mod
import httpx
from rich.console import Console
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
SpinnerColumn,
TextColumn,
)
from ..config import Config
from .base import SourceDocument
W3C_API = "https://api.w3.org"
console = Console()
def _strip_html(html: str) -> str:
"""Minimal HTML tag stripper — no heavy dependencies."""
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"&nbsp;", " ", text)
text = re.sub(r"&amp;", "&", text)
text = re.sub(r"&lt;", "<", text)
text = re.sub(r"&gt;", ">", text)
text = re.sub(r"&#\d+;", "", text)
text = re.sub(r"\s+", " ", text)
return text.strip()
class W3CFetcher:
    """Fetch specs from the W3C public API (no auth needed).

    The API exposes no keyword search, so each configured group's full
    specification list is fetched and filtered client-side.
    """

    def __init__(self, config: Config | None = None):
        self.config = config or Config.load()
        # follow_redirects: spec URLs (e.g. /TR/ shortlinks) often redirect.
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        # Group shortnames to scan, taken from project config.
        self.groups = self.config.w3c_groups

    def search(
        self, keywords: list[str], since: str | None = None
    ) -> list[SourceDocument]:
        """Fetch specs from AI-relevant W3C groups, filtered by keywords.

        Results are de-duplicated by spec name across groups; the first
        group to yield a spec wins.
        """
        seen: dict[str, SourceDocument] = {}
        kw_lower = [k.lower() for k in keywords]
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Fetching W3C specs...", total=len(self.groups))
            for group in self.groups:
                progress.update(task, description=f"W3C group: {group}")
                specs = self._fetch_group_specs(group)
                for spec in specs:
                    # Client-side keyword filter on title + description
                    haystack = (spec.title + " " + spec.abstract).lower()
                    if any(kw in haystack for kw in kw_lower):
                        # ISO date strings compare correctly as plain strings.
                        if since and spec.time and spec.time < since:
                            continue
                        if spec.name not in seen:
                            seen[spec.name] = spec
                progress.advance(task)
        console.print(f"Found [bold green]{len(seen)}[/] W3C specs matching keywords")
        return list(seen.values())

    def _fetch_group_specs(self, group_shortname: str) -> list[SourceDocument]:
        """Fetch all specifications for a W3C group.

        Handles both observed response shapes (bare list, or a dict with
        "_links.specifications" / "specifications") and page-count
        pagination. An HTTP error aborts this group only, returning
        whatever was collected so far.
        """
        url = f"{W3C_API}/groups/{group_shortname}/specifications"
        specs: list[SourceDocument] = []
        try:
            page = 1
            while True:
                resp = self.client.get(
                    url,
                    params={"format": "json", "page": page},
                    headers={"Accept": "application/json"},
                )
                resp.raise_for_status()
                data = resp.json()
                spec_list = data if isinstance(data, list) else data.get("_links", {}).get("specifications", [])
                if not spec_list:
                    # Try alternate response shape
                    spec_list = data.get("specifications", [])
                if not spec_list:
                    break
                for item in spec_list:
                    href = item.get("href", "")
                    shortname = item.get("shortname", "")
                    title = item.get("title", shortname)
                    if not shortname and href:
                        # Extract shortname from href like /specifications/webnn
                        parts = href.rstrip("/").split("/")
                        shortname = parts[-1] if parts else ""
                    if not shortname:
                        continue
                    # One extra API call per spec to get abstract/URL/status.
                    detail = self._fetch_spec_detail(shortname)
                    abstract = detail.get("description", title)
                    spec_url = detail.get("editor-draft", detail.get("url", f"https://www.w3.org/TR/{shortname}/"))
                    status = detail.get("status", "")
                    date = detail.get("date", "")
                    specs.append(
                        SourceDocument(
                            name=f"w3c-{shortname}",
                            title=title,
                            abstract=abstract,
                            source="w3c",
                            source_id=shortname,
                            source_url=spec_url,
                            time=date,
                            doc_status=status,
                            extra={"group": group_shortname},
                        )
                    )
                    time_mod.sleep(0.3)  # be polite to the public API
                # Check pagination
                pages = data.get("pages", 1) if isinstance(data, dict) else 1
                if page >= pages:
                    break
                page += 1
                time_mod.sleep(0.3)
        except httpx.HTTPError as e:
            console.print(f"[yellow]W3C API error for {group_shortname}: {e}[/]")
        return specs

    def _fetch_spec_detail(self, shortname: str) -> dict:
        """Fetch detail for a single spec; returns {} on any HTTP error."""
        try:
            resp = self.client.get(
                f"{W3C_API}/specifications/{shortname}",
                headers={"Accept": "application/json"},
            )
            resp.raise_for_status()
            data = resp.json()
            # Flatten the fields callers need from the HAL-style payload.
            return {
                "description": data.get("description", ""),
                "title": data.get("title", shortname),
                "editor-draft": data.get("editor-draft", ""),
                "url": data.get("_links", {}).get("latest-version", {}).get("href", ""),
                "status": data.get("_links", {}).get("latest-version", {}).get("status", ""),
                "date": data.get("_links", {}).get("latest-version", {}).get("date", ""),
            }
        except httpx.HTTPError:
            return {}

    def download_text(self, doc: SourceDocument) -> str | None:
        """Fetch spec URL content and strip HTML to plain text.

        Returns at most 50k characters; None when the spec has no URL or
        the request fails.
        """
        url = doc.source_url
        if not url:
            return None
        try:
            resp = self.client.get(url)
            resp.raise_for_status()
            content_type = resp.headers.get("content-type", "")
            if "html" in content_type:
                return _strip_html(resp.text)[:50000]
            return resp.text[:50000]
        except httpx.HTTPError as e:
            console.print(f"[dim]Could not download text for {doc.name}: {e}[/]")
            return None

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

View File

@@ -449,9 +449,10 @@ class Visualizer:
if len(G.nodes) == 0:
raise RuntimeError(f"No edges with min_shared={min_shared}.")
# Get affiliations for coloring
# Get affiliations for coloring (normalized)
from .orgs import normalize_org
top_authors = self.db.top_authors(limit=200)
author_aff = {name: aff for name, aff, _, _ in top_authors}
author_aff = {name: normalize_org(aff) for name, aff, _, _ in top_authors}
# Node sizing by degree
degrees = dict(G.degree())
@@ -650,7 +651,8 @@ class Visualizer:
"""
import plotly.express as px
orgs = self.db.top_orgs(limit=20)
from .orgs import top_orgs_normalized
orgs = top_orgs_normalized(self.db, limit=20)
if not orgs:
raise RuntimeError("No author data. Run `ietf authors --fetch` first.")