v0.3.0: Gap-to-Draft pipeline, Living Standards Observatory, blog series
Gap-to-Draft Pipeline (ietf pipeline): - Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision - Generator produces outlines + sections using rich context with Claude - Quality gates: novelty (embedding similarity), references, format, self-rating - Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE) - I-D formatter with proper headers, references, 72-char wrapping Living Standards Observatory (ietf observatory): - Source abstraction with IETF + W3C fetchers - 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record - Static GitHub Pages dashboard (explorer, gap tracker, timeline) - Weekly CI/CD automation via GitHub Actions Also includes: - 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps - Blog series (8 posts planned), reports, arXiv paper figures - Agent team infrastructure (CLAUDE.md, scripts, dev journal) - 5 new DB tables, schema migration, ~15 new query methods Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
259
src/ietf_analyzer/pipeline/context.py
Normal file
259
src/ietf_analyzer/pipeline/context.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""Context builder — assembles rich context for draft generation from DB queries."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from rich.console import Console
|
||||
|
||||
from ..config import Config
|
||||
from ..db import Database
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
||||
dot = np.dot(a, b)
|
||||
norm = np.linalg.norm(a) * np.linalg.norm(b)
|
||||
if norm == 0:
|
||||
return 0.0
|
||||
return float(dot / norm)
|
||||
|
||||
|
||||
class ContextBuilder:
    """Assembles the generation context for a gap topic from local data.

    Every piece of context comes from the project database (``Database``)
    or from on-disk report files; the single exception is an optional
    Ollama embedding call in ``_similar_drafts``. No Claude/LLM calls
    happen here — the module docstring's "zero Claude calls" contract.
    """

    def __init__(self, config: Config, db: Database):
        """Keep references to the config and an already-open database handle."""
        self.config = config
        self.db = db

    def build_context(self, gap_topic: str) -> dict:
        """Assemble full context for a gap topic. All DB queries, zero Claude calls."""
        gap = self._find_gap(gap_topic)
        if not gap:
            console.print(f"[yellow]No gap found matching '{gap_topic}', using topic as-is[/]")
            # Synthesize a minimal gap record so the helpers below can run
            # even when the topic has no match in the gaps table.
            gap = {
                "id": 0,
                "topic": gap_topic,
                "description": gap_topic,
                "category": "",
                "evidence": "",
                "severity": "medium",
            }

        # Each helper is independent; order here does not matter.
        ideas = self._convergent_ideas(gap)
        rfcs = self._rfc_foundations(gap.get("category", ""))
        similar = self._similar_drafts(gap["description"])
        top_rated = self._top_rated_in_category(gap.get("category", ""))
        wg_context = self._wg_context()
        ecosystem = self._ecosystem_vision()
        siblings = self._sibling_context(gap_topic)

        return {
            "gap": gap,
            "convergent_ideas": ideas,
            "rfc_foundations": rfcs,
            "similar_drafts": similar,
            "top_rated": top_rated,
            "wg_context": wg_context,
            "ecosystem_vision": ecosystem,
            "sibling_context": siblings,
        }

    def _find_gap(self, topic: str) -> dict | None:
        """Find a gap by topic string (fuzzy match).

        Three passes, strictest first: exact topic equality, then substring
        containment (in topic or description), then word-overlap scoring.
        Returns None when even the overlap pass scores below 2 shared words.
        """
        gaps = self.db.all_gaps()
        topic_lower = topic.lower()
        # Exact match first
        for g in gaps:
            if g["topic"].lower() == topic_lower:
                return g
        # Substring match
        for g in gaps:
            if topic_lower in g["topic"].lower() or topic_lower in g["description"].lower():
                return g
        # Word overlap match
        topic_words = set(topic_lower.split())
        best = None
        best_score = 0
        for g in gaps:
            gap_words = set(g["topic"].lower().split()) | set(g["description"].lower().split())
            overlap = len(topic_words & gap_words)
            if overlap > best_score:
                best_score = overlap
                best = g
        # Require at least 2 shared words so a single common word can't match.
        return best if best_score >= 2 else None

    def _convergent_ideas(self, gap: dict, limit: int = 20) -> list[dict]:
        """Find ideas that converge on this gap topic via keyword matching.

        Scores each idea by the number of (non-stopword) words it shares
        with the gap's topic + description, and returns the top ``limit``
        ideas sorted by that overlap, highest first.
        """
        all_ideas = self.db.all_ideas()
        if not all_ideas:
            return []

        # Build search terms from gap topic + description
        search_text = (gap["topic"] + " " + gap["description"]).lower()
        search_words = set(search_text.split())
        # Remove common words
        stop_words = {"the", "a", "an", "and", "or", "in", "of", "for", "to", "is",
                      "are", "that", "this", "with", "not", "by", "on", "at", "from",
                      "as", "be", "it", "no", "but", "has", "have", "do", "does"}
        search_words -= stop_words

        scored = []
        for idea in all_ideas:
            idea_text = (idea["title"] + " " + idea["description"]).lower()
            idea_words = set(idea_text.split())
            overlap = len(search_words & idea_words)
            # A single shared keyword is enough to be a candidate.
            if overlap >= 1:
                scored.append((overlap, idea))

        scored.sort(key=lambda x: x[0], reverse=True)
        return [item for _, item in scored[:limit]]

    def _rfc_foundations(self, category: str, limit: int = 10) -> list[tuple[str, int]]:
        """Get most-referenced RFCs, optionally filtered by category.

        Returns (rfc_id, count) pairs. With a category, the count becomes
        the number of *category* drafts referencing the RFC; without one
        (or when no drafts match the category) it is the global count.
        """
        # Fetch 2x the limit so category filtering still has headroom.
        top_refs = self.db.top_referenced(ref_type="rfc", limit=limit * 2)
        if not category:
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]

        # Filter to RFCs referenced by drafts in this category
        category_lower = category.lower()
        pairs = self.db.drafts_with_ratings(limit=500)
        category_drafts = set()
        for draft, rating in pairs:
            for cat in rating.categories:
                # Substring match so e.g. "security" matches "security/privacy".
                if category_lower in cat.lower():
                    category_drafts.add(draft.name)

        if not category_drafts:
            # No drafts in this category: fall back to the unfiltered list.
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]

        filtered = []
        for ref_id, count, draft_names in top_refs:
            cat_count = sum(1 for d in draft_names if d in category_drafts)
            if cat_count > 0:
                filtered.append((ref_id, cat_count))

        # Re-rank by the category-restricted reference count.
        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered[:limit]

    def _similar_drafts(self, gap_desc: str, limit: int = 8) -> list[tuple[str, float]]:
        """Find semantically similar existing drafts via embeddings.

        Embeds the gap description with Ollama and ranks stored draft
        embeddings by cosine similarity. Best-effort: returns [] when
        there are no stored embeddings or the Ollama call fails.
        """
        all_embeddings = self.db.all_embeddings()
        if not all_embeddings:
            return []

        # Try to embed the gap description via Ollama
        try:
            # Local import keeps ollama an optional dependency of this path.
            import ollama as ollama_lib

            client = ollama_lib.Client(host=self.config.ollama_url)
            resp = client.embed(
                model=self.config.ollama_embed_model,
                # Truncate to keep the request within model input limits.
                input=gap_desc[:8000],
            )
            # NOTE(review): assumes resp["embeddings"] is a list with one
            # vector per input — confirm against the Ollama client API.
            gap_vec = np.array(resp["embeddings"][0], dtype=np.float32)
        except Exception as e:
            # Deliberate broad catch: similarity is optional context, so any
            # network/model failure degrades to "no similar drafts".
            console.print(f"[yellow]Ollama embedding failed, skipping similarity: {e}[/]")
            return []

        similarities = []
        for name, vec in all_embeddings.items():
            sim = _cosine_similarity(gap_vec, vec)
            similarities.append((name, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:limit]

    def _top_rated_in_category(self, category: str, limit: int = 5) -> list[tuple]:
        """Get top-rated drafts in a category.

        Returns (name, title, composite_score) tuples. With no category the
        first ``limit`` pairs are returned in DB order — presumably
        ``drafts_with_ratings`` already sorts by rating; verify in db.py.
        """
        pairs = self.db.drafts_with_ratings(limit=500)
        if not category:
            return [
                (draft.name, draft.title, rating.composite_score)
                for draft, rating in pairs[:limit]
            ]

        category_lower = category.lower()
        matching = []
        for draft, rating in pairs:
            for cat in rating.categories:
                if category_lower in cat.lower():
                    matching.append((draft.name, draft.title, rating.composite_score))
                    # One match per draft is enough; avoid duplicates.
                    break

        return matching[:limit]

    def _wg_context(self) -> str:
        """Summarize WG adoption status.

        Returns a one-line summary: total drafts, how many are WG-adopted,
        and the top five WGs by number of adopted drafts. Note wg_counts is
        only incremented for adopted drafts (inside the if branch).
        """
        adoption = self.db.draft_adoption_status()
        wg_counts: dict[str, int] = {}
        adopted_count = 0
        for d in adoption:
            if d["wg_adopted"]:
                adopted_count += 1
                wg = d["wg_name"]
                wg_counts[wg] = wg_counts.get(wg, 0) + 1

        total = len(adoption)
        if not wg_counts:
            return f"{total} drafts, none WG-adopted yet."

        top_wgs = sorted(wg_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        wg_lines = ", ".join(f"{wg} ({n})" for wg, n in top_wgs)
        return f"{total} drafts, {adopted_count} WG-adopted. Top WGs: {wg_lines}"

    def _ecosystem_vision(self) -> str:
        """Load ecosystem vision document if it exists.

        Prefers the "One-Page Pitch" section, falls back to the "Vision
        Summary" section, then to the document head; always capped at
        2000 characters to keep prompt size bounded.
        """
        vision_path = Path(self.config.data_dir) / "reports" / "holistic-agent-ecosystem-draft-outlines.md"
        if not vision_path.exists():
            return "(No ecosystem vision document found)"

        # NOTE(review): read_text() uses the locale default encoding;
        # consider encoding="utf-8" for a markdown report file.
        text = vision_path.read_text()
        # Return the pitch section (compact) rather than the full document
        if "## 8. One-Page Pitch" in text:
            pitch = text.split("## 8. One-Page Pitch")[1].strip()
            return pitch[:2000]
        # Fallback: return the vision summary
        if "## 1. Vision Summary" in text:
            parts = text.split("## 1. Vision Summary")[1]
            if "## 2." in parts:
                parts = parts.split("## 2.")[0]
            return parts.strip()[:2000]
        return text[:2000]

    def _sibling_context(self, gap_topic: str) -> list[dict]:
        """Get outlines of sibling drafts from the same family.

        Locates the family whose recorded gap_topic contains ``gap_topic``
        (substring match), then returns role/title/abstract/outline dicts
        for every family member except the draft for this topic itself.
        """
        # Check all family drafts
        families = self.db.get_generated_drafts()
        if not families:
            return []

        # Find which family this gap_topic belongs to
        topic_lower = gap_topic.lower()
        family_name = ""
        for gd in families:
            if topic_lower in gd.get("gap_topic", "").lower():
                family_name = gd.get("family_name", "")
                break

        if not family_name:
            return []

        siblings = self.db.get_family_drafts(family_name)
        result = []
        for s in siblings:
            if s.get("gap_topic", "").lower() == topic_lower:
                continue  # Skip self
            outline = {}
            if s.get("outline_json"):
                try:
                    # outline_json may be stored as a JSON string or already
                    # decoded; handle both, and tolerate malformed JSON.
                    outline = json.loads(s["outline_json"]) if isinstance(s["outline_json"], str) else s["outline_json"]
                except (json.JSONDecodeError, TypeError):
                    pass
            result.append({
                "role": s.get("family_role", ""),
                "title": s.get("title", ""),
                "abstract": s.get("abstract", ""),
                "outline": outline,
            })
        return result
|
||||
Reference in New Issue
Block a user