Gap-to-Draft Pipeline (ietf pipeline):
- Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision
- Generator produces outlines + sections using rich context with Claude
- Quality gates: novelty (embedding similarity), references, format, self-rating
- Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE)
- I-D formatter with proper headers, references, 72-char wrapping

Living Standards Observatory (ietf observatory):
- Source abstraction with IETF + W3C fetchers
- 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record
- Static GitHub Pages dashboard (explorer, gap tracker, timeline)
- Weekly CI/CD automation via GitHub Actions

Also includes:
- 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps
- Blog series (8 posts planned), reports, arXiv paper figures
- Agent team infrastructure (CLAUDE.md, scripts, dev journal)
- 5 new DB tables, schema migration, ~15 new query methods

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
260 lines
9.5 KiB
Python
260 lines
9.5 KiB
Python
"""Context builder — assembles rich context for draft generation from DB queries."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
from rich.console import Console
|
|
|
|
from ..config import Config
|
|
from ..db import Database
|
|
|
|
console = Console()
|
|
|
|
|
|
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
|
dot = np.dot(a, b)
|
|
norm = np.linalg.norm(a) * np.linalg.norm(b)
|
|
if norm == 0:
|
|
return 0.0
|
|
return float(dot / norm)
|
|
|
|
|
|
class ContextBuilder:
    """Assembles rich generation context for a gap topic from DB queries only.

    Every method here is read-only with respect to the database and makes
    zero Claude calls. The single optional network dependency is an Ollama
    embedding request in ``_similar_drafts``, which degrades gracefully to an
    empty result on any failure.
    """

    def __init__(self, config: Config, db: Database):
        self.config = config
        self.db = db

    def build_context(self, gap_topic: str) -> dict:
        """Assemble full context for a gap topic. All DB queries, zero Claude calls.

        Returns a dict with keys: ``gap``, ``convergent_ideas``,
        ``rfc_foundations``, ``similar_drafts``, ``top_rated``,
        ``wg_context``, ``ecosystem_vision``, ``sibling_context``.
        If no stored gap matches ``gap_topic``, a synthetic gap record is
        built from the raw topic string so generation can still proceed.
        """
        gap = self._find_gap(gap_topic)
        if not gap:
            console.print(f"[yellow]No gap found matching '{gap_topic}', using topic as-is[/]")
            # Synthetic fallback gap carrying every key downstream code reads.
            gap = {
                "id": 0,
                "topic": gap_topic,
                "description": gap_topic,
                "category": "",
                "evidence": "",
                "severity": "medium",
            }

        ideas = self._convergent_ideas(gap)
        rfcs = self._rfc_foundations(gap.get("category", ""))
        similar = self._similar_drafts(gap["description"])
        top_rated = self._top_rated_in_category(gap.get("category", ""))
        wg_context = self._wg_context()
        ecosystem = self._ecosystem_vision()
        siblings = self._sibling_context(gap_topic)

        return {
            "gap": gap,
            "convergent_ideas": ideas,
            "rfc_foundations": rfcs,
            "similar_drafts": similar,
            "top_rated": top_rated,
            "wg_context": wg_context,
            "ecosystem_vision": ecosystem,
            "sibling_context": siblings,
        }

    def _find_gap(self, topic: str) -> dict | None:
        """Find a gap by topic string (fuzzy match).

        Three passes of decreasing strictness: exact case-insensitive topic
        match, substring match against topic/description, then word-overlap
        scoring. The overlap pass requires at least 2 shared words so a
        single common word cannot produce a spurious match.
        """
        gaps = self.db.all_gaps()
        topic_lower = topic.lower()
        # Exact match first
        for g in gaps:
            if g["topic"].lower() == topic_lower:
                return g
        # Substring match
        for g in gaps:
            if topic_lower in g["topic"].lower() or topic_lower in g["description"].lower():
                return g
        # Word overlap match
        topic_words = set(topic_lower.split())
        best = None
        best_score = 0
        for g in gaps:
            gap_words = set(g["topic"].lower().split()) | set(g["description"].lower().split())
            overlap = len(topic_words & gap_words)
            if overlap > best_score:
                best_score = overlap
                best = g
        return best if best_score >= 2 else None

    def _convergent_ideas(self, gap: dict, limit: int = 20) -> list[dict]:
        """Find ideas that converge on this gap topic via keyword matching.

        Scores each idea by the number of non-stop-words shared with the
        gap's topic + description and returns the top ``limit`` ideas,
        highest overlap first.
        """
        all_ideas = self.db.all_ideas()
        if not all_ideas:
            return []

        # Build search terms from gap topic + description
        search_text = (gap["topic"] + " " + gap["description"]).lower()
        search_words = set(search_text.split())
        # Remove common words so they don't inflate overlap scores.
        stop_words = {"the", "a", "an", "and", "or", "in", "of", "for", "to", "is",
                      "are", "that", "this", "with", "not", "by", "on", "at", "from",
                      "as", "be", "it", "no", "but", "has", "have", "do", "does"}
        search_words -= stop_words

        scored = []
        for idea in all_ideas:
            # NOTE(review): assumes idea["title"] and idea["description"] are
            # non-None strings — confirm against the ideas table schema.
            idea_text = (idea["title"] + " " + idea["description"]).lower()
            idea_words = set(idea_text.split())
            overlap = len(search_words & idea_words)
            if overlap >= 1:
                scored.append((overlap, idea))

        # key= keeps the sort from ever comparing the idea dicts themselves.
        scored.sort(key=lambda x: x[0], reverse=True)
        return [item for _, item in scored[:limit]]

    def _rfc_foundations(self, category: str, limit: int = 10) -> list[tuple[str, int]]:
        """Get most-referenced RFCs, optionally filtered by category.

        Without a category, returns the globally most-referenced RFCs as
        (rfc_id, citation_count). With a category, each RFC is re-counted by
        how many category-matching drafts cite it; falls back to the global
        list when the category matches no rated drafts.
        """
        # Over-fetch (2x) so category filtering still leaves enough rows.
        top_refs = self.db.top_referenced(ref_type="rfc", limit=limit * 2)
        if not category:
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]

        # Filter to RFCs referenced by drafts in this category
        category_lower = category.lower()
        pairs = self.db.drafts_with_ratings(limit=500)
        category_drafts = set()
        for draft, rating in pairs:
            for cat in rating.categories:
                if category_lower in cat.lower():
                    category_drafts.add(draft.name)

        if not category_drafts:
            return [(ref_id, count) for ref_id, count, _ in top_refs[:limit]]

        filtered = []
        for ref_id, count, draft_names in top_refs:
            # Re-score by in-category citations only.
            cat_count = sum(1 for d in draft_names if d in category_drafts)
            if cat_count > 0:
                filtered.append((ref_id, cat_count))

        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered[:limit]

    def _similar_drafts(self, gap_desc: str, limit: int = 8) -> list[tuple[str, float]]:
        """Find semantically similar existing drafts via embeddings.

        Embeds ``gap_desc`` with Ollama, then ranks all stored draft
        embeddings by cosine similarity. Best-effort: returns [] when no
        embeddings are stored or the embedding call fails.
        """
        all_embeddings = self.db.all_embeddings()
        if not all_embeddings:
            return []

        # Try to embed the gap description via Ollama
        try:
            import ollama as ollama_lib

            client = ollama_lib.Client(host=self.config.ollama_url)
            resp = client.embed(
                model=self.config.ollama_embed_model,
                input=gap_desc[:8000],  # cap the request size sent to the model
            )
            gap_vec = np.array(resp["embeddings"][0], dtype=np.float32)
        except Exception as e:
            # Deliberate broad catch: similarity is optional context, so any
            # failure (missing package, server down, bad response shape) is
            # logged and treated as "no similar drafts".
            console.print(f"[yellow]Ollama embedding failed, skipping similarity: {e}[/]")
            return []

        similarities = []
        for name, vec in all_embeddings.items():
            sim = _cosine_similarity(gap_vec, vec)
            similarities.append((name, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:limit]

    def _top_rated_in_category(self, category: str, limit: int = 5) -> list[tuple]:
        """Get top-rated drafts in a category as (name, title, composite_score).

        NOTE(review): assumes ``drafts_with_ratings`` returns rows already
        ordered best-first — confirm in the Database query; otherwise the
        ``[:limit]`` slices below pick arbitrary rows.
        """
        pairs = self.db.drafts_with_ratings(limit=500)
        if not category:
            return [
                (draft.name, draft.title, rating.composite_score)
                for draft, rating in pairs[:limit]
            ]

        category_lower = category.lower()
        matching = []
        for draft, rating in pairs:
            for cat in rating.categories:
                if category_lower in cat.lower():
                    matching.append((draft.name, draft.title, rating.composite_score))
                    break  # one entry per draft even if several categories match

        return matching[:limit]

    def _wg_context(self) -> str:
        """Summarize WG adoption status as a one-line human-readable string."""
        adoption = self.db.draft_adoption_status()
        wg_counts: dict[str, int] = {}
        adopted_count = 0
        for d in adoption:
            if d["wg_adopted"]:
                adopted_count += 1
                # NOTE(review): wg_name could be None/empty for some adopted
                # rows; such rows get counted under that raw key — confirm schema.
                wg = d["wg_name"]
                wg_counts[wg] = wg_counts.get(wg, 0) + 1

        total = len(adoption)
        if not wg_counts:
            return f"{total} drafts, none WG-adopted yet."

        top_wgs = sorted(wg_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        wg_lines = ", ".join(f"{wg} ({n})" for wg, n in top_wgs)
        return f"{total} drafts, {adopted_count} WG-adopted. Top WGs: {wg_lines}"

    def _ecosystem_vision(self) -> str:
        """Load the ecosystem vision document if it exists (at most 2000 chars).

        Prefers the one-page pitch section, then the vision summary section,
        then the start of the raw document.
        """
        vision_path = Path(self.config.data_dir) / "reports" / "holistic-agent-ecosystem-draft-outlines.md"
        if not vision_path.exists():
            return "(No ecosystem vision document found)"

        # Explicit encoding: the default is platform-dependent (PEP 597) and
        # this markdown report is written as UTF-8.
        text = vision_path.read_text(encoding="utf-8")
        # Return the pitch section (compact) rather than the full document
        if "## 8. One-Page Pitch" in text:
            pitch = text.split("## 8. One-Page Pitch")[1].strip()
            return pitch[:2000]
        # Fallback: return the vision summary
        if "## 1. Vision Summary" in text:
            parts = text.split("## 1. Vision Summary")[1]
            if "## 2." in parts:
                parts = parts.split("## 2.")[0]
            return parts.strip()[:2000]
        return text[:2000]

    def _sibling_context(self, gap_topic: str) -> list[dict]:
        """Get outlines of sibling drafts from the same family.

        Locates the family whose recorded gap_topic contains ``gap_topic``
        (case-insensitive substring), then returns each *other* member's
        role, title, abstract, and parsed outline. Returns [] when the topic
        belongs to no known family.
        """
        # Check all family drafts
        families = self.db.get_generated_drafts()
        if not families:
            return []

        # Find which family this gap_topic belongs to
        topic_lower = gap_topic.lower()
        family_name = ""
        for gd in families:
            if topic_lower in gd.get("gap_topic", "").lower():
                family_name = gd.get("family_name", "")
                break

        if not family_name:
            return []

        siblings = self.db.get_family_drafts(family_name)
        result = []
        for s in siblings:
            if s.get("gap_topic", "").lower() == topic_lower:
                continue  # Skip self
            outline = {}
            if s.get("outline_json"):
                try:
                    # outline_json may arrive pre-deserialized from the DB layer.
                    outline = json.loads(s["outline_json"]) if isinstance(s["outline_json"], str) else s["outline_json"]
                except (json.JSONDecodeError, TypeError):
                    pass  # malformed outline: fall back to empty dict
            result.append({
                "role": s.get("family_role", ""),
                "title": s.get("title", ""),
                "abstract": s.get("abstract", ""),
                "outline": outline,
            })
        return result
|