Platform upgrade: semantic search, citations, readiness, tests, Docker

Major features added by 5 parallel agent teams:
- Semantic "Ask" (NL queries via FTS5 + embeddings + Claude synthesis)
- Global search across drafts, ideas, authors, gaps
- REST API expansion (14 endpoints, up from 3) with CSV/JSON export
- Citation graph visualization (D3.js, 440 nodes, 2422 edges)
- Standards readiness scoring (0-100 composite from 6 factors)
- Side-by-side draft comparison view with shared/unique analysis
- Annotation system (notes + tags per draft, DB-persisted)
- Docker deployment (Dockerfile + docker-compose with Ollama)
- Scheduled updates (cron script with log rotation)
- Pipeline health dashboard (stage progress bars, cost tracking)
- Test suite foundation (54 pytest tests covering DB, models, web data)

Fixes: compare_drafts() stubbed→working, get_authors_for_draft() bug,
source-aware analysis prompts, config env var overrides + validation,
resilient batch error handling with --retry-failed, observatory --dry-run

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-07 20:52:56 +01:00
parent da2a989744
commit 757b781c67
33 changed files with 4253 additions and 170 deletions

View File

@@ -66,6 +66,7 @@ def get_drafts_page(
min_score: float = 0.0,
sort: str = "score",
sort_dir: str = "desc",
source: str = "",
) -> dict:
"""Return a paginated, filtered list of drafts with ratings.
@@ -80,6 +81,8 @@ def get_drafts_page(
continue
if category and category not in rating.categories:
continue
if source and draft.source != source:
continue
if search:
haystack = f"{draft.name} {draft.title} {rating.summary}".lower()
if not all(w in haystack for w in search.lower().split()):
@@ -96,6 +99,9 @@ def get_drafts_page(
"relevance": lambda p: p[1].relevance,
"overlap": lambda p: p[1].overlap,
"momentum": lambda p: p[1].momentum,
"readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 +
min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 +
((p[1].momentum - 1) / 4.0) * 0.15,
}
key_fn = sort_keys.get(sort, sort_keys["score"])
reverse = sort_dir == "desc"
@@ -107,15 +113,23 @@ def get_drafts_page(
start = (page - 1) * per_page
page_items = filtered[start : start + per_page]
# Pre-compute readiness for page items (lightweight version)
from ietf_analyzer.readiness import compute_readiness
readiness_cache = {}
for draft, rating in page_items:
readiness_cache[draft.name] = compute_readiness(db, draft.name)
drafts = []
for draft, rating in page_items:
r_score = readiness_cache.get(draft.name, {}).get("score", 0)
drafts.append({
"name": draft.name,
"title": draft.title,
"date": draft.date,
"url": draft.datatracker_url,
"url": draft.source_url if draft.source != "ietf" else draft.datatracker_url,
"pages": draft.pages or 0,
"group": draft.group or "individual",
"source": draft.source or "ietf",
"score": round(rating.composite_score, 2),
"novelty": rating.novelty,
"maturity": rating.maturity,
@@ -124,6 +138,7 @@ def get_drafts_page(
"relevance": rating.relevance,
"categories": rating.categories,
"summary": rating.summary,
"readiness": r_score,
})
return {
@@ -185,6 +200,14 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
"categories": rating.categories,
}
# Readiness score
from ietf_analyzer.readiness import compute_readiness
result["readiness"] = compute_readiness(db, name)
# Annotation
annotation = db.get_annotation(name)
result["annotation"] = annotation
return result
@@ -253,8 +276,11 @@ def get_ideas_by_type(db: Database) -> dict:
def get_all_gaps(db: Database) -> list[dict]:
    """Return all gap analysis results, sorted by severity (critical first).

    Gaps with no "severity" key default to "low"; unrecognized severity
    strings sort after all known ones.
    """
    # Lower rank sorts first; unknown severities get 99 so they land last.
    _sev_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    gaps = db.all_gaps()
    gaps.sort(key=lambda g: _sev_order.get(g.get("severity", "low"), 99))
    return gaps
def get_gap_detail(db: Database, gap_id: int) -> dict | None:
def get_monitor_status(db: Database) -> dict:
    """Return monitoring status data for the dashboard.

    Returns a dict with keys:
      last_run     -- most recent monitor run record, or None
      runs         -- up to 20 most recent run records
      unprocessed  -- counts of drafts still awaiting each pipeline stage
      total_runs   -- number of run records returned
      pipeline     -- overall stage-completion counts
      cost         -- token usage and estimated spend in USD
    """
    history = db.get_monitor_runs(limit=20)

    draft_total = db.count_drafts()
    # NOTE(review): the per-stage queries are capped (limit=9999 / 10000),
    # so counts saturate on very large databases — confirm acceptable.
    pending_ratings = len(db.unrated_drafts(limit=9999))
    pending_embeddings = len(db.drafts_without_embeddings(limit=9999))
    pending_ideas = len(db.drafts_without_ideas(limit=9999))

    tokens_in, tokens_out = db.total_tokens_used()
    # Estimate cost (Sonnet pricing: $3/M input, $15/M output).
    spend = (tokens_in * 3.0 / 1_000_000) + (tokens_out * 15.0 / 1_000_000)

    return {
        "last_run": history[0] if history else None,
        "runs": history,
        "unprocessed": {
            "unrated": pending_ratings,
            "unembedded": pending_embeddings,
            "no_ideas": pending_ideas,
        },
        "total_runs": len(history),
        "pipeline": {
            "total_drafts": draft_total,
            "rated": len(db.drafts_with_ratings(limit=10000)),
            "embedded": draft_total - pending_embeddings,
            "with_ideas": draft_total - pending_ideas,
            "idea_total": db.idea_count(),
            "gap_count": len(db.all_gaps()),
        },
        "cost": {
            "input_tokens": tokens_in,
            "output_tokens": tokens_out,
            "estimated_usd": round(spend, 2),
        },
    }
def _collect_graph_edges(edges_raw, keep_refs):
    """Select edges whose target ref is in *keep_refs*.

    Returns (set of node ids touched by a kept edge, list of D3 edge dicts).
    """
    node_set: set[str] = set()
    edges: list[dict] = []
    for draft_name, ref_key in edges_raw:
        if ref_key in keep_refs:
            node_set.add(draft_name)
            node_set.add(ref_key)
            edges.append({"source": draft_name, "target": ref_key})
    return node_set, edges


def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
    """Return citation network data for force-directed graph.

    Returns {nodes: [{id, type, title, influence, ...}],
             edges: [{source, target}],
             stats: {node_count, edge_count, rfc_count, draft_count}}
    """
    # All (draft -> reference) rows.
    rows = db.conn.execute(
        "SELECT draft_name, ref_type, ref_id FROM draft_refs"
    ).fetchall()

    # In-degree per referenced item; raw edge list keyed as "type:id".
    in_degree: Counter = Counter()
    edges_raw: list[tuple[str, str]] = []
    for r in rows:
        ref_key = f"{r['ref_type']}:{r['ref_id']}"
        in_degree[ref_key] += 1
        edges_raw.append((r["draft_name"], ref_key))

    # Out-degree per draft (how many references it makes).
    draft_out: Counter = Counter(name for name, _ in edges_raw)

    # Draft titles for labeling.
    draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall()
    draft_titles = {r["name"]: r["title"] for r in draft_rows}

    # First rating category for draft coloring; "Other" on bad/missing JSON.
    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
    draft_cats = {}
    for r in rating_rows:
        try:
            cats = json.loads(r["categories"]) if r["categories"] else []
            draft_cats[r["draft_name"]] = cats[0] if cats else "Other"
        except Exception:
            draft_cats[r["draft_name"]] = "Other"

    # Keep refs cited at least min_refs times, plus every draft citing them.
    top_refs = {k: v for k, v in in_degree.items() if v >= min_refs}
    node_set, filtered_edges = _collect_graph_edges(edges_raw, top_refs)

    # Cap for readability: past 250 nodes, keep only the 80 most-cited refs
    # and rebuild the selection.
    if len(node_set) > 250:
        sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True)
        keep_refs = {k for k, _ in sorted_refs[:80]}
        node_set, filtered_edges = _collect_graph_edges(edges_raw, keep_refs)

    # Build node records.
    nodes = []
    for nid in node_set:
        if ":" in nid and not nid.startswith("draft-"):
            # Reference node (rfc:1234, bcp:14, ...): influence = in-degree.
            ref_type, ref_id = nid.split(":", 1)
            if ref_type == "rfc":
                try:
                    # int() normalizes zero-padded RFC numbers.
                    title = f"RFC {int(ref_id)}"
                except ValueError:
                    title = f"RFC {ref_id}"
            else:
                title = f"{ref_type.upper()} {ref_id}"
            nodes.append({
                "id": nid,
                "type": ref_type,
                "title": title,
                "influence": in_degree.get(nid, 0),
                "ref_id": ref_id,
            })
        else:
            # Draft node: influence = out-degree (references it makes).
            nodes.append({
                "id": nid,
                "type": "draft",
                "title": draft_titles.get(nid, nid),
                "influence": draft_out.get(nid, 0),
                "category": draft_cats.get(nid, "Other"),
            })

    rfc_count = sum(1 for n in nodes if n["type"] == "rfc")
    draft_count = sum(1 for n in nodes if n["type"] == "draft")
    return {
        "nodes": nodes,
        "edges": filtered_edges,
        "stats": {
            "node_count": len(nodes),
            "edge_count": len(filtered_edges),
            "rfc_count": rfc_count,
            "draft_count": draft_count,
        },
    }
def _draft_search_hit(row) -> dict:
    """Format one drafts row (name, title, abstract, time, group) as a hit."""
    return {
        "name": row["name"],
        "title": row["title"],
        "abstract": (row["abstract"] or "")[:200],
        "date": row["time"],
        "group": row["group"] or "individual",
    }


def global_search(db: Database, query: str) -> dict:
    """Search across drafts (FTS5), ideas, authors, and gaps.

    Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
    Blank/whitespace queries return empty lists without touching the DB.
    """
    results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []}
    if not query or not query.strip():
        return results
    q = query.strip()
    like = f"%{q}%"
    # 1. Drafts via FTS5; fall back to LIKE when MATCH rejects the query.
    try:
        # Quote each word so FTS operators in user input are taken literally.
        fts_query = " ".join(f'"{w}"' for w in q.split() if w)
        rows = db.conn.execute(
            """SELECT d.name, d.title, d.abstract, d.time, d."group"
               FROM drafts d
               JOIN drafts_fts f ON d.rowid = f.rowid
               WHERE drafts_fts MATCH ?
               ORDER BY rank
               LIMIT 50""",
            (fts_query,),
        ).fetchall()
    except Exception:
        # FTS5 match can fail on certain query syntax; fall back to LIKE
        rows = db.conn.execute(
            """SELECT name, title, abstract, time, "group" FROM drafts
               WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ?
               LIMIT 50""",
            (like, like, like),
        ).fetchall()
    results["drafts"] = [_draft_search_hit(r) for r in rows]
    # 2. Ideas via LIKE
    rows = db.conn.execute(
        """SELECT id, title, description, idea_type, draft_name FROM ideas
           WHERE title LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    results["ideas"] = [
        {
            "id": r["id"],
            "title": r["title"],
            "description": (r["description"] or "")[:200],
            "type": r["idea_type"],
            "draft_name": r["draft_name"],
        }
        for r in rows
    ]
    # 3. Authors via LIKE
    rows = db.conn.execute(
        """SELECT person_id, name, affiliation FROM authors
           WHERE name LIKE ? OR affiliation LIKE ?
           ORDER BY name LIMIT 50""",
        (like, like),
    ).fetchall()
    results["authors"] = [
        {
            "person_id": r["person_id"],
            "name": r["name"],
            "affiliation": r["affiliation"] or "",
        }
        for r in rows
    ]
    # 4. Gaps via LIKE
    rows = db.conn.execute(
        """SELECT id, topic, description, category, severity FROM gaps
           WHERE topic LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    results["gaps"] = [
        {
            "id": r["id"],
            "topic": r["topic"],
            "description": (r["description"] or "")[:200],
            "category": r["category"],
            "severity": r["severity"],
        }
        for r in rows
    ]
    return results
def get_landscape_tsne(db: Database) -> list[dict]:
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].
@@ -829,3 +1090,116 @@ def get_landscape_tsne(db: Database) -> list[dict]:
"score": round(r.composite_score, 2),
})
return result
def _membership_map(items_by_name: dict, key_fn) -> dict:
    """Map key_fn(item) -> list of draft names whose item list contains it.

    A draft name may appear more than once when it holds duplicate keys;
    callers de-duplicate with set() when counting distinct drafts.
    """
    owners: dict = {}
    for name, items in items_by_name.items():
        for item in items:
            owners.setdefault(key_fn(item), []).append(name)
    return owners


def get_comparison_data(db: Database, names: list[str]) -> dict | None:
    """Get comparison data for a list of drafts.

    Returns None unless at least two of *names* resolve via get_draft_detail.
    Returns {
        drafts: [{name, title, abstract, rating, ideas, refs, ...}],
        shared_ideas: [{title, drafts: [name,...]}],
        unique_ideas: {name: [{title, description}]},
        shared_refs: [{type, id, drafts: [name,...]}],
        unique_refs: {name: [{type, id}]},
        similarities: [{a, b, similarity}],
        comparison_text: str | None,
    }
    """
    import numpy as np

    drafts_data = []
    all_ideas: dict[str, list[dict]] = {}
    all_refs: dict[str, list[tuple[str, str]]] = {}
    for name in names:
        detail = get_draft_detail(db, name)
        if not detail:
            continue
        drafts_data.append(detail)
        all_ideas[name] = detail.get("ideas", [])
        all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])]
    if len(drafts_data) < 2:
        return None

    # Shared vs unique ideas, grouped by exact normalized (lowercased,
    # stripped) title match.
    idea_owners = _membership_map(all_ideas, lambda i: i["title"].lower().strip())
    shared_ideas = [
        {"title": title, "drafts": draft_list}
        for title, draft_list in idea_owners.items()
        if len(set(draft_list)) > 1
    ]
    unique_ideas: dict[str, list[dict]] = {}
    for name, ideas in all_ideas.items():
        unique_ideas[name] = [
            {"title": idea["title"], "description": idea.get("description", "")}
            for idea in ideas
            if len(set(idea_owners.get(idea["title"].lower().strip(), []))) <= 1
        ]

    # Shared vs unique references, grouped by (type, id) pair.
    ref_owners = _membership_map(all_refs, lambda ref: ref)
    shared_refs = [
        {"type": ref[0], "id": ref[1], "drafts": draft_list}
        for ref, draft_list in ref_owners.items()
        if len(set(draft_list)) > 1
    ]
    unique_refs: dict[str, list[dict]] = {}
    for name, refs in all_refs.items():
        unique_refs[name] = [
            {"type": ref[0], "id": ref[1]}
            for ref in refs
            if len(set(ref_owners.get(ref, []))) <= 1
        ]

    # Pairwise cosine similarity between draft embeddings; pairs missing an
    # embedding are skipped.
    embeddings = db.all_embeddings()
    similarities = []
    valid_names = [d["name"] for d in drafts_data]
    for i in range(len(valid_names)):
        for j in range(i + 1, len(valid_names)):
            a, b = valid_names[i], valid_names[j]
            if a in embeddings and b in embeddings:
                vec_a, vec_b = embeddings[a], embeddings[b]
                norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
                sim = float(np.dot(vec_a, vec_b) / norm) if norm > 0 else 0.0
                similarities.append({"a": a, "b": b, "similarity": round(sim, 4)})

    return {
        "drafts": drafts_data,
        "shared_ideas": shared_ideas,
        "unique_ideas": unique_ideas,
        "shared_refs": shared_refs,
        "unique_refs": unique_refs,
        "similarities": similarities,
        "comparison_text": None,
    }
def get_ask_data(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Answer a natural-language question via hybrid search + Claude synthesis.

    Returns {answer: str, sources: [{name, title, similarity, excerpt}]}.
    """
    # Imported lazily so the web layer doesn't pay for these at module load.
    from ietf_analyzer.config import Config
    from ietf_analyzer.search import HybridSearch

    cfg = Config.load()
    return HybridSearch(cfg, db).ask(question, top_k=top_k, cheap=cheap)