Platform upgrade: semantic search, citations, readiness, tests, Docker
Major features added by 5 parallel agent teams: - Semantic "Ask" (NL queries via FTS5 + embeddings + Claude synthesis) - Global search across drafts, ideas, authors, gaps - REST API expansion (14 endpoints, up from 3) with CSV/JSON export - Citation graph visualization (D3.js, 440 nodes, 2422 edges) - Standards readiness scoring (0-100 composite from 6 factors) - Side-by-side draft comparison view with shared/unique analysis - Annotation system (notes + tags per draft, DB-persisted) - Docker deployment (Dockerfile + docker-compose with Ollama) - Scheduled updates (cron script with log rotation) - Pipeline health dashboard (stage progress bars, cost tracking) - Test suite foundation (54 pytest tests covering DB, models, web data) Fixes: compare_drafts() stubbed→working, get_authors_for_draft() bug, source-aware analysis prompts, config env var overrides + validation, resilient batch error handling with --retry-failed, observatory --dry-run Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -66,6 +66,7 @@ def get_drafts_page(
|
||||
min_score: float = 0.0,
|
||||
sort: str = "score",
|
||||
sort_dir: str = "desc",
|
||||
source: str = "",
|
||||
) -> dict:
|
||||
"""Return a paginated, filtered list of drafts with ratings.
|
||||
|
||||
@@ -80,6 +81,8 @@ def get_drafts_page(
|
||||
continue
|
||||
if category and category not in rating.categories:
|
||||
continue
|
||||
if source and draft.source != source:
|
||||
continue
|
||||
if search:
|
||||
haystack = f"{draft.name} {draft.title} {rating.summary}".lower()
|
||||
if not all(w in haystack for w in search.lower().split()):
|
||||
@@ -96,6 +99,9 @@ def get_drafts_page(
|
||||
"relevance": lambda p: p[1].relevance,
|
||||
"overlap": lambda p: p[1].overlap,
|
||||
"momentum": lambda p: p[1].momentum,
|
||||
"readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 +
|
||||
min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 +
|
||||
((p[1].momentum - 1) / 4.0) * 0.15,
|
||||
}
|
||||
key_fn = sort_keys.get(sort, sort_keys["score"])
|
||||
reverse = sort_dir == "desc"
|
||||
@@ -107,15 +113,23 @@ def get_drafts_page(
|
||||
start = (page - 1) * per_page
|
||||
page_items = filtered[start : start + per_page]
|
||||
|
||||
# Pre-compute readiness for page items (lightweight version)
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
readiness_cache = {}
|
||||
for draft, rating in page_items:
|
||||
readiness_cache[draft.name] = compute_readiness(db, draft.name)
|
||||
|
||||
drafts = []
|
||||
for draft, rating in page_items:
|
||||
r_score = readiness_cache.get(draft.name, {}).get("score", 0)
|
||||
drafts.append({
|
||||
"name": draft.name,
|
||||
"title": draft.title,
|
||||
"date": draft.date,
|
||||
"url": draft.datatracker_url,
|
||||
"url": draft.source_url if draft.source != "ietf" else draft.datatracker_url,
|
||||
"pages": draft.pages or 0,
|
||||
"group": draft.group or "individual",
|
||||
"source": draft.source or "ietf",
|
||||
"score": round(rating.composite_score, 2),
|
||||
"novelty": rating.novelty,
|
||||
"maturity": rating.maturity,
|
||||
@@ -124,6 +138,7 @@ def get_drafts_page(
|
||||
"relevance": rating.relevance,
|
||||
"categories": rating.categories,
|
||||
"summary": rating.summary,
|
||||
"readiness": r_score,
|
||||
})
|
||||
|
||||
return {
|
||||
@@ -185,6 +200,14 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
|
||||
"categories": rating.categories,
|
||||
}
|
||||
|
||||
# Readiness score
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
result["readiness"] = compute_readiness(db, name)
|
||||
|
||||
# Annotation
|
||||
annotation = db.get_annotation(name)
|
||||
result["annotation"] = annotation
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -253,8 +276,11 @@ def get_ideas_by_type(db: Database) -> dict:
|
||||
|
||||
|
||||
def get_all_gaps(db: Database) -> list[dict]:
    """Return all gap analysis results, sorted by severity (critical first).

    Args:
        db: Open Database handle; ``db.all_gaps()`` must return a list of
            gap dicts, each optionally carrying a ``"severity"`` key.

    Returns:
        The gap dicts ordered critical -> high -> medium -> low; a missing
        severity is treated as "low", and unknown severity strings sort last.
    """
    _sev_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    gaps = db.all_gaps()
    # Unknown severities map to 99 so they sort after every known level.
    gaps.sort(key=lambda g: _sev_order.get(g.get("severity", "low"), 99))
    return gaps
|
||||
|
||||
|
||||
def get_gap_detail(db: Database, gap_id: int) -> dict | None:
|
||||
def get_monitor_status(db: Database) -> dict:
    """Return monitoring status data for dashboard."""
    run_history = db.get_monitor_runs(limit=20)
    latest_run = run_history[0] if run_history else None

    # Pipeline stage counts: totals plus how many drafts still await each stage.
    draft_total = db.count_drafts()
    rated_total = len(db.drafts_with_ratings(limit=10000))
    pending_ratings = len(db.unrated_drafts(limit=9999))
    pending_embeddings = len(db.drafts_without_embeddings(limit=9999))
    pending_ideas = len(db.drafts_without_ideas(limit=9999))
    idea_total = db.idea_count()
    gap_total = len(db.all_gaps())

    # Token usage and estimated spend (Sonnet pricing: $3/M input, $15/M output).
    input_tok, output_tok = db.total_tokens_used()
    est_cost = (input_tok * 3.0 / 1_000_000) + (output_tok * 15.0 / 1_000_000)

    return {
        "last_run": latest_run,
        "runs": run_history,
        "unprocessed": {
            "unrated": pending_ratings,
            "unembedded": pending_embeddings,
            "no_ideas": pending_ideas,
        },
        "total_runs": len(run_history),
        "pipeline": {
            "total_drafts": draft_total,
            "rated": rated_total,
            "embedded": draft_total - pending_embeddings,
            "with_ideas": draft_total - pending_ideas,
            "idea_total": idea_total,
            "gap_count": gap_total,
        },
        "cost": {
            "input_tokens": input_tok,
            "output_tokens": output_tok,
            "estimated_usd": round(est_cost, 2),
        },
    }
|
||||
|
||||
|
||||
def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
    """Return citation network data for a force-directed graph.

    Args:
        db: Open Database handle (raw SQL via ``db.conn``).
        min_refs: Minimum in-degree for a referenced document (RFC/BCP/...)
            to be kept as a node.

    Returns:
        {nodes: [{id, type, title, influence, ...}],
         edges: [{source, target}],
         stats: {node_count, edge_count, rfc_count, draft_count}}
    """
    rows = db.conn.execute(
        "SELECT draft_name, ref_type, ref_id FROM draft_refs"
    ).fetchall()

    # In-degree per referenced item (key "rfc:1234", "bcp:14", ...) plus the
    # raw (draft, ref) edge list in row order.
    in_degree: Counter[str] = Counter()
    edges_raw: list[tuple[str, str]] = []
    for r in rows:
        ref_key = f"{r['ref_type']}:{r['ref_id']}"
        in_degree[ref_key] += 1
        edges_raw.append((r["draft_name"], ref_key))

    # Out-degree per citing draft.
    draft_out: Counter[str] = Counter(draft_name for draft_name, _ in edges_raw)

    # Draft titles for labeling.
    draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall()
    draft_titles = {r["name"]: r["title"] for r in draft_rows}

    # First rating category per draft, used for node coloring.
    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
    draft_cats = {}
    for r in rating_rows:
        try:
            cats = json.loads(r["categories"]) if r["categories"] else []
            draft_cats[r["draft_name"]] = cats[0] if cats else "Other"
        except Exception:
            # Malformed categories JSON: fall back to a neutral bucket.
            draft_cats[r["draft_name"]] = "Other"

    def _build(keep: set) -> tuple[set, list]:
        # Keep edges whose target ref is in `keep`; nodes are those edges'
        # endpoints (citing drafts and cited refs).
        node_set: set = set()
        edges: list = []
        for draft_name, ref_key in edges_raw:
            if ref_key in keep:
                node_set.add(draft_name)
                node_set.add(ref_key)
                edges.append({"source": draft_name, "target": ref_key})
        return node_set, edges

    # Keep refs cited at least `min_refs` times, plus every draft citing them.
    top_refs = {k: v for k, v in in_degree.items() if v >= min_refs}
    node_set, filtered_edges = _build(set(top_refs))

    # Cap graph size for readability: above 250 nodes, rebuild keeping only
    # the 80 most-cited references (and their citing drafts).
    if len(node_set) > 250:
        sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True)
        node_set, filtered_edges = _build({k for k, _ in sorted_refs[:80]})

    # Build node records.
    nodes = []
    for nid in node_set:
        if ":" in nid and not nid.startswith("draft-"):
            # Reference node (rfc:1234, bcp:14, etc.).
            ref_type, ref_id = nid.split(":", 1)
            if ref_type == "rfc":
                try:
                    # int() normalizes zero-padded RFC numbers for display.
                    title = f"RFC {int(ref_id)}"
                except ValueError:
                    title = f"RFC {ref_id}"
            else:
                title = f"{ref_type.upper()} {ref_id}"
            nodes.append({
                "id": nid,
                "type": ref_type,
                "title": title,
                "influence": in_degree.get(nid, 0),
                "ref_id": ref_id,
            })
        else:
            # Draft node; its influence is how many references it cites.
            nodes.append({
                "id": nid,
                "type": "draft",
                "title": draft_titles.get(nid, nid),
                "influence": draft_out.get(nid, 0),
                "category": draft_cats.get(nid, "Other"),
            })

    rfc_count = sum(1 for n in nodes if n["type"] == "rfc")
    draft_count = sum(1 for n in nodes if n["type"] == "draft")

    return {
        "nodes": nodes,
        "edges": filtered_edges,
        "stats": {
            "node_count": len(nodes),
            "edge_count": len(filtered_edges),
            "rfc_count": rfc_count,
            "draft_count": draft_count,
        },
    }
|
||||
|
||||
|
||||
def global_search(db: Database, query: str) -> dict:
    """Search across drafts (FTS5), ideas, authors, and gaps.

    Args:
        db: Open Database handle (raw SQL via ``db.conn``).
        query: Free-text query; blank/whitespace yields empty buckets.

    Returns:
        {drafts: [...], ideas: [...], authors: [...], gaps: [...]}, each
        bucket capped at 50 rows.
    """
    results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []}
    if not query or not query.strip():
        return results

    q = query.strip()

    def _draft_entry(r) -> dict:
        # Shared row->dict shaping for both the FTS and LIKE draft paths.
        return {
            "name": r["name"],
            "title": r["title"],
            "abstract": (r["abstract"] or "")[:200],
            "date": r["time"],
            "group": r["group"] or "individual",
        }

    # 1. Drafts via FTS5; quote each word so user input cannot inject FTS
    # query syntax.
    try:
        fts_query = " ".join(f'"{w}"' for w in q.split() if w)
        rows = db.conn.execute(
            """SELECT d.name, d.title, d.abstract, d.time, d."group"
               FROM drafts d
               JOIN drafts_fts f ON d.rowid = f.rowid
               WHERE drafts_fts MATCH ?
               ORDER BY rank
               LIMIT 50""",
            (fts_query,),
        ).fetchall()
        results["drafts"] = [_draft_entry(r) for r in rows]
    except Exception:
        # FTS5 match can fail on certain query syntax; fall back to LIKE
        like = f"%{q}%"
        rows = db.conn.execute(
            """SELECT name, title, abstract, time, "group" FROM drafts
               WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ?
               LIMIT 50""",
            (like, like, like),
        ).fetchall()
        results["drafts"] = [_draft_entry(r) for r in rows]

    like = f"%{q}%"

    # 2. Ideas via LIKE
    rows = db.conn.execute(
        """SELECT id, title, description, idea_type, draft_name FROM ideas
           WHERE title LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["ideas"].append({
            "id": r["id"],
            "title": r["title"],
            "description": (r["description"] or "")[:200],
            "type": r["idea_type"],
            "draft_name": r["draft_name"],
        })

    # 3. Authors via LIKE
    rows = db.conn.execute(
        """SELECT person_id, name, affiliation FROM authors
           WHERE name LIKE ? OR affiliation LIKE ?
           ORDER BY name LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["authors"].append({
            "person_id": r["person_id"],
            "name": r["name"],
            "affiliation": r["affiliation"] or "",
        })

    # 4. Gaps via LIKE
    rows = db.conn.execute(
        """SELECT id, topic, description, category, severity FROM gaps
           WHERE topic LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["gaps"].append({
            "id": r["id"],
            "topic": r["topic"],
            "description": (r["description"] or "")[:200],
            "category": r["category"],
            "severity": r["severity"],
        })

    return results
|
||||
|
||||
|
||||
def get_landscape_tsne(db: Database) -> list[dict]:
|
||||
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].
|
||||
|
||||
@@ -829,3 +1090,116 @@ def get_landscape_tsne(db: Database) -> list[dict]:
|
||||
"score": round(r.composite_score, 2),
|
||||
})
|
||||
return result
|
||||
|
||||
|
||||
def get_comparison_data(db: Database, names: list[str]) -> dict | None:
    """Get comparison data for a list of drafts.

    Args:
        db: Open Database handle.
        names: Draft names to compare; names with no detail record are skipped.

    Returns ``None`` unless at least two drafts resolve, otherwise {
        drafts: [{name, title, abstract, rating, ideas, refs, ...}],
        shared_ideas: [{title, drafts: [name,...]}],
        unique_ideas: {name: [{title, description}]},
        shared_refs: [{type, id, drafts: [name,...]}],
        unique_refs: {name: [{type, id}]},
        similarities: [{a, b, similarity}],   # cosine, where embeddings exist
        comparison_text: str | None,
    }
    """
    import numpy as np

    drafts_data = []
    all_ideas: dict[str, list[dict]] = {}
    all_refs: dict[str, list[tuple[str, str]]] = {}

    for name in names:
        detail = get_draft_detail(db, name)
        if not detail:
            continue  # unknown draft: skip it rather than fail the comparison
        drafts_data.append(detail)
        all_ideas[name] = detail.get("ideas", [])
        all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])]

    if len(drafts_data) < 2:
        return None

    # Group drafts by normalized idea title; an idea is "shared" when more
    # than one distinct draft produced it.
    idea_title_drafts: dict[str, list[str]] = {}
    for name, ideas in all_ideas.items():
        for idea in ideas:
            title_lower = idea["title"].lower().strip()
            idea_title_drafts.setdefault(title_lower, []).append(name)

    shared_ideas = [
        {"title": title, "drafts": draft_list}
        for title, draft_list in idea_title_drafts.items()
        if len(set(draft_list)) > 1
    ]
    unique_ideas: dict[str, list[dict]] = {}
    for name, ideas in all_ideas.items():
        unique_ideas[name] = [
            {"title": idea["title"], "description": idea.get("description", "")}
            for idea in ideas
            if len(set(idea_title_drafts.get(idea["title"].lower().strip(), []))) <= 1
        ]

    # Same grouping for (type, id) reference tuples.
    ref_drafts: dict[tuple[str, str], list[str]] = {}
    for name, refs in all_refs.items():
        for ref in refs:
            ref_drafts.setdefault(ref, []).append(name)

    shared_refs = [
        {"type": ref[0], "id": ref[1], "drafts": draft_list}
        for ref, draft_list in ref_drafts.items()
        if len(set(draft_list)) > 1
    ]
    unique_refs: dict[str, list[dict]] = {}
    for name, refs in all_refs.items():
        unique_refs[name] = [
            {"type": ref[0], "id": ref[1]}
            for ref in refs
            if len(set(ref_drafts.get(ref, []))) <= 1
        ]

    # Pairwise cosine similarity for every compared pair that has embeddings.
    embeddings = db.all_embeddings()
    similarities = []
    valid_names = [d["name"] for d in drafts_data]
    for i in range(len(valid_names)):
        for j in range(i + 1, len(valid_names)):
            a, b = valid_names[i], valid_names[j]
            if a in embeddings and b in embeddings:
                vec_a = embeddings[a]
                vec_b = embeddings[b]
                dot = np.dot(vec_a, vec_b)
                norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
                # Guard against zero vectors to avoid division by zero.
                sim = float(dot / norm) if norm > 0 else 0.0
                similarities.append({"a": a, "b": b, "similarity": round(sim, 4)})

    return {
        "drafts": drafts_data,
        "shared_ideas": shared_ideas,
        "unique_ideas": unique_ideas,
        "shared_refs": shared_refs,
        "unique_refs": unique_refs,
        "similarities": similarities,
        "comparison_text": None,
    }
|
||||
|
||||
|
||||
def get_ask_data(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Run hybrid search + Claude synthesis for a question.

    Thin wrapper that wires the loaded config and the DB into a
    HybridSearch instance and delegates to its ``ask`` method.

    Returns {answer: str, sources: [{name, title, similarity, excerpt}]}.
    """
    from ietf_analyzer.config import Config
    from ietf_analyzer.search import HybridSearch

    return HybridSearch(Config.load(), db).ask(question, top_k=top_k, cheap=cheap)
|
||||
|
||||
Reference in New Issue
Block a user