Platform upgrade: semantic search, citations, readiness, tests, Docker

Major features added by 5 parallel agent teams:
- Semantic "Ask" (NL queries via FTS5 + embeddings + Claude synthesis)
- Global search across drafts, ideas, authors, gaps
- REST API expansion (14 endpoints, up from 3) with CSV/JSON export
- Citation graph visualization (D3.js, 440 nodes, 2422 edges)
- Standards readiness scoring (0-100 composite from 6 factors)
- Side-by-side draft comparison view with shared/unique analysis
- Annotation system (notes + tags per draft, DB-persisted)
- Docker deployment (Dockerfile + docker-compose with Ollama)
- Scheduled updates (cron script with log rotation)
- Pipeline health dashboard (stage progress bars, cost tracking)
- Test suite foundation (54 pytest tests covering DB, models, web data)

Fixes: compare_drafts() stubbed→working, get_authors_for_draft() bug,
source-aware analysis prompts, config env var overrides + validation,
resilient batch error handling with --retry-failed, observatory --dry-run

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-07 20:52:56 +01:00
parent da2a989744
commit 757b781c67
33 changed files with 4253 additions and 170 deletions

View File

@@ -66,6 +66,7 @@ def get_drafts_page(
min_score: float = 0.0,
sort: str = "score",
sort_dir: str = "desc",
source: str = "",
) -> dict:
"""Return a paginated, filtered list of drafts with ratings.
@@ -80,6 +81,8 @@ def get_drafts_page(
continue
if category and category not in rating.categories:
continue
if source and draft.source != source:
continue
if search:
haystack = f"{draft.name} {draft.title} {rating.summary}".lower()
if not all(w in haystack for w in search.lower().split()):
@@ -96,6 +99,9 @@ def get_drafts_page(
"relevance": lambda p: p[1].relevance,
"overlap": lambda p: p[1].overlap,
"momentum": lambda p: p[1].momentum,
"readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 +
min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 +
((p[1].momentum - 1) / 4.0) * 0.15,
}
key_fn = sort_keys.get(sort, sort_keys["score"])
reverse = sort_dir == "desc"
@@ -107,15 +113,23 @@ def get_drafts_page(
start = (page - 1) * per_page
page_items = filtered[start : start + per_page]
# Pre-compute readiness for page items (lightweight version)
from ietf_analyzer.readiness import compute_readiness
readiness_cache = {}
for draft, rating in page_items:
readiness_cache[draft.name] = compute_readiness(db, draft.name)
drafts = []
for draft, rating in page_items:
r_score = readiness_cache.get(draft.name, {}).get("score", 0)
drafts.append({
"name": draft.name,
"title": draft.title,
"date": draft.date,
"url": draft.datatracker_url,
"url": draft.source_url if draft.source != "ietf" else draft.datatracker_url,
"pages": draft.pages or 0,
"group": draft.group or "individual",
"source": draft.source or "ietf",
"score": round(rating.composite_score, 2),
"novelty": rating.novelty,
"maturity": rating.maturity,
@@ -124,6 +138,7 @@ def get_drafts_page(
"relevance": rating.relevance,
"categories": rating.categories,
"summary": rating.summary,
"readiness": r_score,
})
return {
@@ -185,6 +200,14 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
"categories": rating.categories,
}
# Readiness score
from ietf_analyzer.readiness import compute_readiness
result["readiness"] = compute_readiness(db, name)
# Annotation
annotation = db.get_annotation(name)
result["annotation"] = annotation
return result
@@ -253,8 +276,11 @@ def get_ideas_by_type(db: Database) -> dict:
def get_all_gaps(db: Database) -> list[dict]:
    """Return all gap analysis results, sorted by severity (critical first).

    Gaps with no "severity" key default to "low"; unrecognized severity
    strings sort after all known ones.
    """
    # Lower rank sorts first; unknown severities get 99 so they land last.
    _sev_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    gaps = db.all_gaps()
    gaps.sort(key=lambda g: _sev_order.get(g.get("severity", "low"), 99))
    return gaps
def get_gap_detail(db: Database, gap_id: int) -> dict | None:
def get_monitor_status(db: Database) -> dict:
    """Return monitoring status data for the dashboard.

    Returns a dict with keys:
      last_run     -- most recent monitor run record, or None
      runs         -- up to 20 most recent run records
      unprocessed  -- counts of drafts still awaiting each pipeline stage
      total_runs   -- number of run records returned
      pipeline     -- overall stage-completion counts
      cost         -- token usage and estimated spend in USD
    """
    history = db.get_monitor_runs(limit=20)

    draft_total = db.count_drafts()
    # NOTE(review): the per-stage queries are capped (limit=9999 / 10000),
    # so counts saturate on very large databases — confirm acceptable.
    pending_ratings = len(db.unrated_drafts(limit=9999))
    pending_embeddings = len(db.drafts_without_embeddings(limit=9999))
    pending_ideas = len(db.drafts_without_ideas(limit=9999))

    tokens_in, tokens_out = db.total_tokens_used()
    # Estimate cost (Sonnet pricing: $3/M input, $15/M output).
    spend = (tokens_in * 3.0 / 1_000_000) + (tokens_out * 15.0 / 1_000_000)

    return {
        "last_run": history[0] if history else None,
        "runs": history,
        "unprocessed": {
            "unrated": pending_ratings,
            "unembedded": pending_embeddings,
            "no_ideas": pending_ideas,
        },
        "total_runs": len(history),
        "pipeline": {
            "total_drafts": draft_total,
            "rated": len(db.drafts_with_ratings(limit=10000)),
            "embedded": draft_total - pending_embeddings,
            "with_ideas": draft_total - pending_ideas,
            "idea_total": db.idea_count(),
            "gap_count": len(db.all_gaps()),
        },
        "cost": {
            "input_tokens": tokens_in,
            "output_tokens": tokens_out,
            "estimated_usd": round(spend, 2),
        },
    }
def _collect_graph_edges(edges_raw, keep_refs):
    """Select edges whose target ref is in *keep_refs*.

    Returns (set of node ids touched by a kept edge, list of D3 edge dicts).
    """
    node_set: set[str] = set()
    edges: list[dict] = []
    for draft_name, ref_key in edges_raw:
        if ref_key in keep_refs:
            node_set.add(draft_name)
            node_set.add(ref_key)
            edges.append({"source": draft_name, "target": ref_key})
    return node_set, edges


def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
    """Return citation network data for force-directed graph.

    Returns {nodes: [{id, type, title, influence, ...}],
             edges: [{source, target}],
             stats: {node_count, edge_count, rfc_count, draft_count}}
    """
    # All (draft -> reference) rows.
    rows = db.conn.execute(
        "SELECT draft_name, ref_type, ref_id FROM draft_refs"
    ).fetchall()

    # In-degree per referenced item; raw edge list keyed as "type:id".
    in_degree: Counter = Counter()
    edges_raw: list[tuple[str, str]] = []
    for r in rows:
        ref_key = f"{r['ref_type']}:{r['ref_id']}"
        in_degree[ref_key] += 1
        edges_raw.append((r["draft_name"], ref_key))

    # Out-degree per draft (how many references it makes).
    draft_out: Counter = Counter(name for name, _ in edges_raw)

    # Draft titles for labeling.
    draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall()
    draft_titles = {r["name"]: r["title"] for r in draft_rows}

    # First rating category for draft coloring; "Other" on bad/missing JSON.
    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
    draft_cats = {}
    for r in rating_rows:
        try:
            cats = json.loads(r["categories"]) if r["categories"] else []
            draft_cats[r["draft_name"]] = cats[0] if cats else "Other"
        except Exception:
            draft_cats[r["draft_name"]] = "Other"

    # Keep refs cited at least min_refs times, plus every draft citing them.
    top_refs = {k: v for k, v in in_degree.items() if v >= min_refs}
    node_set, filtered_edges = _collect_graph_edges(edges_raw, top_refs)

    # Cap for readability: past 250 nodes, keep only the 80 most-cited refs
    # and rebuild the selection.
    if len(node_set) > 250:
        sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True)
        keep_refs = {k for k, _ in sorted_refs[:80]}
        node_set, filtered_edges = _collect_graph_edges(edges_raw, keep_refs)

    # Build node records.
    nodes = []
    for nid in node_set:
        if ":" in nid and not nid.startswith("draft-"):
            # Reference node (rfc:1234, bcp:14, ...): influence = in-degree.
            ref_type, ref_id = nid.split(":", 1)
            if ref_type == "rfc":
                try:
                    # int() normalizes zero-padded RFC numbers.
                    title = f"RFC {int(ref_id)}"
                except ValueError:
                    title = f"RFC {ref_id}"
            else:
                title = f"{ref_type.upper()} {ref_id}"
            nodes.append({
                "id": nid,
                "type": ref_type,
                "title": title,
                "influence": in_degree.get(nid, 0),
                "ref_id": ref_id,
            })
        else:
            # Draft node: influence = out-degree (references it makes).
            nodes.append({
                "id": nid,
                "type": "draft",
                "title": draft_titles.get(nid, nid),
                "influence": draft_out.get(nid, 0),
                "category": draft_cats.get(nid, "Other"),
            })

    rfc_count = sum(1 for n in nodes if n["type"] == "rfc")
    draft_count = sum(1 for n in nodes if n["type"] == "draft")
    return {
        "nodes": nodes,
        "edges": filtered_edges,
        "stats": {
            "node_count": len(nodes),
            "edge_count": len(filtered_edges),
            "rfc_count": rfc_count,
            "draft_count": draft_count,
        },
    }
def _draft_search_hit(row) -> dict:
    """Format one drafts row (name, title, abstract, time, group) as a hit."""
    return {
        "name": row["name"],
        "title": row["title"],
        "abstract": (row["abstract"] or "")[:200],
        "date": row["time"],
        "group": row["group"] or "individual",
    }


def global_search(db: Database, query: str) -> dict:
    """Search across drafts (FTS5), ideas, authors, and gaps.

    Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
    Blank/whitespace queries return empty lists without touching the DB.
    """
    results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []}
    if not query or not query.strip():
        return results
    q = query.strip()
    like = f"%{q}%"
    # 1. Drafts via FTS5; fall back to LIKE when MATCH rejects the query.
    try:
        # Quote each word so FTS operators in user input are taken literally.
        fts_query = " ".join(f'"{w}"' for w in q.split() if w)
        rows = db.conn.execute(
            """SELECT d.name, d.title, d.abstract, d.time, d."group"
               FROM drafts d
               JOIN drafts_fts f ON d.rowid = f.rowid
               WHERE drafts_fts MATCH ?
               ORDER BY rank
               LIMIT 50""",
            (fts_query,),
        ).fetchall()
    except Exception:
        # FTS5 match can fail on certain query syntax; fall back to LIKE
        rows = db.conn.execute(
            """SELECT name, title, abstract, time, "group" FROM drafts
               WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ?
               LIMIT 50""",
            (like, like, like),
        ).fetchall()
    results["drafts"] = [_draft_search_hit(r) for r in rows]
    # 2. Ideas via LIKE
    rows = db.conn.execute(
        """SELECT id, title, description, idea_type, draft_name FROM ideas
           WHERE title LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    results["ideas"] = [
        {
            "id": r["id"],
            "title": r["title"],
            "description": (r["description"] or "")[:200],
            "type": r["idea_type"],
            "draft_name": r["draft_name"],
        }
        for r in rows
    ]
    # 3. Authors via LIKE
    rows = db.conn.execute(
        """SELECT person_id, name, affiliation FROM authors
           WHERE name LIKE ? OR affiliation LIKE ?
           ORDER BY name LIMIT 50""",
        (like, like),
    ).fetchall()
    results["authors"] = [
        {
            "person_id": r["person_id"],
            "name": r["name"],
            "affiliation": r["affiliation"] or "",
        }
        for r in rows
    ]
    # 4. Gaps via LIKE
    rows = db.conn.execute(
        """SELECT id, topic, description, category, severity FROM gaps
           WHERE topic LIKE ? OR description LIKE ?
           ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    results["gaps"] = [
        {
            "id": r["id"],
            "topic": r["topic"],
            "description": (r["description"] or "")[:200],
            "category": r["category"],
            "severity": r["severity"],
        }
        for r in rows
    ]
    return results
def get_landscape_tsne(db: Database) -> list[dict]:
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].
@@ -829,3 +1090,116 @@ def get_landscape_tsne(db: Database) -> list[dict]:
"score": round(r.composite_score, 2),
})
return result
def _membership_map(items_by_name: dict, key_fn) -> dict:
    """Map key_fn(item) -> list of draft names whose item list contains it.

    A draft name may appear more than once when it holds duplicate keys;
    callers de-duplicate with set() when counting distinct drafts.
    """
    owners: dict = {}
    for name, items in items_by_name.items():
        for item in items:
            owners.setdefault(key_fn(item), []).append(name)
    return owners


def get_comparison_data(db: Database, names: list[str]) -> dict | None:
    """Get comparison data for a list of drafts.

    Returns None unless at least two of *names* resolve via get_draft_detail.
    Returns {
        drafts: [{name, title, abstract, rating, ideas, refs, ...}],
        shared_ideas: [{title, drafts: [name,...]}],
        unique_ideas: {name: [{title, description}]},
        shared_refs: [{type, id, drafts: [name,...]}],
        unique_refs: {name: [{type, id}]},
        similarities: [{a, b, similarity}],
        comparison_text: str | None,
    }
    """
    import numpy as np

    drafts_data = []
    all_ideas: dict[str, list[dict]] = {}
    all_refs: dict[str, list[tuple[str, str]]] = {}
    for name in names:
        detail = get_draft_detail(db, name)
        if not detail:
            continue
        drafts_data.append(detail)
        all_ideas[name] = detail.get("ideas", [])
        all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])]
    if len(drafts_data) < 2:
        return None

    # Shared vs unique ideas, grouped by exact normalized (lowercased,
    # stripped) title match.
    idea_owners = _membership_map(all_ideas, lambda i: i["title"].lower().strip())
    shared_ideas = [
        {"title": title, "drafts": draft_list}
        for title, draft_list in idea_owners.items()
        if len(set(draft_list)) > 1
    ]
    unique_ideas: dict[str, list[dict]] = {}
    for name, ideas in all_ideas.items():
        unique_ideas[name] = [
            {"title": idea["title"], "description": idea.get("description", "")}
            for idea in ideas
            if len(set(idea_owners.get(idea["title"].lower().strip(), []))) <= 1
        ]

    # Shared vs unique references, grouped by (type, id) pair.
    ref_owners = _membership_map(all_refs, lambda ref: ref)
    shared_refs = [
        {"type": ref[0], "id": ref[1], "drafts": draft_list}
        for ref, draft_list in ref_owners.items()
        if len(set(draft_list)) > 1
    ]
    unique_refs: dict[str, list[dict]] = {}
    for name, refs in all_refs.items():
        unique_refs[name] = [
            {"type": ref[0], "id": ref[1]}
            for ref in refs
            if len(set(ref_owners.get(ref, []))) <= 1
        ]

    # Pairwise cosine similarity between draft embeddings; pairs missing an
    # embedding are skipped.
    embeddings = db.all_embeddings()
    similarities = []
    valid_names = [d["name"] for d in drafts_data]
    for i in range(len(valid_names)):
        for j in range(i + 1, len(valid_names)):
            a, b = valid_names[i], valid_names[j]
            if a in embeddings and b in embeddings:
                vec_a, vec_b = embeddings[a], embeddings[b]
                norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
                sim = float(np.dot(vec_a, vec_b) / norm) if norm > 0 else 0.0
                similarities.append({"a": a, "b": b, "similarity": round(sim, 4)})

    return {
        "drafts": drafts_data,
        "shared_ideas": shared_ideas,
        "unique_ideas": unique_ideas,
        "shared_refs": shared_refs,
        "unique_refs": unique_refs,
        "similarities": similarities,
        "comparison_text": None,
    }
def get_ask_data(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Answer a natural-language question via hybrid search + Claude synthesis.

    Returns {answer: str, sources: [{name, title, similarity, excerpt}]}.
    """
    # Imported lazily so the web layer doesn't pay for these at module load.
    from ietf_analyzer.config import Config
    from ietf_analyzer.search import HybridSearch

    cfg = Config.load()
    return HybridSearch(cfg, db).ask(question, top_k=top_k, cheap=cheap)