ietf-draft-analyzer/src/webui/data.py

"""Data access layer for the web dashboard.

Thin wrapper around ietf_analyzer.db.Database that returns plain dicts
ready for JSON serialization or Jinja2 template rendering.
"""

from __future__ import annotations

import json
import sys
from collections import Counter, defaultdict
from pathlib import Path

# Add project root to path so we can import ietf_analyzer
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root / "src"))

from ietf_analyzer.config import Config
from ietf_analyzer.db import Database


def get_db() -> Database:
    """Get a Database instance using default config."""
    config = Config.load()
    return Database(config)


def get_overview_stats(db: Database) -> dict:
    """Return high-level stats for the dashboard home page."""
    total_drafts = db.count_drafts()
    rated_pairs = db.drafts_with_ratings(limit=1000)
    rated_count = len(rated_pairs)
    author_count = db.author_count()
    idea_count = db.idea_count()
    gaps = db.all_gaps()
    input_tok, output_tok = db.total_tokens_used()

    return {
        "total_drafts": total_drafts,
        "rated_count": rated_count,
        "author_count": author_count,
        "idea_count": idea_count,
        "gap_count": len(gaps),
        "input_tokens": input_tok,
        "output_tokens": output_tok,
    }


def get_category_counts(db: Database) -> dict[str, int]:
    """Return {category: draft_count} for all categories."""
    pairs = db.drafts_with_ratings(limit=1000)
    counts: dict[str, int] = Counter()
    for _, rating in pairs:
        for cat in rating.categories:
            counts[cat] += 1
    return dict(counts.most_common())


def get_drafts_page(
    db: Database,
    page: int = 1,
    per_page: int = 50,
    search: str = "",
    category: str = "",
    min_score: float = 0.0,
    sort: str = "score",
    sort_dir: str = "desc",
    source: str = "",
) -> dict:
    """Return a paginated, filtered list of drafts with ratings.

    Returns dict with keys: drafts, total, page, per_page, pages.
    """
    pairs = db.drafts_with_ratings(limit=1000)

    # Build author lookup for search (draft_name -> "author1 author2 ...")
    author_text_by_draft: dict[str, str] = {}
    if search:
        rows = db.conn.execute(
            """SELECT da.draft_name, GROUP_CONCAT(a.name, ' ') as names
               FROM draft_authors da JOIN authors a ON da.person_id = a.person_id
               GROUP BY da.draft_name"""
        ).fetchall()
        for r in rows:
            author_text_by_draft[r[0]] = r[1] or ""

    # Filter
    filtered = []
    for draft, rating in pairs:
        if min_score > 0 and rating.composite_score < min_score:
            continue
        if category and category not in rating.categories:
            continue
        if source and draft.source != source:
            continue
        if search:
            author_names = author_text_by_draft.get(draft.name, "")
            haystack = f"{draft.name} {draft.title} {rating.summary} {author_names}".lower()
            if not all(w in haystack for w in search.lower().split()):
                continue
        filtered.append((draft, rating))

    # Sort
    sort_keys = {
        "score": lambda p: p[1].composite_score,
        "name": lambda p: p[0].name,
        "date": lambda p: p[0].time or "",
        "novelty": lambda p: p[1].novelty,
        "maturity": lambda p: p[1].maturity,
        "relevance": lambda p: p[1].relevance,
        "overlap": lambda p: p[1].overlap,
        "momentum": lambda p: p[1].momentum,
        "readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 +
                                min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 +
                                ((p[1].momentum - 1) / 4.0) * 0.15,
    }
    key_fn = sort_keys.get(sort, sort_keys["score"])
    reverse = sort_dir == "desc"
    filtered.sort(key=key_fn, reverse=reverse)

    total = len(filtered)
    pages = max(1, (total + per_page - 1) // per_page)
    page = max(1, min(page, pages))
    start = (page - 1) * per_page
    page_items = filtered[start : start + per_page]

    # Pre-compute readiness for page items (lightweight version)
    from ietf_analyzer.readiness import compute_readiness
    readiness_cache = {}
    for draft, rating in page_items:
        readiness_cache[draft.name] = compute_readiness(db, draft.name)

    drafts = []
    for draft, rating in page_items:
        r_score = readiness_cache.get(draft.name, {}).get("score", 0)
        drafts.append({
            "name": draft.name,
            "title": draft.title,
            "date": draft.date,
            "url": draft.source_url if draft.source != "ietf" else draft.datatracker_url,
            "pages": draft.pages or 0,
            "group": draft.group or "individual",
            "source": draft.source or "ietf",
            "score": round(rating.composite_score, 2),
            "novelty": rating.novelty,
            "maturity": rating.maturity,
            "overlap": rating.overlap,
            "momentum": rating.momentum,
            "relevance": rating.relevance,
            "categories": rating.categories,
            "summary": rating.summary,
            "readiness": r_score,
        })

    return {
        "drafts": drafts,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": pages,
    }


def get_draft_detail(db: Database, name: str) -> dict | None:
    """Return full detail for a single draft."""
    draft = db.get_draft(name)
    if not draft:
        return None

    rating = db.get_rating(name)
    authors = db.get_authors_for_draft(name)
    ideas = db.get_ideas_for_draft(name)
    refs = db.get_refs_for_draft(name)

    result = {
        "name": draft.name,
        "title": draft.title,
        "rev": draft.rev,
        "abstract": draft.abstract,
        "date": draft.date,
        "time": draft.time,
        "url": draft.datatracker_url,
        "text_url": draft.text_url,
        "pages": draft.pages,
        "words": draft.words,
        "group": draft.group or "individual",
        "categories": draft.categories,
        "tags": draft.tags,
        "authors": [
            {"name": a.name, "affiliation": a.affiliation, "person_id": a.person_id}
            for a in authors
        ],
        "ideas": ideas,
        "refs": [{"type": t, "id": rid} for t, rid in refs],
    }

    if rating:
        result["rating"] = {
            "score": round(rating.composite_score, 2),
            "novelty": rating.novelty,
            "maturity": rating.maturity,
            "overlap": rating.overlap,
            "momentum": rating.momentum,
            "relevance": rating.relevance,
            "summary": rating.summary,
            "novelty_note": rating.novelty_note,
            "maturity_note": rating.maturity_note,
            "overlap_note": rating.overlap_note,
            "momentum_note": rating.momentum_note,
            "relevance_note": rating.relevance_note,
            "categories": rating.categories,
        }

    # Readiness score
    from ietf_analyzer.readiness import compute_readiness
    result["readiness"] = compute_readiness(db, name)

    # Annotation
    annotation = db.get_annotation(name)
    result["annotation"] = annotation

    return result


def get_rating_distributions(db: Database) -> dict:
    """Return arrays for each rating dimension, suitable for Plotly."""
    pairs = db.drafts_with_ratings(limit=1000)
    dims = {
        "novelty": [],
        "maturity": [],
        "overlap": [],
        "momentum": [],
        "relevance": [],
        "scores": [],
        "categories": [],
        "names": [],
    }
    for draft, rating in pairs:
        dims["novelty"].append(rating.novelty)
        dims["maturity"].append(rating.maturity)
        dims["overlap"].append(rating.overlap)
        dims["momentum"].append(rating.momentum)
        dims["relevance"].append(rating.relevance)
        dims["scores"].append(round(rating.composite_score, 2))
        dims["categories"].append(rating.categories[0] if rating.categories else "Other")
        dims["names"].append(draft.name)
    return dims


def get_timeline_data(db: Database) -> dict:
    """Return monthly counts by category for timeline chart."""
    pairs = db.drafts_with_ratings(limit=1000)
    all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
    rating_map = {d.name: r for d, r in pairs}

    month_cat: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for d in all_drafts:
        month = d.time[:7] if d.time else "unknown"
        r = rating_map.get(d.name)
        if r:
            cat = r.categories[0] if r.categories else "Other"
            month_cat[month][cat] += 1

    months = sorted(month_cat.keys())
    cat_totals: Counter = Counter()
    for mc in month_cat.values():
        for c, cnt in mc.items():
            cat_totals[c] += cnt
    top_cats = [c for c, _ in cat_totals.most_common(10)]

    series = {}
    for cat in top_cats:
        series[cat] = [month_cat[m].get(cat, 0) for m in months]

    return {"months": months, "series": series, "categories": top_cats}


def get_ideas_by_type(db: Database) -> dict:
    """Return ideas grouped by type with counts."""
    all_ideas = db.all_ideas()
    type_counts = Counter(i.get("type", "other") or "other" for i in all_ideas)
    return {
        "total": len(all_ideas),
        "by_type": dict(type_counts.most_common()),
        "ideas": all_ideas,
    }


def get_all_gaps(db: Database) -> list[dict]:
    """Return all gap analysis results, sorted by severity (critical first)."""
    _sev_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    gaps = db.all_gaps()
    gaps.sort(key=lambda g: _sev_order.get(g.get("severity", "low"), 99))
    return gaps


def get_gap_detail(db: Database, gap_id: int) -> dict | None:
    """Return a single gap by ID, or None if not found."""
    gaps = db.all_gaps()
    for g in gaps:
        if g["id"] == gap_id:
            return g
    return None


def get_generated_drafts() -> list[dict]:
    """Return list of pre-generated draft files in data/reports/generated-drafts/."""
    drafts_dir = _project_root / "data" / "reports" / "generated-drafts"
    if not drafts_dir.exists():
        return []
    results = []
    for f in sorted(drafts_dir.glob("draft-*.txt")):
        # Extract title from first non-empty content line after header
        title = f.stem
        text = f.read_text(errors="replace")
        for line in text.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith("Internet-Draft") and \
               not stripped.startswith("Intended status") and \
               not stripped.startswith("Expires:") and stripped != "":
                title = stripped
                break
        results.append({
            "filename": f.name,
            "stem": f.stem,
            "title": title,
            "size": f.stat().st_size,
            "path": str(f),
        })
    return results


def read_generated_draft(filename: str) -> str | None:
    """Read a generated draft file by filename. Returns text or None."""
    drafts_dir = _project_root / "data" / "reports" / "generated-drafts"
    path = drafts_dir / filename
    if not path.exists() or not path.is_file():
        return None
    # Safety: ensure we're not reading outside the directory
    if not str(path.resolve()).startswith(str(drafts_dir.resolve())):
        return None
    return path.read_text(errors="replace")


def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
    """Return top authors by draft count."""
    rows = db.top_authors(limit=limit)
    return [
        {"name": name, "affiliation": aff, "draft_count": cnt, "drafts": drafts}
        for name, aff, cnt, drafts in rows
    ]


def get_org_data(db: Database, limit: int = 20) -> list[dict]:
    """Return organization contribution data."""
    rows = db.top_orgs(limit=limit)
    return [
        {"org": org, "author_count": authors, "draft_count": drafts}
        for org, authors, drafts in rows
    ]


def get_category_radar_data(db: Database) -> dict:
    """Return average rating profiles per category for radar chart."""
    pairs = db.drafts_with_ratings(limit=1000)
    cat_ratings: dict[str, list] = defaultdict(list)
    for _, r in pairs:
        for c in r.categories:
            cat_ratings[c].append(r)

    top_cats = sorted(cat_ratings.keys(), key=lambda c: len(cat_ratings[c]), reverse=True)[:8]
    result = {}
    for cat in top_cats:
        ratings = cat_ratings[cat]
        n = len(ratings)
        result[cat] = {
            "count": n,
            "novelty": round(sum(r.novelty for r in ratings) / n, 2),
            "maturity": round(sum(r.maturity for r in ratings) / n, 2),
            "relevance": round(sum(r.relevance for r in ratings) / n, 2),
            "momentum": round(sum(r.momentum for r in ratings) / n, 2),
            "low_overlap": round(sum(6 - r.overlap for r in ratings) / n, 2),
        }
    return result


def get_score_histogram(db: Database) -> list[float]:
    """Return list of composite scores for histogram."""
    pairs = db.drafts_with_ratings(limit=1000)
    return [round(r.composite_score, 2) for _, r in pairs]


def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
    """Return co-authorship network data for force-directed graph.

    Returns {nodes: [{id, name, org, draft_count}], edges: [{source, target, weight}]}
    """
    pairs = db.coauthor_pairs()
    top = db.top_authors(limit=100)

    # Build node set from authors who have co-authorships
    author_info = {name: {"org": aff, "draft_count": cnt} for name, aff, cnt, _ in top}
    node_set = set()
    edges = []
    for a, b, shared in pairs:
        if shared >= min_shared:
            node_set.add(a)
            node_set.add(b)
            edges.append({"source": a, "target": b, "weight": shared})

    nodes = []
    for name in node_set:
        info = author_info.get(name, {"org": "", "draft_count": 1})
        nodes.append({
            "id": name,
            "name": name,
            "org": info["org"],
            "draft_count": info["draft_count"],
        })

    return {"nodes": nodes, "edges": edges}


def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
    """Return draft similarity network for force-directed graph.

    Returns {nodes: [{name, title, category, score}],
             edges: [{source, target, similarity}],
             stats: {node_count, edge_count, avg_similarity}}
    """
    import numpy as np

    embeddings = db.all_embeddings()
    if len(embeddings) < 2:
        return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}}

    pairs = db.drafts_with_ratings(limit=1000)
    rating_map = {d.name: r for d, r in pairs}
    draft_map = {d.name: d for d, _ in pairs}

    # Filter to drafts with both embeddings and ratings
    names = [n for n in embeddings if n in rating_map]
    if len(names) < 2:
        return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}}

    matrix = np.array([embeddings[n] for n in names])

    # L2-normalize and compute cosine similarity
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    normalized = matrix / norms
    sim_matrix = normalized @ normalized.T

    # Find pairs above threshold (upper triangle only)
    edges = []
    node_set = set()
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            sim = float(sim_matrix[i, j])
            if sim >= threshold:
                edges.append({"source": names[i], "target": names[j], "similarity": round(sim, 4)})
                node_set.add(names[i])
                node_set.add(names[j])

    # Build nodes from connected drafts only
    nodes = []
    for name in names:
        if name not in node_set:
            continue
        r = rating_map[name]
        d = draft_map.get(name)
        nodes.append({
            "name": name,
            "title": d.title if d else name,
            "category": r.categories[0] if r.categories else "Other",
            "score": round(r.composite_score, 2),
        })

    avg_sim = round(sum(e["similarity"] for e in edges) / max(len(edges), 1), 4)

    return {
        "nodes": nodes,
        "edges": edges,
        "stats": {"node_count": len(nodes), "edge_count": len(edges), "avg_similarity": avg_sim},
    }


def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
    """Return cross-org collaboration pairs."""
    rows = db.cross_org_collaborations(limit=limit)
    return [
        {"org_a": a, "org_b": b, "shared_drafts": cnt}
        for a, b, cnt in rows
    ]


def get_author_network_full(db: Database) -> dict:
    """Return enriched co-authorship network with avg scores and cluster info.

    Returns {
        nodes: [{id, name, org, draft_count, avg_score, drafts: [name,...]}],
        edges: [{source, target, weight}],
        clusters: [{id, members: [name,...], org_mix: {org: count}, size}],
    }
    """
    pairs = db.coauthor_pairs()
    top = db.top_authors(limit=500)

    # Build rating lookup for avg scores
    rated = db.drafts_with_ratings(limit=2000)
    draft_score = {d.name: r.composite_score for d, r in rated}

    # Author info map
    author_info = {}
    for name, aff, cnt, drafts in top:
        scores = [draft_score[dn] for dn in drafts if dn in draft_score]
        avg = round(sum(scores) / len(scores), 2) if scores else 0
        author_info[name] = {
            "org": aff, "draft_count": cnt, "drafts": drafts, "avg_score": avg
        }

    # Build node set: authors with meaningful collaboration (2+ shared drafts)
    node_set = set()
    edges = []
    for a, b, shared in pairs:
        if shared >= 2:
            node_set.add(a)
            node_set.add(b)
            edges.append({"source": a, "target": b, "weight": shared})

    # Also include authors with 3+ drafts even if no co-authorships
    for name, info in author_info.items():
        if info["draft_count"] >= 3:
            node_set.add(name)

    nodes = []
    for name in node_set:
        info = author_info.get(name, {"org": "", "draft_count": 1, "drafts": [], "avg_score": 0})
        nodes.append({
            "id": name,
            "name": name,
            "org": info["org"],
            "draft_count": info["draft_count"],
            "avg_score": info["avg_score"],
            "drafts": info["drafts"][:8],  # cap for JSON size
        })

    # Cluster detection via connected components (BFS)
    adjacency: dict[str, set[str]] = defaultdict(set)
    for e in edges:
        adjacency[e["source"]].add(e["target"])
        adjacency[e["target"]].add(e["source"])

    visited: set[str] = set()
    clusters = []

    for node in sorted(node_set):
        if node in visited:
            continue
        component: list[str] = []
        queue = [node]
        while queue:
            current = queue.pop(0)
            if current in visited:
                continue
            visited.add(current)
            component.append(current)
            for neighbor in adjacency.get(current, []):
                if neighbor not in visited:
                    queue.append(neighbor)

        if len(component) >= 2:
            org_mix: dict[str, int] = Counter()
            cluster_drafts: dict[str, str] = {}  # name -> title
            for m in component:
                org = author_info.get(m, {}).get("org", "")
                if org:
                    org_mix[org] += 1
                for dn in author_info.get(m, {}).get("drafts", []):
                    if dn not in cluster_drafts:
                        d = db.get_draft(dn)
                        cluster_drafts[dn] = d.title[:80] if d else dn
            clusters.append({
                "id": len(clusters),
                "members": component,
                "org_mix": dict(org_mix.most_common()),
                "size": len(component),
                "drafts": [{"name": n, "title": t} for n, t in list(cluster_drafts.items())[:15]],
                "draft_count": len(cluster_drafts),
            })

    clusters.sort(key=lambda c: c["size"], reverse=True)

    return {"nodes": nodes, "edges": edges, "clusters": clusters}


def get_idea_clusters(db: Database) -> dict:
    """Cluster ideas by embedding similarity, return clusters + t-SNE scatter.

    Uses Ward linkage on L2-normalized embeddings (approximates cosine) with
    a target of ~30 clusters for readable groupings.  Enriches each cluster
    with WG info and category breakdown.
    """
    import json as _json
    import numpy as np
    from sklearn.preprocessing import normalize as sk_normalize

    embeddings = db.all_idea_embeddings()
    if not embeddings:
        return {"clusters": [], "scatter": [], "stats": {"total": 0, "clustered": 0, "num_clusters": 0}, "empty": True}

    # Fetch ideas with IDs for metadata lookup
    rows = db.conn.execute("SELECT id, title, description, idea_type, draft_name FROM ideas").fetchall()
    idea_map = {r["id"]: {"title": r["title"], "description": r["description"],
                           "type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows}

    # Draft -> WG and category lookup
    draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall()
    draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows}
    draft_title_map = {r["name"]: r["title"] for r in draft_rows}
    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
    draft_cats: dict[str, list[str]] = {}
    for r in rating_rows:
        try:
            draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
        except (_json.JSONDecodeError, TypeError):
            draft_cats[r["draft_name"]] = []

    # Build matrix from embeddings that have matching ideas
    idea_ids = [iid for iid in embeddings if iid in idea_map]
    if len(idea_ids) < 5:
        return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}

    matrix = np.array([embeddings[iid] for iid in idea_ids])
    matrix_norm = sk_normalize(matrix)

    # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
    n_target = max(10, min(40, len(idea_ids) // 12))
    try:
        from sklearn.cluster import AgglomerativeClustering
        clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
        labels = clustering.fit_predict(matrix_norm)
    except Exception:
        return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}

    # Build cluster data
    cluster_ideas_map: dict[int, list] = defaultdict(list)
    for idx, iid in enumerate(idea_ids):
        cluster_ideas_map[labels[idx]].append(iid)

    stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with",
            "on", "by", "is", "as", "at", "from", "that", "this", "it",
            "based", "using", "protocol", "mechanism", "framework", "system",
            "network", "agent", "agents"}
    clusters = []
    for cid in sorted(cluster_ideas_map.keys()):
        members = cluster_ideas_map[cid]
        ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map]
        if len(ideas_in_cluster) < 2:
            continue

        # Theme: most common significant words in titles
        words = Counter()
        for idea in ideas_in_cluster:
            for w in idea["title"].lower().split():
                w_clean = w.strip("()[].,;:-\"'")
                if len(w_clean) > 2 and w_clean not in stop:
                    words[w_clean] += 1
        top_words = [w for w, _ in words.most_common(4)]
        theme = " ".join(top_words).title() if top_words else f"Cluster {cid}"

        drafts = list({idea["draft_name"] for idea in ideas_in_cluster})

        # Enrich: WG breakdown
        wg_counts: dict[str, int] = Counter()
        cat_counts: dict[str, int] = Counter()
        for dname in drafts:
            wg = draft_wg.get(dname, "none")
            wg_counts[wg] += 1
            for cat in draft_cats.get(dname, []):
                cat_counts[cat] += 1

        wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)]
        cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)]
        cross_wg = len([w for w in wg_counts if w != "none"]) >= 2

        clusters.append({
            "id": len(clusters),
            "theme": theme,
            "size": len(ideas_in_cluster),
            "ideas": ideas_in_cluster[:20],
            "drafts": drafts,
            "wgs": wg_list,
            "categories": cat_list,
            "cross_wg": cross_wg,
            "wg_count": len(wg_counts),
        })

    clusters.sort(key=lambda c: c["size"], reverse=True)

    # Build mapping: original cluster label -> sorted index
    # Each cluster remembers which original label it came from via its member ids
    old_label_to_new: dict[int, int] = {}
    for new_idx, c in enumerate(clusters):
        c["id"] = new_idx
        # Find original label for any member of this cluster
        for old_cid, members in cluster_ideas_map.items():
            if members and members[0] in [iid for iid in members if iid in idea_map]:
                member_titles = {idea_map[m]["title"] for m in members if m in idea_map}
                c_titles = {idea["title"] for idea in c["ideas"]}
                if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]):
                    old_label_to_new[old_cid] = new_idx
                    break

    # Fallback: build from idea_id -> label mapping
    iid_to_new: dict[int, int] = {}
    for old_cid, members in cluster_ideas_map.items():
        new_idx = old_label_to_new.get(old_cid, old_cid)
        for iid in members:
            iid_to_new[iid] = new_idx

    # t-SNE for scatter
    scatter = []
    try:
        from sklearn.manifold import TSNE
        perp = min(30, len(idea_ids) - 1)
        tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
        coords = tsne.fit_transform(matrix_norm)

        for idx, iid in enumerate(idea_ids):
            info = idea_map.get(iid, {})
            scatter.append({
                "x": round(float(coords[idx, 0]), 3),
                "y": round(float(coords[idx, 1]), 3),
                "cluster_id": iid_to_new.get(iid, int(labels[idx])),
                "title": info.get("title", ""),
                "draft_name": info.get("draft_name", ""),
                "wg": draft_wg.get(info.get("draft_name", ""), ""),
            })
    except Exception:
        pass

    total = len(idea_ids)
    clustered = sum(c["size"] for c in clusters)
    return {
        "clusters": clusters,
        "scatter": scatter,
        "stats": {"total": total, "clustered": clustered, "num_clusters": len(clusters)},
        "empty": False,
    }


def get_timeline_animation_data(db: Database) -> dict:
    """Compute t-SNE on all drafts, return points with month info + category_monthly.

    t-SNE is computed once on ALL drafts so coordinates are stable across
    animation frames.  Each point carries a ``month`` field (YYYY-MM) so the
    front-end can build cumulative animation frames.
    """
    import numpy as np

    embeddings = db.all_embeddings()
    if len(embeddings) < 5:
        return {"points": [], "months": [], "category_monthly": {}}

    pairs = db.drafts_with_ratings(limit=1000)
    rating_map = {d.name: r for d, r in pairs}
    draft_map = {d.name: d for d, _ in pairs}

    # Filter to drafts that have both embeddings and ratings
    names = [n for n in embeddings if n in rating_map]
    if len(names) < 5:
        return {"points": [], "months": [], "category_monthly": {}}

    matrix = np.array([embeddings[n] for n in names])

    try:
        from sklearn.manifold import TSNE
        tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
                     random_state=42, max_iter=500)
        coords = tsne.fit_transform(matrix)
    except Exception:
        return {"points": [], "months": [], "category_monthly": {}}

    # Build points with month
    points = []
    month_set: set[str] = set()
    category_monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

    for i, name in enumerate(names):
        r = rating_map[name]
        d = draft_map.get(name)
        month = (d.time[:7] if d and d.time else "unknown")
        cat = r.categories[0] if r.categories else "Other"
        month_set.add(month)
        category_monthly[month][cat] += 1
        points.append({
            "name": name,
            "title": d.title if d else name,
            "x": round(float(coords[i, 0]), 3),
            "y": round(float(coords[i, 1]), 3),
            "category": cat,
            "score": round(r.composite_score, 2),
            "month": month,
        })

    months = sorted(month_set)
    # Convert defaultdict to plain dict for JSON
    cat_monthly_plain = {m: dict(cats) for m, cats in category_monthly.items()}

    return {
        "points": points,
        "months": months,
        "category_monthly": cat_monthly_plain,
    }


def get_monitor_status(db: Database) -> dict:
    """Return monitoring status data for dashboard."""
    runs = db.get_monitor_runs(limit=20)
    last = runs[0] if runs else None
    total_drafts = db.count_drafts()
    rated_count = len(db.drafts_with_ratings(limit=10000))
    unrated = len(db.unrated_drafts(limit=9999))
    unembedded = len(db.drafts_without_embeddings(limit=9999))
    embedded_count = total_drafts - unembedded
    no_ideas = len(db.drafts_without_ideas(limit=9999))
    ideas_count = total_drafts - no_ideas
    idea_total = db.idea_count()
    gap_count = len(db.all_gaps())
    input_tok, output_tok = db.total_tokens_used()

    # Estimate cost (Sonnet pricing: $3/M input, $15/M output)
    est_cost = (input_tok * 3.0 / 1_000_000) + (output_tok * 15.0 / 1_000_000)

    return {
        "last_run": last,
        "runs": runs,
        "unprocessed": {"unrated": unrated, "unembedded": unembedded, "no_ideas": no_ideas},
        "total_runs": len(runs),
        "pipeline": {
            "total_drafts": total_drafts,
            "rated": rated_count,
            "embedded": embedded_count,
            "with_ideas": ideas_count,
            "idea_total": idea_total,
            "gap_count": gap_count,
        },
        "cost": {
            "input_tokens": input_tok,
            "output_tokens": output_tok,
            "estimated_usd": round(est_cost, 2),
        },
    }


def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
    """Return citation network data for force-directed graph.

    Returns {nodes: [{id, type, title, influence, ...}],
             edges: [{source, target}],
             stats: {node_count, edge_count, ...}}
    """
    # Get all references
    rows = db.conn.execute(
        "SELECT draft_name, ref_type, ref_id FROM draft_refs"
    ).fetchall()

    # Count in-degree for each referenced item
    in_degree: dict[str, int] = Counter()
    edges_raw = []
    for r in rows:
        ref_key = f"{r['ref_type']}:{r['ref_id']}"
        in_degree[ref_key] += 1
        edges_raw.append((r["draft_name"], ref_key))

    # Also count drafts as source nodes
    draft_out: dict[str, int] = Counter()
    for draft_name, _ in edges_raw:
        draft_out[draft_name] += 1

    # Get draft titles for labeling
    draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall()
    draft_titles = {r["name"]: r["title"] for r in draft_rows}

    # Get rating categories for draft coloring
    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
    draft_cats = {}
    for r in rating_rows:
        try:
            cats = json.loads(r["categories"]) if r["categories"] else []
            draft_cats[r["draft_name"]] = cats[0] if cats else "Other"
        except Exception:
            draft_cats[r["draft_name"]] = "Other"

    # Filter: keep RFCs with min_refs+ references and all drafts that reference them
    top_refs = {k: v for k, v in in_degree.items() if v >= min_refs}

    # Build node set
    node_set = set()
    filtered_edges = []
    for draft_name, ref_key in edges_raw:
        if ref_key in top_refs:
            node_set.add(draft_name)
            node_set.add(ref_key)
            filtered_edges.append({"source": draft_name, "target": ref_key})

    # Limit to ~200 nodes max for readability
    if len(node_set) > 250:
        # Keep only refs with higher in-degree
        sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True)
        keep_refs = set(k for k, _ in sorted_refs[:80])
        node_set = set()
        filtered_edges = []
        for draft_name, ref_key in edges_raw:
            if ref_key in keep_refs:
                node_set.add(draft_name)
                node_set.add(ref_key)
                filtered_edges.append({"source": draft_name, "target": ref_key})

    # Build nodes
    nodes = []
    for nid in node_set:
        if ":" in nid and not nid.startswith("draft-"):
            # It's a reference node (rfc:1234, bcp:14, etc.)
            ref_type, ref_id = nid.split(":", 1)
            influence = in_degree.get(nid, 0)
            if ref_type == "rfc":
                try:
                    title = f"RFC {int(ref_id)}"
                except ValueError:
                    title = f"RFC {ref_id}"
            else:
                title = f"{ref_type.upper()} {ref_id}"
            nodes.append({
                "id": nid,
                "type": ref_type,
                "title": title,
                "influence": influence,
                "ref_id": ref_id,
            })
        else:
            # It's a draft node
            influence = in_degree.get(nid, 0) + draft_out.get(nid, 0)
            nodes.append({
                "id": nid,
                "type": "draft",
                "title": draft_titles.get(nid, nid),
                "influence": draft_out.get(nid, 0),
                "category": draft_cats.get(nid, "Other"),
            })

    # Stats
    rfc_count = sum(1 for n in nodes if n["type"] == "rfc")
    draft_count = sum(1 for n in nodes if n["type"] == "draft")

    return {
        "nodes": nodes,
        "edges": filtered_edges,
        "stats": {
            "node_count": len(nodes),
            "edge_count": len(filtered_edges),
            "rfc_count": rfc_count,
            "draft_count": draft_count,
        },
    }


def global_search(db: Database, query: str) -> dict:
    """Search across drafts (FTS5), ideas, authors, and gaps.

    Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
    """
    results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []}
    if not query or not query.strip():
        return results

    q = query.strip()

    # 1. Drafts via FTS5
    try:
        fts_query = " ".join(f'"{w}"' for w in q.split() if w)
        rows = db.conn.execute(
            """SELECT d.name, d.title, d.abstract, d.time, d."group"
            FROM drafts d
            JOIN drafts_fts f ON d.rowid = f.rowid
            WHERE drafts_fts MATCH ?
            ORDER BY rank
            LIMIT 50""",
            (fts_query,),
        ).fetchall()
        for r in rows:
            results["drafts"].append({
                "name": r["name"],
                "title": r["title"],
                "abstract": (r["abstract"] or "")[:200],
                "date": r["time"],
                "group": r["group"] or "individual",
            })
    except Exception:
        # FTS5 match can fail on certain query syntax; fall back to LIKE
        like = f"%{q}%"
        rows = db.conn.execute(
            """SELECT name, title, abstract, time, "group" FROM drafts
            WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ?
            LIMIT 50""",
            (like, like, like),
        ).fetchall()
        for r in rows:
            results["drafts"].append({
                "name": r["name"],
                "title": r["title"],
                "abstract": (r["abstract"] or "")[:200],
                "date": r["time"],
                "group": r["group"] or "individual",
            })

    # 2. Ideas via LIKE
    like = f"%{q}%"
    rows = db.conn.execute(
        """SELECT id, title, description, idea_type, draft_name FROM ideas
        WHERE title LIKE ? OR description LIKE ?
        ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["ideas"].append({
            "id": r["id"],
            "title": r["title"],
            "description": (r["description"] or "")[:200],
            "type": r["idea_type"],
            "draft_name": r["draft_name"],
        })

    # 3. Authors via LIKE
    rows = db.conn.execute(
        """SELECT person_id, name, affiliation FROM authors
        WHERE name LIKE ? OR affiliation LIKE ?
        ORDER BY name LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["authors"].append({
            "person_id": r["person_id"],
            "name": r["name"],
            "affiliation": r["affiliation"] or "",
        })

    # 4. Gaps via LIKE
    rows = db.conn.execute(
        """SELECT id, topic, description, category, severity FROM gaps
        WHERE topic LIKE ? OR description LIKE ?
        ORDER BY id LIMIT 50""",
        (like, like),
    ).fetchall()
    for r in rows:
        results["gaps"].append({
            "id": r["id"],
            "topic": r["topic"],
            "description": (r["description"] or "")[:200],
            "category": r["category"],
            "severity": r["severity"],
        })

    return results


def get_landscape_tsne(db: Database) -> list[dict]:
    """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].

    Uses cached coordinates if available, otherwise computes fresh.
    """
    import numpy as np

    embeddings = db.all_embeddings()
    if len(embeddings) < 5:
        return []

    pairs = db.drafts_with_ratings(limit=1000)
    rating_map = {d.name: r for d, r in pairs}
    draft_map = {d.name: d for d, _ in pairs}

    # Filter to drafts that have both embeddings and ratings
    names = [n for n in embeddings if n in rating_map]
    if len(names) < 5:
        return []

    matrix = np.array([embeddings[n] for n in names])

    try:
        from sklearn.manifold import TSNE
        tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
                     random_state=42, max_iter=500)
        coords = tsne.fit_transform(matrix)
    except Exception:
        return []

    result = []
    for i, name in enumerate(names):
        r = rating_map[name]
        d = draft_map.get(name)
        result.append({
            "name": name,
            "title": d.title if d else name,
            "x": round(float(coords[i, 0]), 3),
            "y": round(float(coords[i, 1]), 3),
            "category": r.categories[0] if r.categories else "Other",
            "score": round(r.composite_score, 2),
        })
    return result


def get_comparison_data(db: Database, names: list[str]) -> dict | None:
    """Get comparison data for a list of drafts.

    Returns {
        drafts: [{name, title, abstract, rating, ideas, refs, ...}],
        shared_ideas: [{title, drafts: [name,...]}],
        unique_ideas: {name: [{title, description}]},
        shared_refs: [{type, id, drafts: [name,...]}],
        unique_refs: {name: [{type, id}]},
        similarities: [{a, b, similarity}],
        comparison_text: str | None,
    }
    """
    import numpy as np

    drafts_data = []
    all_ideas: dict[str, list[dict]] = {}
    all_refs: dict[str, list[tuple[str, str]]] = {}

    for name in names:
        detail = get_draft_detail(db, name)
        if not detail:
            continue
        drafts_data.append(detail)
        all_ideas[name] = detail.get("ideas", [])
        all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])]

    if len(drafts_data) < 2:
        return None

    # Find shared vs unique ideas (by title similarity)
    idea_title_drafts: dict[str, list[str]] = {}
    for name, ideas in all_ideas.items():
        for idea in ideas:
            title_lower = idea["title"].lower().strip()
            if title_lower not in idea_title_drafts:
                idea_title_drafts[title_lower] = []
            idea_title_drafts[title_lower].append(name)

    shared_ideas = [
        {"title": title, "drafts": draft_list}
        for title, draft_list in idea_title_drafts.items()
        if len(set(draft_list)) > 1
    ]
    unique_ideas: dict[str, list[dict]] = {}
    for name, ideas in all_ideas.items():
        unique = []
        for idea in ideas:
            title_lower = idea["title"].lower().strip()
            if len(set(idea_title_drafts.get(title_lower, []))) <= 1:
                unique.append({"title": idea["title"], "description": idea.get("description", "")})
        unique_ideas[name] = unique

    # Find shared vs unique references
    ref_drafts: dict[tuple[str, str], list[str]] = {}
    for name, refs in all_refs.items():
        for ref in refs:
            if ref not in ref_drafts:
                ref_drafts[ref] = []
            ref_drafts[ref].append(name)

    shared_refs = [
        {"type": ref[0], "id": ref[1], "drafts": draft_list}
        for ref, draft_list in ref_drafts.items()
        if len(set(draft_list)) > 1
    ]
    unique_refs: dict[str, list[dict]] = {}
    for name, refs in all_refs.items():
        unique = []
        for ref in refs:
            if len(set(ref_drafts.get(ref, []))) <= 1:
                unique.append({"type": ref[0], "id": ref[1]})
        unique_refs[name] = unique

    # Pairwise embedding similarities
    embeddings = db.all_embeddings()
    similarities = []
    valid_names = [d["name"] for d in drafts_data]
    for i in range(len(valid_names)):
        for j in range(i + 1, len(valid_names)):
            a, b = valid_names[i], valid_names[j]
            if a in embeddings and b in embeddings:
                vec_a = embeddings[a]
                vec_b = embeddings[b]
                dot = np.dot(vec_a, vec_b)
                norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
                sim = float(dot / norm) if norm > 0 else 0.0
                similarities.append({"a": a, "b": b, "similarity": round(sim, 4)})

    return {
        "drafts": drafts_data,
        "shared_ideas": shared_ideas,
        "unique_ideas": unique_ideas,
        "shared_refs": shared_refs,
        "unique_refs": unique_refs,
        "similarities": similarities,
        "comparison_text": None,
    }


def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
    """Search-only (free) — returns sources + cached answer if available."""
    from ietf_analyzer.config import Config
    from ietf_analyzer.search import HybridSearch

    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.search_only(question, top_k=top_k)


def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Run Claude synthesis (costs tokens, result is cached permanently)."""
    from ietf_analyzer.config import Config
    from ietf_analyzer.search import HybridSearch

    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.ask(question, top_k=top_k, cheap=cheap)