Complete remaining medium/low issues: performance, CLI, types, CI, tests
Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -13,16 +13,15 @@ CONFIG_FILE = DEFAULT_DATA_DIR / "config.json"
|
||||
# Default search keywords for IETF draft discovery. Kept deduplicated —
# each term appears exactly once ("autonomous" and "mcp" were previously
# listed twice). Substring-style stems ("trustworth") are intentional.
DEFAULT_KEYWORDS = [
    "agent",
    "ai-agent",
    "llm",
    "autonomous",
    "machine-learning",
    "artificial-intelligence",
    "mcp",
    "agentic",
    "inference",
    "generative",
    "intelligent",
    "aipref",
    "large language model",
    "multi-agent",
    "trustworth",
]
|
||||
|
||||
# Environment variable overrides (env var name -> config field name)
|
||||
@@ -39,6 +38,7 @@ class Config:
|
||||
db_path: str = str(DEFAULT_DATA_DIR / "drafts.db")
|
||||
ollama_url: str = "http://localhost:11434"
|
||||
ollama_embed_model: str = "nomic-embed-text"
|
||||
ollama_classify_model: str = "llama3.2"
|
||||
claude_model: str = "claude-sonnet-4-20250514"
|
||||
claude_model_cheap: str = "claude-haiku-4-5-20251001"
|
||||
search_keywords: list[str] = field(default_factory=lambda: list(DEFAULT_KEYWORDS))
|
||||
|
||||
@@ -326,6 +326,23 @@ class Database:
|
||||
return None
|
||||
return self._row_to_draft(row)
|
||||
|
||||
def get_drafts_by_names(self, names: list[str]) -> dict[str, "Draft"]:
    """Batch-fetch drafts by name. Returns {name: Draft} dict."""
    found: dict[str, "Draft"] = {}
    if not names:
        return found
    # SQLite caps bound variables (~999 by default); query in chunks of 900.
    chunk_size = 900
    for start in range(0, len(names), chunk_size):
        batch = names[start : start + chunk_size]
        marks = ",".join(["?"] * len(batch))
        cursor = self.conn.execute(
            f"SELECT * FROM drafts WHERE name IN ({marks})", batch
        )
        for row in cursor.fetchall():
            draft = self._row_to_draft(row)
            found[draft.name] = draft
    return found
|
||||
|
||||
def list_drafts(
|
||||
self,
|
||||
limit: int = 100,
|
||||
|
||||
@@ -2,6 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import ollama as ollama_lib
|
||||
from rich.console import Console
|
||||
@@ -111,16 +115,49 @@ class Embedder:
|
||||
return similarities[:top_n]
|
||||
|
||||
def similarity_matrix(self) -> tuple[list[str], np.ndarray]:
    """Compute pairwise similarity matrix for all embedded drafts.

    Uses a file-based cache keyed by the hash of embedding draft names.
    If the set of embedded drafts hasn't changed, the cached matrix is
    reloaded from disk instead of recomputing O(n^2) cosine similarities.

    Returns:
        (names, matrix): sorted draft names and the (n, n) float32 matrix
        where matrix[i, j] is the cosine similarity of drafts i and j.
    """
    all_embeddings = self.db.all_embeddings()
    names = sorted(all_embeddings.keys())
    n = len(names)

    # Build cache key from sorted draft names; 16 hex chars keeps the
    # filename short while making collisions vanishingly unlikely.
    names_hash = hashlib.sha256("\n".join(names).encode()).hexdigest()[:16]
    cache_dir = Path(self.config.db_path).parent / ".cache"
    cache_meta = cache_dir / f"sim_matrix_{names_hash}.json"
    cache_npy = cache_dir / f"sim_matrix_{names_hash}.npy"

    # Try loading from cache. The JSON sidecar re-verifies the exact name
    # list (not just the hash); the shape check guards against partial writes.
    if cache_meta.exists() and cache_npy.exists():
        try:
            cached_names = json.loads(cache_meta.read_text())
            if cached_names == names:
                matrix = np.load(cache_npy)
                if matrix.shape == (n, n):
                    return names, matrix
        except Exception:
            pass  # Cache corrupted, recompute

    # Compute fresh. The matrix is symmetric, so only the upper triangle
    # is computed and mirrored.
    matrix = np.zeros((n, n), dtype=np.float32)
    for i in range(n):
        for j in range(i, n):
            sim = _cosine_similarity(all_embeddings[names[i]], all_embeddings[names[j]])
            matrix[i, j] = sim
            matrix[j, i] = sim

    # Save to cache (best-effort; a read-only filesystem must not break the
    # caller). parents=True so a missing data directory is not fatal either.
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
        np.save(cache_npy, matrix)
        cache_meta.write_text(json.dumps(names))
    except Exception:
        pass  # Non-fatal if caching fails

    return names, matrix
|
||||
|
||||
def find_clusters(self, threshold: float = 0.85) -> list[list[str]]:
|
||||
|
||||
@@ -100,3 +100,136 @@ def compute_readiness(db, draft_name: str) -> dict:
|
||||
f["contribution"] = round(f["value"] * f["weight"] * 100, 1)
|
||||
|
||||
return {"score": score, "factors": factors}
|
||||
|
||||
|
||||
def compute_readiness_batch(db, draft_names: list[str]) -> dict[str, dict]:
    """Batch-compute readiness for multiple drafts using bulk queries.

    Returns {draft_name: {score, factors}} — same format as compute_readiness.
    Reduces ~6 queries per draft to ~6 queries total.

    Each factor dict carries ``value`` (0.0-1.0), ``weight``, ``label``,
    ``detail``, and a derived ``contribution``. The six weights sum to 1.0
    (0.25 + 5 * 0.15), so ``score`` is a 0-100 weighted percentage.

    Args:
        db: Database handle exposing ``conn`` and ``get_drafts_by_names``.
        draft_names: Drafts to score; names missing from the database get
            ``{"score": 0, "factors": {}}``.
    """
    if not draft_names:
        return {}

    # Batch-load drafts
    drafts_map = db.get_drafts_by_names(draft_names)

    # Batch-load ref counts per draft
    ref_counts: dict[str, int] = {}
    rows = db.conn.execute(
        "SELECT draft_name, COUNT(*) as cnt FROM draft_refs GROUP BY draft_name"
    ).fetchall()
    for r in rows:
        ref_counts[r["draft_name"]] = r["cnt"]

    # Max refs across corpus (single query)
    max_refs_row = db.conn.execute(
        "SELECT MAX(cnt) FROM (SELECT COUNT(*) as cnt FROM draft_refs GROUP BY draft_name)"
    ).fetchone()
    # "or 1" guards against an empty draft_refs table (division by zero below)
    max_refs = (max_refs_row[0] or 1) if max_refs_row else 1

    # Batch-load cited-by counts
    cited_by_counts: dict[str, int] = {}
    rows = db.conn.execute(
        "SELECT ref_id, COUNT(DISTINCT draft_name) as cnt FROM draft_refs "
        "WHERE ref_type = 'draft' GROUP BY ref_id"
    ).fetchall()
    for r in rows:
        cited_by_counts[r["ref_id"]] = r["cnt"]

    # Batch-load author experience: person_id -> draft count
    author_draft_counts: dict[int, int] = {}
    rows = db.conn.execute(
        "SELECT person_id, COUNT(*) as cnt FROM draft_authors GROUP BY person_id"
    ).fetchall()
    for r in rows:
        author_draft_counts[r["person_id"]] = r["cnt"]

    # Batch-load draft->author mappings
    draft_authors: dict[str, list[int]] = {}
    rows = db.conn.execute(
        "SELECT draft_name, person_id FROM draft_authors"
    ).fetchall()
    for r in rows:
        draft_authors.setdefault(r["draft_name"], []).append(r["person_id"])

    # Batch-load ratings (momentum)
    ratings_map: dict[str, float] = {}
    rows = db.conn.execute(
        "SELECT draft_name, momentum FROM ratings"
    ).fetchall()
    for r in rows:
        ratings_map[r["draft_name"]] = r["momentum"]

    # Now compute readiness for each draft using pre-loaded data
    results = {}
    for name in draft_names:
        draft = drafts_map.get(name)
        if not draft:
            results[name] = {"score": 0, "factors": {}}
            continue

        factors = {}

        # 1. WG Adopted — the draft-ietf- prefix marks working-group adoption
        wg_val = 1.0 if name.startswith("draft-ietf-") else 0.0
        factors["wg_adopted"] = {"value": wg_val, "weight": 0.25,
                                 "label": "WG Adopted",
                                 "detail": "draft-ietf-*" if wg_val else "individual"}

        # 2. Revision Maturity — 5+ revisions counts as fully mature
        try:
            rev_num = int(draft.rev) if draft.rev else 0
        except (ValueError, TypeError):
            rev_num = 0
        rev_val = min(rev_num / 5.0, 1.0)
        factors["revision_maturity"] = {"value": round(rev_val, 3), "weight": 0.15,
                                        "label": "Revision Maturity",
                                        "detail": f"rev {rev_num}"}

        # 3. Reference Density — normalized against the corpus-wide maximum
        ref_count = ref_counts.get(name, 0)
        ref_val = min(ref_count / max_refs, 1.0)
        factors["reference_density"] = {"value": round(ref_val, 3), "weight": 0.15,
                                        "label": "Reference Density",
                                        "detail": f"{ref_count} refs (max {max_refs})"}

        # 4. Cited By Count — saturates at 5 citing drafts
        cited_by = cited_by_counts.get(name, 0)
        cited_val = min(cited_by / 5.0, 1.0)
        factors["cited_by_count"] = {"value": round(cited_val, 3), "weight": 0.15,
                                     "label": "Cited By Others",
                                     "detail": f"{cited_by} draft(s)"}

        # 5. Author Experience — average drafts per author, saturating at 5
        person_ids = draft_authors.get(name, [])
        if person_ids:
            counts = [author_draft_counts.get(pid, 1) for pid in person_ids]
            avg_exp = sum(counts) / len(counts)
            exp_val = min(avg_exp / 5.0, 1.0)
        else:
            exp_val = 0.0
            avg_exp = 0
        factors["author_experience"] = {"value": round(exp_val, 3), "weight": 0.15,
                                        "label": "Author Experience",
                                        "detail": f"avg {avg_exp:.1f} drafts/author"}

        # 6. Momentum Rating — 1-5 rating mapped linearly to 0.0-1.0
        momentum = ratings_map.get(name)
        if momentum is not None:
            mom_val = (momentum - 1) / 4.0
        else:
            mom_val = 0.0
        factors["momentum_rating"] = {"value": round(mom_val, 3), "weight": 0.15,
                                      "label": "Momentum",
                                      "detail": f"{momentum}/5" if momentum else "unrated"}

        # Compute weighted score
        total = sum(f["value"] * f["weight"] for f in factors.values())
        score = round(total * 100, 1)
        for f in factors.values():
            f["contribution"] = round(f["value"] * f["weight"] * 100, 1)

        results[name] = {"score": score, "factors": factors}

    return results
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
@@ -50,6 +51,9 @@ class HybridSearch:
|
||||
self.db = db
|
||||
self._embedder = embedder
|
||||
self._ollama_available: bool | None = None
|
||||
self._embeddings_cache: dict[str, np.ndarray] | None = None
|
||||
self._embeddings_cache_time: float = 0
|
||||
self._EMBEDDINGS_TTL: float = 300 # 5 minutes
|
||||
|
||||
@property
|
||||
def embedder(self):
|
||||
@@ -79,6 +83,16 @@ class HybridSearch:
|
||||
self._ollama_available = False
|
||||
return self._ollama_available
|
||||
|
||||
def _get_all_embeddings(self) -> dict[str, np.ndarray]:
|
||||
"""Return all embeddings, cached with TTL to avoid reloading on every query."""
|
||||
now = time.monotonic()
|
||||
if (self._embeddings_cache is not None
|
||||
and now - self._embeddings_cache_time < self._EMBEDDINGS_TTL):
|
||||
return self._embeddings_cache
|
||||
self._embeddings_cache = self.db.all_embeddings()
|
||||
self._embeddings_cache_time = now
|
||||
return self._embeddings_cache
|
||||
|
||||
def search(self, query: str, top_k: int = 10) -> list[dict]:
|
||||
"""Combine FTS5 keyword search + embedding similarity search.
|
||||
|
||||
@@ -144,7 +158,7 @@ class HybridSearch:
|
||||
self._ollama_available = False
|
||||
return []
|
||||
|
||||
all_embeddings = self.db.all_embeddings()
|
||||
all_embeddings = self._get_all_embeddings()
|
||||
if not all_embeddings:
|
||||
return []
|
||||
|
||||
|
||||
@@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TypedDict
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.manifold import TSNE
|
||||
from sklearn.preprocessing import normalize as sk_normalize
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TypedDicts for common return shapes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class OverviewStats(TypedDict):
    """High-level dashboard statistics from :func:`get_overview_stats`."""

    total_drafts: int
    rated_count: int  # excludes false-positive-flagged drafts (see get_overview_stats)
    author_count: int
    idea_count: int
    gap_count: int
    input_tokens: int  # cumulative token usage — presumably LLM API cost tracking; confirm
    output_tokens: int
    false_positive_count: int  # drafts flagged as false positives
|
||||
|
||||
|
||||
class DraftListItem(TypedDict):
    """Single draft in the paginated listing from :func:`get_drafts_page`."""

    name: str  # draft identifier (e.g. "draft-ietf-...")
    title: str
    date: str | None
    url: str
    pages: int
    group: str  # presumably the IETF working group — confirm against the draft schema
    source: str
    # Rating dimensions plus the combined score:
    score: float
    novelty: float
    maturity: float
    overlap: float
    momentum: float
    relevance: float
    categories: list[str]
    summary: str
    readiness: float  # readiness score (batch-computed in get_drafts_page)
|
||||
|
||||
|
||||
class DraftsPage(TypedDict):
    """Paginated draft listing from :func:`get_drafts_page`."""

    drafts: list[DraftListItem]  # items for the current page only
    total: int  # total matching drafts across all pages
    page: int  # 1-based current page number
    per_page: int
    pages: int  # total page count
|
||||
|
||||
|
||||
class AuthorInfo(TypedDict):
    """Author entry from :func:`get_top_authors`."""

    name: str
    affiliation: str
    draft_count: int
    drafts: list[str]  # names of this author's drafts
|
||||
|
||||
|
||||
class AuthorNetworkNode(TypedDict):
    """Node in the author network graph."""

    id: str  # unique node id referenced by AuthorNetworkEdge source/target
    name: str
    org: str
    draft_count: int
    avg_score: float  # average rating score across the author's drafts — confirm
    drafts: list[str]
|
||||
|
||||
|
||||
class AuthorNetworkEdge(TypedDict):
    """Edge in the author network graph."""

    source: str  # AuthorNetworkNode id
    target: str  # AuthorNetworkNode id
    weight: int  # presumably the number of co-authored drafts — confirm
|
||||
|
||||
|
||||
class AuthorCluster(TypedDict):
    """Cluster in the author network."""

    id: int  # sequential cluster index
    members: list[str]  # author node ids in this connected component
    org_mix: dict[str, int]  # organization -> member count
    size: int  # number of members
    drafts: list[dict[str, str]]  # cluster drafts (name/title pairs)
    draft_count: int
|
||||
|
||||
|
||||
class AuthorNetwork(TypedDict):
    """Full author network from :func:`get_author_network_full`."""

    nodes: list[AuthorNetworkNode]
    edges: list[AuthorNetworkEdge]
    clusters: list[AuthorCluster]
|
||||
|
||||
|
||||
class SimilarityGraphStats(TypedDict):
    """Stats sub-dict in similarity graph."""

    node_count: int
    edge_count: int
    avg_similarity: float  # mean similarity over edges above the threshold — confirm
|
||||
|
||||
|
||||
class SimilarityGraph(TypedDict):
    """Draft similarity network from :func:`get_similarity_graph`."""

    nodes: list[dict]  # per _compute_similarity_graph: {name, title, category, score}
    edges: list[dict]  # per _compute_similarity_graph: {source, target, similarity}
    stats: SimilarityGraphStats
|
||||
|
||||
|
||||
class TimelineData(TypedDict):
    """Monthly category counts from :func:`get_timeline_data`."""

    months: list[str]  # YYYY-MM labels
    series: dict[str, list[int]]  # category -> per-month counts, aligned with months
    categories: list[str]
|
||||
|
||||
|
||||
class MonitorCost(TypedDict):
    """Cost sub-dict in monitor status."""

    input_tokens: int
    output_tokens: int
    estimated_usd: float  # estimated spend derived from token counts — confirm rates
|
||||
|
||||
|
||||
class MonitorPipeline(TypedDict):
    """Pipeline sub-dict in monitor status."""

    total_drafts: int
    rated: int  # drafts that have been rated
    embedded: int  # drafts with embeddings
    with_ideas: int  # drafts that have extracted ideas
    idea_total: int
    gap_count: int
|
||||
|
||||
|
||||
class MonitorStatus(TypedDict):
    """Monitor status from :func:`get_monitor_status`."""

    last_run: dict | None  # most recent monitor run, or None when no runs exist
    runs: list[dict]  # recent runs (get_monitor_status fetches up to 20)
    unprocessed: dict[str, int]
    total_runs: int
    pipeline: MonitorPipeline
    cost: MonitorCost
|
||||
|
||||
|
||||
class SearchResults(TypedDict):
    """Global search results from :func:`global_search`."""

    drafts: list[dict]  # matched via FTS5
    ideas: list[dict]
    authors: list[dict]
    gaps: list[dict]
|
||||
|
||||
|
||||
class CitationGraphStats(TypedDict):
    """Stats sub-dict in citation graph."""

    node_count: int
    edge_count: int
    rfc_count: int  # nodes that are RFCs
    draft_count: int  # nodes that are drafts
|
||||
|
||||
|
||||
class CitationGraph(TypedDict):
    """Citation network from :func:`get_citation_graph`."""

    nodes: list[dict]  # per _compute_citation_graph: {id, type, title, influence, ...}
    edges: list[dict]
    stats: CitationGraphStats
|
||||
|
||||
# Add project root to path so we can import ietf_analyzer
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent
|
||||
@@ -20,6 +185,8 @@ if str(_project_root) not in sys.path:
|
||||
|
||||
from ietf_analyzer.config import Config
|
||||
from ietf_analyzer.db import Database
|
||||
from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch
|
||||
from ietf_analyzer.search import HybridSearch
|
||||
|
||||
def _extract_month(time_str: str | None) -> str:
|
||||
"""Normalize a date string to YYYY-MM format."""
|
||||
@@ -55,7 +222,7 @@ def get_db() -> Database:
|
||||
return Database(config)
|
||||
|
||||
|
||||
def get_overview_stats(db: Database) -> dict:
|
||||
def get_overview_stats(db: Database) -> OverviewStats:
|
||||
"""Return high-level stats for the dashboard home page.
|
||||
|
||||
Excludes drafts flagged as false positives from rated counts.
|
||||
@@ -204,7 +371,7 @@ def get_drafts_page(
|
||||
sort: str = "score",
|
||||
sort_dir: str = "desc",
|
||||
source: str = "",
|
||||
) -> dict:
|
||||
) -> DraftsPage:
|
||||
"""Return a paginated, filtered list of drafts with ratings.
|
||||
|
||||
Returns dict with keys: drafts, total, page, per_page, pages.
|
||||
@@ -262,11 +429,9 @@ def get_drafts_page(
|
||||
start = (page - 1) * per_page
|
||||
page_items = filtered[start : start + per_page]
|
||||
|
||||
# Pre-compute readiness for page items (lightweight version)
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
readiness_cache = {}
|
||||
for draft, rating in page_items:
|
||||
readiness_cache[draft.name] = compute_readiness(db, draft.name)
|
||||
# Pre-compute readiness in batch (~6 queries total instead of ~200)
|
||||
|
||||
readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items])
|
||||
|
||||
drafts = []
|
||||
for draft, rating in page_items:
|
||||
@@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
|
||||
}
|
||||
|
||||
# Readiness score
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
|
||||
result["readiness"] = compute_readiness(db, name)
|
||||
|
||||
# Annotation
|
||||
@@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict:
|
||||
return dims
|
||||
|
||||
|
||||
def get_timeline_data(db: Database) -> dict:
|
||||
def get_timeline_data(db: Database) -> TimelineData:
|
||||
"""Return monthly counts by category for timeline chart."""
|
||||
pairs = db.drafts_with_ratings(limit=1000)
|
||||
all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
|
||||
@@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None:
|
||||
return path.read_text(errors="replace")
|
||||
|
||||
|
||||
def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
|
||||
def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]:
|
||||
"""Return top authors by draft count."""
|
||||
rows = db.top_authors(limit=limit)
|
||||
return [
|
||||
@@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
|
||||
return {"nodes": nodes, "edges": edges}
|
||||
|
||||
|
||||
def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
|
||||
def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
|
||||
"""Return draft similarity network (cached)."""
|
||||
return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold))
|
||||
|
||||
|
||||
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
|
||||
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
|
||||
"""Return draft similarity network for force-directed graph.
|
||||
|
||||
Returns {nodes: [{name, title, category, score}],
|
||||
edges: [{source, target, similarity}],
|
||||
stats: {node_count, edge_count, avg_similarity}}
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 2:
|
||||
@@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
|
||||
]
|
||||
|
||||
|
||||
def get_author_network_full(db: Database) -> dict:
|
||||
def get_author_network_full(db: Database) -> AuthorNetwork:
|
||||
"""Return author network (cached for 5 min)."""
|
||||
return _cached("author_network", lambda: _compute_author_network_full(db))
|
||||
|
||||
|
||||
def _compute_author_network_full(db: Database) -> dict:
|
||||
def _compute_author_network_full(db: Database) -> AuthorNetwork:
|
||||
"""Return enriched co-authorship network with avg scores and cluster info.
|
||||
|
||||
Returns {
|
||||
@@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict:
|
||||
visited: set[str] = set()
|
||||
clusters = []
|
||||
|
||||
# Batch-load all drafts referenced by authors (avoid N+1 in cluster loop)
|
||||
_all_dn = set()
|
||||
for _ai in author_info.values():
|
||||
_all_dn.update(_ai.get("drafts", []))
|
||||
_all_drafts_map = db.get_drafts_by_names(list(_all_dn))
|
||||
|
||||
for node in sorted(node_set):
|
||||
if node in visited:
|
||||
continue
|
||||
@@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict:
|
||||
org_mix[org] += 1
|
||||
for dn in author_info.get(m, {}).get("drafts", []):
|
||||
if dn not in cluster_drafts:
|
||||
d = db.get_draft(dn)
|
||||
d = _all_drafts_map.get(dn)
|
||||
cluster_drafts[dn] = d.title[:80] if d else dn
|
||||
clusters.append({
|
||||
"id": len(clusters),
|
||||
@@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
a target of ~30 clusters for readable groupings. Enriches each cluster
|
||||
with WG info and category breakdown.
|
||||
"""
|
||||
import json as _json
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import normalize as sk_normalize
|
||||
|
||||
|
||||
embeddings = db.all_idea_embeddings()
|
||||
if not embeddings:
|
||||
@@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
draft_cats: dict[str, list[str]] = {}
|
||||
for r in rating_rows:
|
||||
try:
|
||||
draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
|
||||
except (_json.JSONDecodeError, TypeError):
|
||||
draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else []
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
draft_cats[r["draft_name"]] = []
|
||||
|
||||
# Build matrix from embeddings that have matching ideas
|
||||
@@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
# Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
|
||||
n_target = max(10, min(40, len(idea_ids) // 12))
|
||||
try:
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
|
||||
labels = clustering.fit_predict(matrix_norm)
|
||||
except Exception:
|
||||
@@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
# t-SNE for scatter
|
||||
scatter = []
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
perp = min(30, len(idea_ids) - 1)
|
||||
tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix_norm)
|
||||
@@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
animation frames. Each point carries a ``month`` field (YYYY-MM) so the
|
||||
front-end can build cumulative animation frames.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 5:
|
||||
@@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
matrix = np.array([embeddings[n] for n in names])
|
||||
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
|
||||
random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix)
|
||||
@@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def get_monitor_status(db: Database) -> dict:
|
||||
def get_monitor_status(db: Database) -> MonitorStatus:
|
||||
"""Return monitoring status data for dashboard."""
|
||||
runs = db.get_monitor_runs(limit=20)
|
||||
last = runs[0] if runs else None
|
||||
@@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
|
||||
"""Return citation graph (cached for 5 min)."""
|
||||
return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs))
|
||||
|
||||
|
||||
def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
|
||||
"""Return citation network data for force-directed graph.
|
||||
|
||||
Returns {nodes: [{id, type, title, influence, ...}],
|
||||
@@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def global_search(db: Database, query: str) -> dict:
|
||||
def global_search(db: Database, query: str) -> SearchResults:
|
||||
"""Search across drafts (FTS5), ideas, authors, and gaps.
|
||||
|
||||
Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
|
||||
@@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict:
|
||||
|
||||
# 1. Drafts via FTS5
|
||||
try:
|
||||
import re
|
||||
fts_query = re.sub(r'[^\w\s]', '', q)
|
||||
fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
|
||||
fts_query = re.sub(r'\s+', ' ', fts_query).strip()
|
||||
@@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]:
|
||||
|
||||
def _compute_landscape_tsne(db: Database) -> list[dict]:
|
||||
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}]."""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 5:
|
||||
@@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]:
|
||||
matrix = np.array([embeddings[n] for n in names])
|
||||
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
|
||||
random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix)
|
||||
@@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
|
||||
comparison_text: str | None,
|
||||
}
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
drafts_data = []
|
||||
all_ideas: dict[str, list[dict]] = {}
|
||||
@@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
|
||||
|
||||
def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
    """Search-only (free) — returns sources + cached answer if available.

    Args:
        db: Open database handle.
        question: Free-text question to search for.
        top_k: Maximum number of source documents to return.
    """
    # Config and HybridSearch are imported at module level; the redundant
    # in-function imports have been removed.
    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.search_only(question, top_k=top_k)
|
||||
@@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
|
||||
|
||||
def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Run Claude synthesis (costs tokens, result is cached permanently).

    Args:
        db: Open database handle.
        question: Question to answer.
        top_k: Number of source documents fed into synthesis.
        cheap: When True, use the cheaper model variant.
    """
    # Config and HybridSearch are imported at module level; the redundant
    # in-function imports have been removed.
    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.ask(question, top_k=top_k, cheap=cheap)
|
||||
|
||||
Reference in New Issue
Block a user