Complete remaining medium/low issues: performance, CLI, types, CI, tests

Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 14:06:54 +01:00
parent e7527ad68e
commit 20c45a7eba
14 changed files with 2305 additions and 1238 deletions

View File

@@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering.
from __future__ import annotations
import json
import re
import sys
import time
from collections import Counter, defaultdict
from functools import lru_cache
from pathlib import Path
from typing import TypedDict
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize as sk_normalize
# ---------------------------------------------------------------------------
# TypedDicts for common return shapes
# ---------------------------------------------------------------------------
class OverviewStats(TypedDict):
    """High-level dashboard statistics from :func:`get_overview_stats`."""

    total_drafts: int          # all drafts known to the database
    rated_count: int           # drafts with a rating; false positives are excluded per get_overview_stats
    author_count: int          # distinct authors
    idea_count: int            # extracted ideas
    gap_count: int             # identified gaps
    input_tokens: int          # cumulative LLM input tokens spent
    output_tokens: int         # cumulative LLM output tokens spent
    false_positive_count: int  # drafts flagged as false positives
class DraftListItem(TypedDict):
    """Single draft in the paginated listing from :func:`get_drafts_page`."""

    name: str              # draft identifier (e.g. "draft-xyz-...")
    title: str
    date: str | None       # may be missing for some drafts
    url: str
    pages: int             # NOTE(review): presumably the draft's page length — confirm against caller
    group: str             # working group
    source: str
    score: float           # overall rating score
    novelty: float         # rating dimension
    maturity: float        # rating dimension
    overlap: float         # rating dimension
    momentum: float        # rating dimension
    relevance: float       # rating dimension
    categories: list[str]
    summary: str
    readiness: float       # from compute_readiness_batch (see get_drafts_page)
class DraftsPage(TypedDict):
    """Paginated draft listing from :func:`get_drafts_page`."""

    drafts: list[DraftListItem]  # items on the current page
    total: int                   # total matching drafts across all pages
    page: int                    # current page number (1-based per the start offset computation)
    per_page: int                # page size
    pages: int                   # total number of pages
class AuthorInfo(TypedDict):
    """Author entry from :func:`get_top_authors`."""

    name: str
    affiliation: str
    draft_count: int
    drafts: list[str]  # draft names authored
class AuthorNetworkNode(TypedDict):
    """Node in the author network graph."""

    id: str            # node identifier (author key)
    name: str
    org: str           # affiliation/organization
    draft_count: int
    avg_score: float   # average rating score across the author's drafts
    drafts: list[str]  # draft names authored
class AuthorNetworkEdge(TypedDict):
    """Edge in the author network graph."""

    source: str  # id of one endpoint author
    target: str  # id of the other endpoint author
    weight: int  # presumably the count of shared drafts (cf. min_shared in get_coauthor_network) — confirm
class AuthorCluster(TypedDict):
    """Cluster (connected component) in the author network."""

    id: int                       # sequential cluster index (len(clusters) at creation time)
    members: list[str]            # author ids in this cluster
    org_mix: dict[str, int]       # organization -> member count within the cluster
    size: int
    drafts: list[dict[str, str]]  # drafts touched by the cluster; titles truncated to 80 chars
    draft_count: int
class AuthorNetwork(TypedDict):
    """Full author network from :func:`get_author_network_full`."""

    nodes: list[AuthorNetworkNode]
    edges: list[AuthorNetworkEdge]
    clusters: list[AuthorCluster]
class SimilarityGraphStats(TypedDict):
    """Stats sub-dict in the draft similarity graph."""

    node_count: int
    edge_count: int
    avg_similarity: float  # mean pairwise similarity over retained edges
class SimilarityGraph(TypedDict):
    """Draft similarity network from :func:`get_similarity_graph`."""

    nodes: list[dict]  # {name, title, category, score} per _compute_similarity_graph docstring
    edges: list[dict]  # {source, target, similarity}
    stats: SimilarityGraphStats
class TimelineData(TypedDict):
    """Monthly category counts from :func:`get_timeline_data`."""

    months: list[str]            # "YYYY-MM" labels (normalized via _extract_month)
    series: dict[str, list[int]] # category -> per-month counts, aligned with `months`
    categories: list[str]
class MonitorCost(TypedDict):
    """Cost sub-dict in monitor status."""

    input_tokens: int
    output_tokens: int
    estimated_usd: float  # estimated spend derived from token counts
class MonitorPipeline(TypedDict):
    """Pipeline progress sub-dict in monitor status."""

    total_drafts: int
    rated: int       # drafts that have been rated
    embedded: int    # drafts with embeddings
    with_ideas: int  # drafts with at least one extracted idea
    idea_total: int
    gap_count: int
class MonitorStatus(TypedDict):
    """Monitor status from :func:`get_monitor_status`."""

    last_run: dict | None        # most recent monitor run, None when no runs exist
    runs: list[dict]             # recent runs (limit=20 in get_monitor_status)
    unprocessed: dict[str, int]  # pending-work counters keyed by stage name
    total_runs: int
    pipeline: MonitorPipeline
    cost: MonitorCost
class SearchResults(TypedDict):
    """Global search results from :func:`global_search` (drafts via FTS5)."""

    drafts: list[dict]
    ideas: list[dict]
    authors: list[dict]
    gaps: list[dict]
class CitationGraphStats(TypedDict):
    """Stats sub-dict in the citation graph."""

    node_count: int
    edge_count: int
    rfc_count: int    # nodes that are RFCs
    draft_count: int  # nodes that are drafts
class CitationGraph(TypedDict):
    """Citation network from :func:`get_citation_graph`."""

    nodes: list[dict]  # {id, type, title, influence, ...} per _compute_citation_graph docstring
    edges: list[dict]
    stats: CitationGraphStats
# Add project root to path so we can import ietf_analyzer
_project_root = Path(__file__).resolve().parent.parent.parent
@@ -20,6 +185,8 @@ if str(_project_root) not in sys.path:
from ietf_analyzer.config import Config
from ietf_analyzer.db import Database
from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch
from ietf_analyzer.search import HybridSearch
def _extract_month(time_str: str | None) -> str:
"""Normalize a date string to YYYY-MM format."""
@@ -55,7 +222,7 @@ def get_db() -> Database:
return Database(config)
def get_overview_stats(db: Database) -> dict:
def get_overview_stats(db: Database) -> OverviewStats:
"""Return high-level stats for the dashboard home page.
Excludes drafts flagged as false positives from rated counts.
@@ -204,7 +371,7 @@ def get_drafts_page(
sort: str = "score",
sort_dir: str = "desc",
source: str = "",
) -> dict:
) -> DraftsPage:
"""Return a paginated, filtered list of drafts with ratings.
Returns dict with keys: drafts, total, page, per_page, pages.
@@ -262,11 +429,9 @@ def get_drafts_page(
start = (page - 1) * per_page
page_items = filtered[start : start + per_page]
# Pre-compute readiness for page items (lightweight version)
from ietf_analyzer.readiness import compute_readiness
readiness_cache = {}
for draft, rating in page_items:
readiness_cache[draft.name] = compute_readiness(db, draft.name)
# Pre-compute readiness in batch (~6 queries total instead of ~200)
readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items])
drafts = []
for draft, rating in page_items:
@@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
}
# Readiness score
from ietf_analyzer.readiness import compute_readiness
result["readiness"] = compute_readiness(db, name)
# Annotation
@@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict:
return dims
def get_timeline_data(db: Database) -> dict:
def get_timeline_data(db: Database) -> TimelineData:
"""Return monthly counts by category for timeline chart."""
pairs = db.drafts_with_ratings(limit=1000)
all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
@@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None:
return path.read_text(errors="replace")
def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]:
"""Return top authors by draft count."""
rows = db.top_authors(limit=limit)
return [
@@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
return {"nodes": nodes, "edges": edges}
def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
"""Return draft similarity network (cached)."""
return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold))
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
"""Return draft similarity network for force-directed graph.
Returns {nodes: [{name, title, category, score}],
edges: [{source, target, similarity}],
stats: {node_count, edge_count, avg_similarity}}
"""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 2:
@@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
]
def get_author_network_full(db: Database) -> dict:
def get_author_network_full(db: Database) -> AuthorNetwork:
"""Return author network (cached for 5 min)."""
return _cached("author_network", lambda: _compute_author_network_full(db))
def _compute_author_network_full(db: Database) -> dict:
def _compute_author_network_full(db: Database) -> AuthorNetwork:
"""Return enriched co-authorship network with avg scores and cluster info.
Returns {
@@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict:
visited: set[str] = set()
clusters = []
# Batch-load all drafts referenced by authors (avoid N+1 in cluster loop)
_all_dn = set()
for _ai in author_info.values():
_all_dn.update(_ai.get("drafts", []))
_all_drafts_map = db.get_drafts_by_names(list(_all_dn))
for node in sorted(node_set):
if node in visited:
continue
@@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict:
org_mix[org] += 1
for dn in author_info.get(m, {}).get("drafts", []):
if dn not in cluster_drafts:
d = db.get_draft(dn)
d = _all_drafts_map.get(dn)
cluster_drafts[dn] = d.title[:80] if d else dn
clusters.append({
"id": len(clusters),
@@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict:
a target of ~30 clusters for readable groupings. Enriches each cluster
with WG info and category breakdown.
"""
import json as _json
import numpy as np
from sklearn.preprocessing import normalize as sk_normalize
embeddings = db.all_idea_embeddings()
if not embeddings:
@@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict:
draft_cats: dict[str, list[str]] = {}
for r in rating_rows:
try:
draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
except (_json.JSONDecodeError, TypeError):
draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else []
except (json.JSONDecodeError, TypeError):
draft_cats[r["draft_name"]] = []
# Build matrix from embeddings that have matching ideas
@@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict:
# Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
n_target = max(10, min(40, len(idea_ids) // 12))
try:
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
labels = clustering.fit_predict(matrix_norm)
except Exception:
@@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict:
# t-SNE for scatter
scatter = []
try:
from sklearn.manifold import TSNE
perp = min(30, len(idea_ids) - 1)
tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix_norm)
@@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
animation frames. Each point carries a ``month`` field (YYYY-MM) so the
front-end can build cumulative animation frames.
"""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 5:
@@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict:
matrix = np.array([embeddings[n] for n in names])
try:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix)
@@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
}
def get_monitor_status(db: Database) -> dict:
def get_monitor_status(db: Database) -> MonitorStatus:
"""Return monitoring status data for dashboard."""
runs = db.get_monitor_runs(limit=20)
last = runs[0] if runs else None
@@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict:
}
def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
"""Return citation graph (cached for 5 min)."""
return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs))
def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
"""Return citation network data for force-directed graph.
Returns {nodes: [{id, type, title, influence, ...}],
@@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
}
def global_search(db: Database, query: str) -> dict:
def global_search(db: Database, query: str) -> SearchResults:
"""Search across drafts (FTS5), ideas, authors, and gaps.
Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
@@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict:
# 1. Drafts via FTS5
try:
import re
fts_query = re.sub(r'[^\w\s]', '', q)
fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
fts_query = re.sub(r'\s+', ' ', fts_query).strip()
@@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]:
def _compute_landscape_tsne(db: Database) -> list[dict]:
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}]."""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 5:
@@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]:
matrix = np.array([embeddings[n] for n in names])
try:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix)
@@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
comparison_text: str | None,
}
"""
import numpy as np
drafts_data = []
all_ideas: dict[str, list[dict]] = {}
@@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
"""Search-only (free) — returns sources + cached answer if available."""
from ietf_analyzer.config import Config
from ietf_analyzer.search import HybridSearch
config = Config.load()
searcher = HybridSearch(config, db)
return searcher.search_only(question, top_k=top_k)
@@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
"""Run Claude synthesis (costs tokens, result is cached permanently)."""
from ietf_analyzer.config import Config
from ietf_analyzer.search import HybridSearch
config = Config.load()
searcher = HybridSearch(config, db)
return searcher.ask(question, top_k=top_k, cheap=cheap)