Complete remaining medium/low issues: performance, CLI, types, CI, tests

Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 14:06:54 +01:00
parent e7527ad68e
commit 20c45a7eba
14 changed files with 2305 additions and 1238 deletions

View File

@@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering.
from __future__ import annotations
import json
import re
import sys
import time
from collections import Counter, defaultdict
from functools import lru_cache
from pathlib import Path
from typing import TypedDict
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize as sk_normalize
# ---------------------------------------------------------------------------
# TypedDicts for common return shapes
# ---------------------------------------------------------------------------
class OverviewStats(TypedDict):
    """High-level dashboard statistics from :func:`get_overview_stats`."""

    total_drafts: int          # all drafts known to the database
    rated_count: int           # drafts with a rating; false positives are excluded per get_overview_stats
    author_count: int          # distinct authors
    idea_count: int            # extracted ideas
    gap_count: int             # identified gaps
    input_tokens: int          # cumulative LLM input tokens spent
    output_tokens: int         # cumulative LLM output tokens spent
    false_positive_count: int  # drafts flagged as false positives
class DraftListItem(TypedDict):
    """Single draft in the paginated listing from :func:`get_drafts_page`."""

    name: str              # draft identifier (e.g. "draft-xyz-...")
    title: str
    date: str | None       # may be missing for some drafts
    url: str
    pages: int             # NOTE(review): presumably the draft's page length — confirm against caller
    group: str             # working group
    source: str
    score: float           # overall rating score
    novelty: float         # rating dimension
    maturity: float        # rating dimension
    overlap: float         # rating dimension
    momentum: float        # rating dimension
    relevance: float       # rating dimension
    categories: list[str]
    summary: str
    readiness: float       # from compute_readiness_batch (see get_drafts_page)
class DraftsPage(TypedDict):
    """Paginated draft listing from :func:`get_drafts_page`."""

    drafts: list[DraftListItem]  # items on the current page
    total: int                   # total matching drafts across all pages
    page: int                    # current page number (1-based per the start offset computation)
    per_page: int                # page size
    pages: int                   # total number of pages
class AuthorInfo(TypedDict):
    """Author entry from :func:`get_top_authors`."""

    name: str
    affiliation: str
    draft_count: int
    drafts: list[str]  # draft names authored
class AuthorNetworkNode(TypedDict):
    """Node in the author network graph."""

    id: str            # node identifier (author key)
    name: str
    org: str           # affiliation/organization
    draft_count: int
    avg_score: float   # average rating score across the author's drafts
    drafts: list[str]  # draft names authored
class AuthorNetworkEdge(TypedDict):
    """Edge in the author network graph."""

    source: str  # id of one endpoint author
    target: str  # id of the other endpoint author
    weight: int  # presumably the count of shared drafts (cf. min_shared in get_coauthor_network) — confirm
class AuthorCluster(TypedDict):
    """Cluster (connected component) in the author network."""

    id: int                       # sequential cluster index (len(clusters) at creation time)
    members: list[str]            # author ids in this cluster
    org_mix: dict[str, int]       # organization -> member count within the cluster
    size: int
    drafts: list[dict[str, str]]  # drafts touched by the cluster; titles truncated to 80 chars
    draft_count: int
class AuthorNetwork(TypedDict):
    """Full author network from :func:`get_author_network_full`."""

    nodes: list[AuthorNetworkNode]
    edges: list[AuthorNetworkEdge]
    clusters: list[AuthorCluster]
class SimilarityGraphStats(TypedDict):
    """Stats sub-dict in the draft similarity graph."""

    node_count: int
    edge_count: int
    avg_similarity: float  # mean pairwise similarity over retained edges
class SimilarityGraph(TypedDict):
    """Draft similarity network from :func:`get_similarity_graph`."""

    nodes: list[dict]  # {name, title, category, score} per _compute_similarity_graph docstring
    edges: list[dict]  # {source, target, similarity}
    stats: SimilarityGraphStats
class TimelineData(TypedDict):
    """Monthly category counts from :func:`get_timeline_data`."""

    months: list[str]            # "YYYY-MM" labels (normalized via _extract_month)
    series: dict[str, list[int]] # category -> per-month counts, aligned with `months`
    categories: list[str]
class MonitorCost(TypedDict):
    """Cost sub-dict in monitor status."""

    input_tokens: int
    output_tokens: int
    estimated_usd: float  # estimated spend derived from token counts
class MonitorPipeline(TypedDict):
    """Pipeline progress sub-dict in monitor status."""

    total_drafts: int
    rated: int       # drafts that have been rated
    embedded: int    # drafts with embeddings
    with_ideas: int  # drafts with at least one extracted idea
    idea_total: int
    gap_count: int
class MonitorStatus(TypedDict):
    """Monitor status from :func:`get_monitor_status`."""

    last_run: dict | None        # most recent monitor run, None when no runs exist
    runs: list[dict]             # recent runs (limit=20 in get_monitor_status)
    unprocessed: dict[str, int]  # pending-work counters keyed by stage name
    total_runs: int
    pipeline: MonitorPipeline
    cost: MonitorCost
class SearchResults(TypedDict):
    """Global search results from :func:`global_search` (drafts via FTS5)."""

    drafts: list[dict]
    ideas: list[dict]
    authors: list[dict]
    gaps: list[dict]
class CitationGraphStats(TypedDict):
    """Stats sub-dict in the citation graph."""

    node_count: int
    edge_count: int
    rfc_count: int    # nodes that are RFCs
    draft_count: int  # nodes that are drafts
class CitationGraph(TypedDict):
    """Citation network from :func:`get_citation_graph`."""

    nodes: list[dict]  # {id, type, title, influence, ...} per _compute_citation_graph docstring
    edges: list[dict]
    stats: CitationGraphStats
# Add project root to path so we can import ietf_analyzer
_project_root = Path(__file__).resolve().parent.parent.parent
@@ -20,6 +185,8 @@ if str(_project_root) not in sys.path:
from ietf_analyzer.config import Config
from ietf_analyzer.db import Database
from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch
from ietf_analyzer.search import HybridSearch
def _extract_month(time_str: str | None) -> str:
"""Normalize a date string to YYYY-MM format."""
@@ -55,7 +222,7 @@ def get_db() -> Database:
return Database(config)
def get_overview_stats(db: Database) -> dict:
def get_overview_stats(db: Database) -> OverviewStats:
"""Return high-level stats for the dashboard home page.
Excludes drafts flagged as false positives from rated counts.
@@ -204,7 +371,7 @@ def get_drafts_page(
sort: str = "score",
sort_dir: str = "desc",
source: str = "",
) -> dict:
) -> DraftsPage:
"""Return a paginated, filtered list of drafts with ratings.
Returns dict with keys: drafts, total, page, per_page, pages.
@@ -262,11 +429,9 @@ def get_drafts_page(
start = (page - 1) * per_page
page_items = filtered[start : start + per_page]
# Pre-compute readiness for page items (lightweight version)
from ietf_analyzer.readiness import compute_readiness
readiness_cache = {}
for draft, rating in page_items:
readiness_cache[draft.name] = compute_readiness(db, draft.name)
# Pre-compute readiness in batch (~6 queries total instead of ~200)
readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items])
drafts = []
for draft, rating in page_items:
@@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
}
# Readiness score
from ietf_analyzer.readiness import compute_readiness
result["readiness"] = compute_readiness(db, name)
# Annotation
@@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict:
return dims
def get_timeline_data(db: Database) -> dict:
def get_timeline_data(db: Database) -> TimelineData:
"""Return monthly counts by category for timeline chart."""
pairs = db.drafts_with_ratings(limit=1000)
all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
@@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None:
return path.read_text(errors="replace")
def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]:
"""Return top authors by draft count."""
rows = db.top_authors(limit=limit)
return [
@@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
return {"nodes": nodes, "edges": edges}
def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
"""Return draft similarity network (cached)."""
return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold))
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
"""Return draft similarity network for force-directed graph.
Returns {nodes: [{name, title, category, score}],
edges: [{source, target, similarity}],
stats: {node_count, edge_count, avg_similarity}}
"""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 2:
@@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
]
def get_author_network_full(db: Database) -> dict:
def get_author_network_full(db: Database) -> AuthorNetwork:
"""Return author network (cached for 5 min)."""
return _cached("author_network", lambda: _compute_author_network_full(db))
def _compute_author_network_full(db: Database) -> dict:
def _compute_author_network_full(db: Database) -> AuthorNetwork:
"""Return enriched co-authorship network with avg scores and cluster info.
Returns {
@@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict:
visited: set[str] = set()
clusters = []
# Batch-load all drafts referenced by authors (avoid N+1 in cluster loop)
_all_dn = set()
for _ai in author_info.values():
_all_dn.update(_ai.get("drafts", []))
_all_drafts_map = db.get_drafts_by_names(list(_all_dn))
for node in sorted(node_set):
if node in visited:
continue
@@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict:
org_mix[org] += 1
for dn in author_info.get(m, {}).get("drafts", []):
if dn not in cluster_drafts:
d = db.get_draft(dn)
d = _all_drafts_map.get(dn)
cluster_drafts[dn] = d.title[:80] if d else dn
clusters.append({
"id": len(clusters),
@@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict:
a target of ~30 clusters for readable groupings. Enriches each cluster
with WG info and category breakdown.
"""
import json as _json
import numpy as np
from sklearn.preprocessing import normalize as sk_normalize
embeddings = db.all_idea_embeddings()
if not embeddings:
@@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict:
draft_cats: dict[str, list[str]] = {}
for r in rating_rows:
try:
draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
except (_json.JSONDecodeError, TypeError):
draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else []
except (json.JSONDecodeError, TypeError):
draft_cats[r["draft_name"]] = []
# Build matrix from embeddings that have matching ideas
@@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict:
# Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
n_target = max(10, min(40, len(idea_ids) // 12))
try:
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
labels = clustering.fit_predict(matrix_norm)
except Exception:
@@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict:
# t-SNE for scatter
scatter = []
try:
from sklearn.manifold import TSNE
perp = min(30, len(idea_ids) - 1)
tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix_norm)
@@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
animation frames. Each point carries a ``month`` field (YYYY-MM) so the
front-end can build cumulative animation frames.
"""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 5:
@@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict:
matrix = np.array([embeddings[n] for n in names])
try:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix)
@@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
}
def get_monitor_status(db: Database) -> dict:
def get_monitor_status(db: Database) -> MonitorStatus:
"""Return monitoring status data for dashboard."""
runs = db.get_monitor_runs(limit=20)
last = runs[0] if runs else None
@@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict:
}
def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
"""Return citation graph (cached for 5 min)."""
return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs))
def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
"""Return citation network data for force-directed graph.
Returns {nodes: [{id, type, title, influence, ...}],
@@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
}
def global_search(db: Database, query: str) -> dict:
def global_search(db: Database, query: str) -> SearchResults:
"""Search across drafts (FTS5), ideas, authors, and gaps.
Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
@@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict:
# 1. Drafts via FTS5
try:
import re
fts_query = re.sub(r'[^\w\s]', '', q)
fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
fts_query = re.sub(r'\s+', ' ', fts_query).strip()
@@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]:
def _compute_landscape_tsne(db: Database) -> list[dict]:
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}]."""
import numpy as np
embeddings = db.all_embeddings()
if len(embeddings) < 5:
@@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]:
matrix = np.array([embeddings[n] for n in names])
try:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
random_state=42, max_iter=500)
coords = tsne.fit_transform(matrix)
@@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
comparison_text: str | None,
}
"""
import numpy as np
drafts_data = []
all_ideas: dict[str, list[dict]] = {}
@@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
"""Search-only (free) — returns sources + cached answer if available."""
from ietf_analyzer.config import Config
from ietf_analyzer.search import HybridSearch
config = Config.load()
searcher = HybridSearch(config, db)
return searcher.search_only(question, top_k=top_k)
@@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
"""Run Claude synthesis (costs tokens, result is cached permanently)."""
from ietf_analyzer.config import Config
from ietf_analyzer.search import HybridSearch
config = Config.load()
searcher = HybridSearch(config, db)
return searcher.ask(question, top_k=top_k, cheap=cheap)