Complete remaining medium/low issues: performance, CLI, types, CI, tests
Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11 + 3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
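The `_cached(...)` helper that implements the 5-minute TTL cache is called throughout the diff below but not defined in it. A minimal sketch of how such a helper could look, assuming a module-level dict keyed by cache name (variable names here are illustrative, not the project's):

import time
from typing import Any, Callable

_CACHE: dict[str, tuple[float, Any]] = {}
_TTL_SECONDS = 300  # 5 minutes

def _cached(key: str, compute: Callable[[], Any]) -> Any:
    """Return the cached value for `key`, recomputing once the TTL expires."""
    now = time.time()
    entry = _CACHE.get(key)
    if entry is not None and now - entry[0] < _TTL_SECONDS:
        return entry[1]
    value = compute()
    _CACHE[key] = (now, value)
    return value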
@@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering.
 from __future__ import annotations
 
+import json
+import re
 import sys
+import time
 from collections import Counter, defaultdict
 from functools import lru_cache
 from pathlib import Path
+from typing import TypedDict
 
+import numpy as np
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import normalize as sk_normalize
+
+
+# ---------------------------------------------------------------------------
+# TypedDicts for common return shapes
+# ---------------------------------------------------------------------------
+
+
+class OverviewStats(TypedDict):
+    """High-level dashboard statistics from :func:`get_overview_stats`."""
+    total_drafts: int
+    rated_count: int
+    author_count: int
+    idea_count: int
+    gap_count: int
+    input_tokens: int
+    output_tokens: int
+    false_positive_count: int
+
+
+class DraftListItem(TypedDict):
+    """Single draft in the paginated listing from :func:`get_drafts_page`."""
+    name: str
+    title: str
+    date: str | None
+    url: str
+    pages: int
+    group: str
+    source: str
+    score: float
+    novelty: float
+    maturity: float
+    overlap: float
+    momentum: float
+    relevance: float
+    categories: list[str]
+    summary: str
+    readiness: float
+
+
+class DraftsPage(TypedDict):
+    """Paginated draft listing from :func:`get_drafts_page`."""
+    drafts: list[DraftListItem]
+    total: int
+    page: int
+    per_page: int
+    pages: int
+
+
+class AuthorInfo(TypedDict):
+    """Author entry from :func:`get_top_authors`."""
+    name: str
+    affiliation: str
+    draft_count: int
+    drafts: list[str]
+
+
+class AuthorNetworkNode(TypedDict):
+    """Node in the author network graph."""
+    id: str
+    name: str
+    org: str
+    draft_count: int
+    avg_score: float
+    drafts: list[str]
+
+
+class AuthorNetworkEdge(TypedDict):
+    """Edge in the author network graph."""
+    source: str
+    target: str
+    weight: int
+
+
+class AuthorCluster(TypedDict):
+    """Cluster in the author network."""
+    id: int
+    members: list[str]
+    org_mix: dict[str, int]
+    size: int
+    drafts: list[dict[str, str]]
+    draft_count: int
+
+
+class AuthorNetwork(TypedDict):
+    """Full author network from :func:`get_author_network_full`."""
+    nodes: list[AuthorNetworkNode]
+    edges: list[AuthorNetworkEdge]
+    clusters: list[AuthorCluster]
+
+
+class SimilarityGraphStats(TypedDict):
+    """Stats sub-dict in similarity graph."""
+    node_count: int
+    edge_count: int
+    avg_similarity: float
+
+
+class SimilarityGraph(TypedDict):
+    """Draft similarity network from :func:`get_similarity_graph`."""
+    nodes: list[dict]
+    edges: list[dict]
+    stats: SimilarityGraphStats
+
+
+class TimelineData(TypedDict):
+    """Monthly category counts from :func:`get_timeline_data`."""
+    months: list[str]
+    series: dict[str, list[int]]
+    categories: list[str]
+
+
+class MonitorCost(TypedDict):
+    """Cost sub-dict in monitor status."""
+    input_tokens: int
+    output_tokens: int
+    estimated_usd: float
+
+
+class MonitorPipeline(TypedDict):
+    """Pipeline sub-dict in monitor status."""
+    total_drafts: int
+    rated: int
+    embedded: int
+    with_ideas: int
+    idea_total: int
+    gap_count: int
+
+
+class MonitorStatus(TypedDict):
+    """Monitor status from :func:`get_monitor_status`."""
+    last_run: dict | None
+    runs: list[dict]
+    unprocessed: dict[str, int]
+    total_runs: int
+    pipeline: MonitorPipeline
+    cost: MonitorCost
+
+
+class SearchResults(TypedDict):
+    """Global search results from :func:`global_search`."""
+    drafts: list[dict]
+    ideas: list[dict]
+    authors: list[dict]
+    gaps: list[dict]
+
+
+class CitationGraphStats(TypedDict):
+    """Stats sub-dict in citation graph."""
+    node_count: int
+    edge_count: int
+    rfc_count: int
+    draft_count: int
+
+
+class CitationGraph(TypedDict):
+    """Citation network from :func:`get_citation_graph`."""
+    nodes: list[dict]
+    edges: list[dict]
+    stats: CitationGraphStats
+
+
 # Add project root to path so we can import ietf_analyzer
 _project_root = Path(__file__).resolve().parent.parent.parent
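What the TypedDicts above buy in practice: a static checker can now verify key names and value types wherever these return shapes are consumed. A small illustration (the import path and the literal values are invented for the example):

from data import OverviewStats  # import path is an assumption

stats: OverviewStats = {
    "total_drafts": 410, "rated_count": 395, "author_count": 120,
    "idea_count": 900, "gap_count": 45, "input_tokens": 2_000_000,
    "output_tokens": 350_000, "false_positive_count": 15,
}
ratio = stats["rated_count"] / stats["total_drafts"]  # OK: int / int
# stats["rated"] would be flagged by mypy/pyright: not a key of OverviewStats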
@@ -20,6 +185,8 @@ if str(_project_root) not in sys.path:
 
 from ietf_analyzer.config import Config
 from ietf_analyzer.db import Database
+from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch
+from ietf_analyzer.search import HybridSearch
 
 def _extract_month(time_str: str | None) -> str:
     """Normalize a date string to YYYY-MM format."""
@@ -55,7 +222,7 @@ def get_db() -> Database:
     return Database(config)
 
 
-def get_overview_stats(db: Database) -> dict:
+def get_overview_stats(db: Database) -> OverviewStats:
     """Return high-level stats for the dashboard home page.
 
     Excludes drafts flagged as false positives from rated counts.
@@ -204,7 +371,7 @@ def get_drafts_page(
     sort: str = "score",
     sort_dir: str = "desc",
     source: str = "",
-) -> dict:
+) -> DraftsPage:
     """Return a paginated, filtered list of drafts with ratings.
 
     Returns dict with keys: drafts, total, page, per_page, pages.
@@ -262,11 +429,9 @@
     start = (page - 1) * per_page
     page_items = filtered[start : start + per_page]
 
-    # Pre-compute readiness for page items (lightweight version)
-    from ietf_analyzer.readiness import compute_readiness
-    readiness_cache = {}
-    for draft, rating in page_items:
-        readiness_cache[draft.name] = compute_readiness(db, draft.name)
+    # Pre-compute readiness in batch (~6 queries total instead of ~200)
+    readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items])
 
     drafts = []
     for draft, rating in page_items:
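compute_readiness_batch is imported above but its body lies outside this diff. A plausible sketch of the batching idea, assuming one IN (...) query per readiness signal instead of several queries per draft; the table, columns, and scoring formula are invented for illustration:

def compute_readiness_batch(db, names: list[str]) -> dict[str, float]:
    """Hypothetical sketch: constant number of queries for a whole page."""
    if not names:
        return {}
    placeholders = ",".join("?" * len(names))
    # One query covering every draft on the page (schema is assumed, as is
    # a DB-API style execute() on the Database wrapper).
    rows = db.execute(
        f"SELECT draft_name, COUNT(*) FROM citations"
        f" WHERE draft_name IN ({placeholders}) GROUP BY draft_name",
        names,
    ).fetchall()
    refs = {name: count for name, count in rows}
    # ...repeat the same IN (...) pattern for the other signals...
    return {n: min(1.0, refs.get(n, 0) / 10) for n in names}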
@@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
     }
 
     # Readiness score
-    from ietf_analyzer.readiness import compute_readiness
-
     result["readiness"] = compute_readiness(db, name)
 
     # Annotation
@@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict:
     return dims
 
 
-def get_timeline_data(db: Database) -> dict:
+def get_timeline_data(db: Database) -> TimelineData:
     """Return monthly counts by category for timeline chart."""
     pairs = db.drafts_with_ratings(limit=1000)
     all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
@@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None:
     return path.read_text(errors="replace")
 
 
-def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
+def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]:
     """Return top authors by draft count."""
     rows = db.top_authors(limit=limit)
     return [
@@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
     return {"nodes": nodes, "edges": edges}
 
 
-def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
+def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
     """Return draft similarity network (cached)."""
     return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold))
 
 
-def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
+def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
     """Return draft similarity network for force-directed graph.
 
     Returns {nodes: [{name, title, category, score}],
             edges: [{source, target, similarity}],
             stats: {node_count, edge_count, avg_similarity}}
     """
-    import numpy as np
 
     embeddings = db.all_embeddings()
     if len(embeddings) < 2:
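The commit message also mentions a file-based similarity matrix cache (.npy plus a metadata sidecar), which does not appear in this hunk. A minimal sketch of that pattern, with the cache location, file names, and sidecar keys all assumed:

import json
from pathlib import Path

import numpy as np

CACHE_DIR = Path("data/cache")  # location is an assumption

def load_or_compute_similarity(names: list[str], compute) -> np.ndarray:
    """Reuse the cached matrix when the draft set is unchanged."""
    npy = CACHE_DIR / "similarity.npy"
    meta = CACHE_DIR / "similarity.meta.json"
    if npy.exists() and meta.exists():
        sidecar = json.loads(meta.read_text())
        if sidecar.get("names") == names:  # cache is still valid
            return np.load(npy)
    matrix = compute()  # expensive pairwise similarity computation
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    np.save(npy, matrix)
    meta.write_text(json.dumps({"names": names}))
    return matrix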
@@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
     ]
 
 
-def get_author_network_full(db: Database) -> dict:
+def get_author_network_full(db: Database) -> AuthorNetwork:
     """Return author network (cached for 5 min)."""
     return _cached("author_network", lambda: _compute_author_network_full(db))
 
 
-def _compute_author_network_full(db: Database) -> dict:
+def _compute_author_network_full(db: Database) -> AuthorNetwork:
     """Return enriched co-authorship network with avg scores and cluster info.
 
     Returns {
@@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict:
     visited: set[str] = set()
     clusters = []
 
+    # Batch-load all drafts referenced by authors (avoid N+1 in cluster loop)
+    _all_dn = set()
+    for _ai in author_info.values():
+        _all_dn.update(_ai.get("drafts", []))
+    _all_drafts_map = db.get_drafts_by_names(list(_all_dn))
+
     for node in sorted(node_set):
         if node in visited:
             continue
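db.get_drafts_by_names is new in this commit and its body is outside this diff. A plausible sketch of the single-query replacement for the old per-name db.get_draft() calls, assuming a sqlite3 connection with a row factory and an existing row-to-Draft mapper (both assumptions):

def get_drafts_by_names(self, names: list[str]) -> dict:
    """Hypothetical sketch: one IN (...) query instead of N get_draft() calls."""
    if not names:
        return {}
    placeholders = ",".join("?" * len(names))
    rows = self.conn.execute(  # assumes a sqlite3 connection attribute
        f"SELECT * FROM drafts WHERE name IN ({placeholders})",
        names,
    ).fetchall()
    # assumes sqlite3.Row access and a helper that builds the Draft object
    return {row["name"]: self._row_to_draft(row) for row in rows}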
@@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict:
             org_mix[org] += 1
         for dn in author_info.get(m, {}).get("drafts", []):
             if dn not in cluster_drafts:
-                d = db.get_draft(dn)
+                d = _all_drafts_map.get(dn)
                 cluster_drafts[dn] = d.title[:80] if d else dn
         clusters.append({
             "id": len(clusters),
@@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict:
     a target of ~30 clusters for readable groupings. Enriches each cluster
     with WG info and category breakdown.
     """
-    import json as _json
-    import numpy as np
-    from sklearn.preprocessing import normalize as sk_normalize
 
     embeddings = db.all_idea_embeddings()
     if not embeddings:
@@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict:
     draft_cats: dict[str, list[str]] = {}
     for r in rating_rows:
         try:
-            draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
-        except (_json.JSONDecodeError, TypeError):
+            draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else []
+        except (json.JSONDecodeError, TypeError):
             draft_cats[r["draft_name"]] = []
 
     # Build matrix from embeddings that have matching ideas
@@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict:
     # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
     n_target = max(10, min(40, len(idea_ids) // 12))
     try:
-        from sklearn.cluster import AgglomerativeClustering
         clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
         labels = clustering.fit_predict(matrix_norm)
     except Exception:
@@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict:
     # t-SNE for scatter
     scatter = []
     try:
-        from sklearn.manifold import TSNE
         perp = min(30, len(idea_ids) - 1)
         tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
         coords = tsne.fit_transform(matrix_norm)
@@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
     animation frames. Each point carries a ``month`` field (YYYY-MM) so the
     front-end can build cumulative animation frames.
     """
-    import numpy as np
 
     embeddings = db.all_embeddings()
     if len(embeddings) < 5:
@@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict:
     matrix = np.array([embeddings[n] for n in names])
 
     try:
-        from sklearn.manifold import TSNE
         tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
                     random_state=42, max_iter=500)
         coords = tsne.fit_transform(matrix)
@@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
     }
 
 
-def get_monitor_status(db: Database) -> dict:
+def get_monitor_status(db: Database) -> MonitorStatus:
     """Return monitoring status data for dashboard."""
     runs = db.get_monitor_runs(limit=20)
     last = runs[0] if runs else None
@@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict:
     }
 
 
-def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
+def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
     """Return citation graph (cached for 5 min)."""
     return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs))
 
 
-def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
+def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
     """Return citation network data for force-directed graph.
 
     Returns {nodes: [{id, type, title, influence, ...}],
@@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
     }
 
 
-def global_search(db: Database, query: str) -> dict:
+def global_search(db: Database, query: str) -> SearchResults:
     """Search across drafts (FTS5), ideas, authors, and gaps.
 
     Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
@@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict:
 
     # 1. Drafts via FTS5
     try:
-        import re
         fts_query = re.sub(r'[^\w\s]', '', q)
         fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
         fts_query = re.sub(r'\s+', ' ', fts_query).strip()
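For context on the sanitization above: FTS5 treats characters like quotes, parentheses, and `*`, plus the bare keywords NEAR/OR/AND/NOT, as query syntax, so stripping them avoids MATCH syntax errors on arbitrary user input. A standalone illustration of the three substitutions (the query string is made up):

import re

q = 'QUIC AND "multipath" (draft-*)'
fts_query = re.sub(r'[^\w\s]', '', q)  # -> 'QUIC AND multipath draft'
fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
fts_query = re.sub(r'\s+', ' ', fts_query).strip()
print(fts_query)  # -> 'QUIC multipath draft'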
@@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]:
 
 def _compute_landscape_tsne(db: Database) -> list[dict]:
     """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}]."""
-    import numpy as np
 
     embeddings = db.all_embeddings()
     if len(embeddings) < 5:
@@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]:
     matrix = np.array([embeddings[n] for n in names])
 
     try:
-        from sklearn.manifold import TSNE
         tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
                     random_state=42, max_iter=500)
         coords = tsne.fit_transform(matrix)
@@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
         comparison_text: str | None,
     }
     """
-    import numpy as np
 
     drafts_data = []
     all_ideas: dict[str, list[dict]] = {}
@@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
 
 def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
     """Search-only (free) — returns sources + cached answer if available."""
-    from ietf_analyzer.config import Config
-    from ietf_analyzer.search import HybridSearch
-
     config = Config.load()
     searcher = HybridSearch(config, db)
     return searcher.search_only(question, top_k=top_k)
@@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
 
 def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
     """Run Claude synthesis (costs tokens, result is cached permanently)."""
-    from ietf_analyzer.config import Config
-    from ietf_analyzer.search import HybridSearch
-
     config = Config.load()
     searcher = HybridSearch(config, db)
     return searcher.ask(question, top_k=top_k, cheap=cheap)