Complete remaining medium/low issues: performance, CLI, types, CI, tests
Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -13,16 +13,15 @@ CONFIG_FILE = DEFAULT_DATA_DIR / "config.json"
|
||||
# Default search keywords for IETF draft discovery. Kept deduplicated —
# each term appears exactly once ("autonomous" and "mcp" were previously
# listed twice). Substring-style stems ("trustworth") are intentional.
DEFAULT_KEYWORDS = [
    "agent",
    "ai-agent",
    "llm",
    "autonomous",
    "machine-learning",
    "artificial-intelligence",
    "mcp",
    "agentic",
    "inference",
    "generative",
    "intelligent",
    "aipref",
    "large language model",
    "multi-agent",
    "trustworth",
]
|
||||
|
||||
# Environment variable overrides (env var name -> config field name)
|
||||
@@ -39,6 +38,7 @@ class Config:
|
||||
db_path: str = str(DEFAULT_DATA_DIR / "drafts.db")
|
||||
ollama_url: str = "http://localhost:11434"
|
||||
ollama_embed_model: str = "nomic-embed-text"
|
||||
ollama_classify_model: str = "llama3.2"
|
||||
claude_model: str = "claude-sonnet-4-20250514"
|
||||
claude_model_cheap: str = "claude-haiku-4-5-20251001"
|
||||
search_keywords: list[str] = field(default_factory=lambda: list(DEFAULT_KEYWORDS))
|
||||
|
||||
@@ -326,6 +326,23 @@ class Database:
|
||||
return None
|
||||
return self._row_to_draft(row)
|
||||
|
||||
def get_drafts_by_names(self, names: list[str]) -> dict[str, "Draft"]:
    """Batch-fetch drafts by name. Returns {name: Draft} dict."""
    found: dict[str, "Draft"] = {}
    if not names:
        return found
    # SQLite caps bound variables (~999 by default); query in chunks of 900.
    chunk_size = 900
    for start in range(0, len(names), chunk_size):
        batch = names[start : start + chunk_size]
        marks = ",".join(["?"] * len(batch))
        cursor = self.conn.execute(
            f"SELECT * FROM drafts WHERE name IN ({marks})", batch
        )
        for row in cursor.fetchall():
            draft = self._row_to_draft(row)
            found[draft.name] = draft
    return found
|
||||
|
||||
def list_drafts(
|
||||
self,
|
||||
limit: int = 100,
|
||||
|
||||
@@ -2,6 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import ollama as ollama_lib
|
||||
from rich.console import Console
|
||||
@@ -111,16 +115,49 @@ class Embedder:
|
||||
return similarities[:top_n]
|
||||
|
||||
def similarity_matrix(self) -> tuple[list[str], np.ndarray]:
    """Compute pairwise similarity matrix for all embedded drafts.

    Uses a file-based cache keyed by the hash of embedding draft names.
    If the set of embedded drafts hasn't changed, the cached matrix is
    reloaded from disk instead of recomputing O(n^2) cosine similarities.

    Returns:
        (names, matrix): sorted draft names and the (n, n) float32 matrix
        where matrix[i, j] is the cosine similarity of drafts i and j.
    """
    all_embeddings = self.db.all_embeddings()
    names = sorted(all_embeddings.keys())
    n = len(names)

    # Build cache key from sorted draft names; 16 hex chars keeps the
    # filename short while making collisions vanishingly unlikely.
    names_hash = hashlib.sha256("\n".join(names).encode()).hexdigest()[:16]
    cache_dir = Path(self.config.db_path).parent / ".cache"
    cache_meta = cache_dir / f"sim_matrix_{names_hash}.json"
    cache_npy = cache_dir / f"sim_matrix_{names_hash}.npy"

    # Try loading from cache. The JSON sidecar re-verifies the exact name
    # list (not just the hash); the shape check guards against partial writes.
    if cache_meta.exists() and cache_npy.exists():
        try:
            cached_names = json.loads(cache_meta.read_text())
            if cached_names == names:
                matrix = np.load(cache_npy)
                if matrix.shape == (n, n):
                    return names, matrix
        except Exception:
            pass  # Cache corrupted, recompute

    # Compute fresh. The matrix is symmetric, so only the upper triangle
    # is computed and mirrored.
    matrix = np.zeros((n, n), dtype=np.float32)
    for i in range(n):
        for j in range(i, n):
            sim = _cosine_similarity(all_embeddings[names[i]], all_embeddings[names[j]])
            matrix[i, j] = sim
            matrix[j, i] = sim

    # Save to cache (best-effort; a read-only filesystem must not break the
    # caller). parents=True so a missing data directory is not fatal either.
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
        np.save(cache_npy, matrix)
        cache_meta.write_text(json.dumps(names))
    except Exception:
        pass  # Non-fatal if caching fails

    return names, matrix
|
||||
|
||||
def find_clusters(self, threshold: float = 0.85) -> list[list[str]]:
|
||||
|
||||
@@ -100,3 +100,136 @@ def compute_readiness(db, draft_name: str) -> dict:
|
||||
f["contribution"] = round(f["value"] * f["weight"] * 100, 1)
|
||||
|
||||
return {"score": score, "factors": factors}
|
||||
|
||||
|
||||
def compute_readiness_batch(db, draft_names: list[str]) -> dict[str, dict]:
    """Batch-compute readiness for multiple drafts using bulk queries.

    Returns {draft_name: {score, factors}} — same format as compute_readiness.
    Reduces ~6 queries per draft to ~6 queries total.

    Each factor dict carries ``value`` (0.0-1.0), ``weight``, ``label``,
    ``detail``, and a derived ``contribution``. The six weights sum to 1.0
    (0.25 + 5 * 0.15), so ``score`` is a 0-100 weighted percentage.

    Args:
        db: Database handle exposing ``conn`` and ``get_drafts_by_names``.
        draft_names: Drafts to score; names missing from the database get
            ``{"score": 0, "factors": {}}``.
    """
    if not draft_names:
        return {}

    # Batch-load drafts
    drafts_map = db.get_drafts_by_names(draft_names)

    # Batch-load ref counts per draft
    ref_counts: dict[str, int] = {}
    rows = db.conn.execute(
        "SELECT draft_name, COUNT(*) as cnt FROM draft_refs GROUP BY draft_name"
    ).fetchall()
    for r in rows:
        ref_counts[r["draft_name"]] = r["cnt"]

    # Max refs across corpus (single query)
    max_refs_row = db.conn.execute(
        "SELECT MAX(cnt) FROM (SELECT COUNT(*) as cnt FROM draft_refs GROUP BY draft_name)"
    ).fetchone()
    # "or 1" guards against an empty draft_refs table (division by zero below)
    max_refs = (max_refs_row[0] or 1) if max_refs_row else 1

    # Batch-load cited-by counts
    cited_by_counts: dict[str, int] = {}
    rows = db.conn.execute(
        "SELECT ref_id, COUNT(DISTINCT draft_name) as cnt FROM draft_refs "
        "WHERE ref_type = 'draft' GROUP BY ref_id"
    ).fetchall()
    for r in rows:
        cited_by_counts[r["ref_id"]] = r["cnt"]

    # Batch-load author experience: person_id -> draft count
    author_draft_counts: dict[int, int] = {}
    rows = db.conn.execute(
        "SELECT person_id, COUNT(*) as cnt FROM draft_authors GROUP BY person_id"
    ).fetchall()
    for r in rows:
        author_draft_counts[r["person_id"]] = r["cnt"]

    # Batch-load draft->author mappings
    draft_authors: dict[str, list[int]] = {}
    rows = db.conn.execute(
        "SELECT draft_name, person_id FROM draft_authors"
    ).fetchall()
    for r in rows:
        draft_authors.setdefault(r["draft_name"], []).append(r["person_id"])

    # Batch-load ratings (momentum)
    ratings_map: dict[str, float] = {}
    rows = db.conn.execute(
        "SELECT draft_name, momentum FROM ratings"
    ).fetchall()
    for r in rows:
        ratings_map[r["draft_name"]] = r["momentum"]

    # Now compute readiness for each draft using pre-loaded data
    results = {}
    for name in draft_names:
        draft = drafts_map.get(name)
        if not draft:
            results[name] = {"score": 0, "factors": {}}
            continue

        factors = {}

        # 1. WG Adopted — the draft-ietf- prefix marks working-group adoption
        wg_val = 1.0 if name.startswith("draft-ietf-") else 0.0
        factors["wg_adopted"] = {"value": wg_val, "weight": 0.25,
                                 "label": "WG Adopted",
                                 "detail": "draft-ietf-*" if wg_val else "individual"}

        # 2. Revision Maturity — 5+ revisions counts as fully mature
        try:
            rev_num = int(draft.rev) if draft.rev else 0
        except (ValueError, TypeError):
            rev_num = 0
        rev_val = min(rev_num / 5.0, 1.0)
        factors["revision_maturity"] = {"value": round(rev_val, 3), "weight": 0.15,
                                        "label": "Revision Maturity",
                                        "detail": f"rev {rev_num}"}

        # 3. Reference Density — normalized against the corpus-wide maximum
        ref_count = ref_counts.get(name, 0)
        ref_val = min(ref_count / max_refs, 1.0)
        factors["reference_density"] = {"value": round(ref_val, 3), "weight": 0.15,
                                        "label": "Reference Density",
                                        "detail": f"{ref_count} refs (max {max_refs})"}

        # 4. Cited By Count — saturates at 5 citing drafts
        cited_by = cited_by_counts.get(name, 0)
        cited_val = min(cited_by / 5.0, 1.0)
        factors["cited_by_count"] = {"value": round(cited_val, 3), "weight": 0.15,
                                     "label": "Cited By Others",
                                     "detail": f"{cited_by} draft(s)"}

        # 5. Author Experience — average drafts per author, saturating at 5
        person_ids = draft_authors.get(name, [])
        if person_ids:
            counts = [author_draft_counts.get(pid, 1) for pid in person_ids]
            avg_exp = sum(counts) / len(counts)
            exp_val = min(avg_exp / 5.0, 1.0)
        else:
            exp_val = 0.0
            avg_exp = 0
        factors["author_experience"] = {"value": round(exp_val, 3), "weight": 0.15,
                                        "label": "Author Experience",
                                        "detail": f"avg {avg_exp:.1f} drafts/author"}

        # 6. Momentum Rating — 1-5 rating mapped linearly to 0.0-1.0
        momentum = ratings_map.get(name)
        if momentum is not None:
            mom_val = (momentum - 1) / 4.0
        else:
            mom_val = 0.0
        factors["momentum_rating"] = {"value": round(mom_val, 3), "weight": 0.15,
                                      "label": "Momentum",
                                      "detail": f"{momentum}/5" if momentum else "unrated"}

        # Compute weighted score
        total = sum(f["value"] * f["weight"] for f in factors.values())
        score = round(total * 100, 1)
        for f in factors.values():
            f["contribution"] = round(f["value"] * f["weight"] * 100, 1)

        results[name] = {"score": score, "factors": factors}

    return results
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
@@ -50,6 +51,9 @@ class HybridSearch:
|
||||
self.db = db
|
||||
self._embedder = embedder
|
||||
self._ollama_available: bool | None = None
|
||||
self._embeddings_cache: dict[str, np.ndarray] | None = None
|
||||
self._embeddings_cache_time: float = 0
|
||||
self._EMBEDDINGS_TTL: float = 300 # 5 minutes
|
||||
|
||||
@property
|
||||
def embedder(self):
|
||||
@@ -79,6 +83,16 @@ class HybridSearch:
|
||||
self._ollama_available = False
|
||||
return self._ollama_available
|
||||
|
||||
def _get_all_embeddings(self) -> dict[str, np.ndarray]:
|
||||
"""Return all embeddings, cached with TTL to avoid reloading on every query."""
|
||||
now = time.monotonic()
|
||||
if (self._embeddings_cache is not None
|
||||
and now - self._embeddings_cache_time < self._EMBEDDINGS_TTL):
|
||||
return self._embeddings_cache
|
||||
self._embeddings_cache = self.db.all_embeddings()
|
||||
self._embeddings_cache_time = now
|
||||
return self._embeddings_cache
|
||||
|
||||
def search(self, query: str, top_k: int = 10) -> list[dict]:
|
||||
"""Combine FTS5 keyword search + embedding similarity search.
|
||||
|
||||
@@ -144,7 +158,7 @@ class HybridSearch:
|
||||
self._ollama_available = False
|
||||
return []
|
||||
|
||||
all_embeddings = self.db.all_embeddings()
|
||||
all_embeddings = self._get_all_embeddings()
|
||||
if not all_embeddings:
|
||||
return []
|
||||
|
||||
|
||||
@@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TypedDict
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.manifold import TSNE
|
||||
from sklearn.preprocessing import normalize as sk_normalize
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TypedDicts for common return shapes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class OverviewStats(TypedDict):
    """High-level dashboard statistics from :func:`get_overview_stats`."""

    total_drafts: int
    rated_count: int  # excludes false-positive-flagged drafts (see get_overview_stats)
    author_count: int
    idea_count: int
    gap_count: int
    input_tokens: int  # cumulative token usage — presumably LLM API cost tracking; confirm
    output_tokens: int
    false_positive_count: int  # drafts flagged as false positives
|
||||
|
||||
|
||||
class DraftListItem(TypedDict):
    """Single draft in the paginated listing from :func:`get_drafts_page`."""

    name: str  # draft identifier (e.g. "draft-ietf-...")
    title: str
    date: str | None
    url: str
    pages: int
    group: str  # presumably the IETF working group — confirm against the draft schema
    source: str
    # Rating dimensions plus the combined score:
    score: float
    novelty: float
    maturity: float
    overlap: float
    momentum: float
    relevance: float
    categories: list[str]
    summary: str
    readiness: float  # readiness score (batch-computed in get_drafts_page)
|
||||
|
||||
|
||||
class DraftsPage(TypedDict):
    """Paginated draft listing from :func:`get_drafts_page`."""

    drafts: list[DraftListItem]  # items for the current page only
    total: int  # total matching drafts across all pages
    page: int  # 1-based current page number
    per_page: int
    pages: int  # total page count
|
||||
|
||||
|
||||
class AuthorInfo(TypedDict):
    """Author entry from :func:`get_top_authors`."""

    name: str
    affiliation: str
    draft_count: int
    drafts: list[str]  # names of this author's drafts
|
||||
|
||||
|
||||
class AuthorNetworkNode(TypedDict):
    """Node in the author network graph."""

    id: str  # unique node id referenced by AuthorNetworkEdge source/target
    name: str
    org: str
    draft_count: int
    avg_score: float  # average rating score across the author's drafts — confirm
    drafts: list[str]
|
||||
|
||||
|
||||
class AuthorNetworkEdge(TypedDict):
    """Edge in the author network graph."""

    source: str  # AuthorNetworkNode id
    target: str  # AuthorNetworkNode id
    weight: int  # presumably the number of co-authored drafts — confirm
|
||||
|
||||
|
||||
class AuthorCluster(TypedDict):
    """Cluster in the author network."""

    id: int  # sequential cluster index
    members: list[str]  # author node ids in this connected component
    org_mix: dict[str, int]  # organization -> member count
    size: int  # number of members
    drafts: list[dict[str, str]]  # cluster drafts (name/title pairs)
    draft_count: int
|
||||
|
||||
|
||||
class AuthorNetwork(TypedDict):
    """Full author network from :func:`get_author_network_full`."""

    nodes: list[AuthorNetworkNode]
    edges: list[AuthorNetworkEdge]
    clusters: list[AuthorCluster]
|
||||
|
||||
|
||||
class SimilarityGraphStats(TypedDict):
    """Stats sub-dict in similarity graph."""

    node_count: int
    edge_count: int
    avg_similarity: float  # mean similarity over edges above the threshold — confirm
|
||||
|
||||
|
||||
class SimilarityGraph(TypedDict):
    """Draft similarity network from :func:`get_similarity_graph`."""

    nodes: list[dict]  # per _compute_similarity_graph: {name, title, category, score}
    edges: list[dict]  # per _compute_similarity_graph: {source, target, similarity}
    stats: SimilarityGraphStats
|
||||
|
||||
|
||||
class TimelineData(TypedDict):
    """Monthly category counts from :func:`get_timeline_data`."""

    months: list[str]  # YYYY-MM labels
    series: dict[str, list[int]]  # category -> per-month counts, aligned with months
    categories: list[str]
|
||||
|
||||
|
||||
class MonitorCost(TypedDict):
    """Cost sub-dict in monitor status."""

    input_tokens: int
    output_tokens: int
    estimated_usd: float  # estimated spend derived from token counts — confirm rates
|
||||
|
||||
|
||||
class MonitorPipeline(TypedDict):
    """Pipeline sub-dict in monitor status."""

    total_drafts: int
    rated: int  # drafts that have been rated
    embedded: int  # drafts with embeddings
    with_ideas: int  # drafts that have extracted ideas
    idea_total: int
    gap_count: int
|
||||
|
||||
|
||||
class MonitorStatus(TypedDict):
    """Monitor status from :func:`get_monitor_status`."""

    last_run: dict | None  # most recent monitor run, or None when no runs exist
    runs: list[dict]  # recent runs (get_monitor_status fetches up to 20)
    unprocessed: dict[str, int]
    total_runs: int
    pipeline: MonitorPipeline
    cost: MonitorCost
|
||||
|
||||
|
||||
class SearchResults(TypedDict):
    """Global search results from :func:`global_search`."""

    drafts: list[dict]  # matched via FTS5
    ideas: list[dict]
    authors: list[dict]
    gaps: list[dict]
|
||||
|
||||
|
||||
class CitationGraphStats(TypedDict):
    """Stats sub-dict in citation graph."""

    node_count: int
    edge_count: int
    rfc_count: int  # nodes that are RFCs
    draft_count: int  # nodes that are drafts
|
||||
|
||||
|
||||
class CitationGraph(TypedDict):
    """Citation network from :func:`get_citation_graph`."""

    nodes: list[dict]  # per _compute_citation_graph: {id, type, title, influence, ...}
    edges: list[dict]
    stats: CitationGraphStats
|
||||
|
||||
# Add project root to path so we can import ietf_analyzer
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent
|
||||
@@ -20,6 +185,8 @@ if str(_project_root) not in sys.path:
|
||||
|
||||
from ietf_analyzer.config import Config
|
||||
from ietf_analyzer.db import Database
|
||||
from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch
|
||||
from ietf_analyzer.search import HybridSearch
|
||||
|
||||
def _extract_month(time_str: str | None) -> str:
|
||||
"""Normalize a date string to YYYY-MM format."""
|
||||
@@ -55,7 +222,7 @@ def get_db() -> Database:
|
||||
return Database(config)
|
||||
|
||||
|
||||
def get_overview_stats(db: Database) -> dict:
|
||||
def get_overview_stats(db: Database) -> OverviewStats:
|
||||
"""Return high-level stats for the dashboard home page.
|
||||
|
||||
Excludes drafts flagged as false positives from rated counts.
|
||||
@@ -204,7 +371,7 @@ def get_drafts_page(
|
||||
sort: str = "score",
|
||||
sort_dir: str = "desc",
|
||||
source: str = "",
|
||||
) -> dict:
|
||||
) -> DraftsPage:
|
||||
"""Return a paginated, filtered list of drafts with ratings.
|
||||
|
||||
Returns dict with keys: drafts, total, page, per_page, pages.
|
||||
@@ -262,11 +429,9 @@ def get_drafts_page(
|
||||
start = (page - 1) * per_page
|
||||
page_items = filtered[start : start + per_page]
|
||||
|
||||
# Pre-compute readiness for page items (lightweight version)
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
readiness_cache = {}
|
||||
for draft, rating in page_items:
|
||||
readiness_cache[draft.name] = compute_readiness(db, draft.name)
|
||||
# Pre-compute readiness in batch (~6 queries total instead of ~200)
|
||||
|
||||
readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items])
|
||||
|
||||
drafts = []
|
||||
for draft, rating in page_items:
|
||||
@@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None:
|
||||
}
|
||||
|
||||
# Readiness score
|
||||
from ietf_analyzer.readiness import compute_readiness
|
||||
|
||||
result["readiness"] = compute_readiness(db, name)
|
||||
|
||||
# Annotation
|
||||
@@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict:
|
||||
return dims
|
||||
|
||||
|
||||
def get_timeline_data(db: Database) -> dict:
|
||||
def get_timeline_data(db: Database) -> TimelineData:
|
||||
"""Return monthly counts by category for timeline chart."""
|
||||
pairs = db.drafts_with_ratings(limit=1000)
|
||||
all_drafts = db.list_drafts(limit=1000, order_by="time ASC")
|
||||
@@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None:
|
||||
return path.read_text(errors="replace")
|
||||
|
||||
|
||||
def get_top_authors(db: Database, limit: int = 30) -> list[dict]:
|
||||
def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]:
|
||||
"""Return top authors by draft count."""
|
||||
rows = db.top_authors(limit=limit)
|
||||
return [
|
||||
@@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict:
|
||||
return {"nodes": nodes, "edges": edges}
|
||||
|
||||
|
||||
def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
|
||||
def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
|
||||
"""Return draft similarity network (cached)."""
|
||||
return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold))
|
||||
|
||||
|
||||
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict:
|
||||
def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph:
|
||||
"""Return draft similarity network for force-directed graph.
|
||||
|
||||
Returns {nodes: [{name, title, category, score}],
|
||||
edges: [{source, target, similarity}],
|
||||
stats: {node_count, edge_count, avg_similarity}}
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 2:
|
||||
@@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]:
|
||||
]
|
||||
|
||||
|
||||
def get_author_network_full(db: Database) -> dict:
|
||||
def get_author_network_full(db: Database) -> AuthorNetwork:
|
||||
"""Return author network (cached for 5 min)."""
|
||||
return _cached("author_network", lambda: _compute_author_network_full(db))
|
||||
|
||||
|
||||
def _compute_author_network_full(db: Database) -> dict:
|
||||
def _compute_author_network_full(db: Database) -> AuthorNetwork:
|
||||
"""Return enriched co-authorship network with avg scores and cluster info.
|
||||
|
||||
Returns {
|
||||
@@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict:
|
||||
visited: set[str] = set()
|
||||
clusters = []
|
||||
|
||||
# Batch-load all drafts referenced by authors (avoid N+1 in cluster loop)
|
||||
_all_dn = set()
|
||||
for _ai in author_info.values():
|
||||
_all_dn.update(_ai.get("drafts", []))
|
||||
_all_drafts_map = db.get_drafts_by_names(list(_all_dn))
|
||||
|
||||
for node in sorted(node_set):
|
||||
if node in visited:
|
||||
continue
|
||||
@@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict:
|
||||
org_mix[org] += 1
|
||||
for dn in author_info.get(m, {}).get("drafts", []):
|
||||
if dn not in cluster_drafts:
|
||||
d = db.get_draft(dn)
|
||||
d = _all_drafts_map.get(dn)
|
||||
cluster_drafts[dn] = d.title[:80] if d else dn
|
||||
clusters.append({
|
||||
"id": len(clusters),
|
||||
@@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
a target of ~30 clusters for readable groupings. Enriches each cluster
|
||||
with WG info and category breakdown.
|
||||
"""
|
||||
import json as _json
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import normalize as sk_normalize
|
||||
|
||||
|
||||
embeddings = db.all_idea_embeddings()
|
||||
if not embeddings:
|
||||
@@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
draft_cats: dict[str, list[str]] = {}
|
||||
for r in rating_rows:
|
||||
try:
|
||||
draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
|
||||
except (_json.JSONDecodeError, TypeError):
|
||||
draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else []
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
draft_cats[r["draft_name"]] = []
|
||||
|
||||
# Build matrix from embeddings that have matching ideas
|
||||
@@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
# Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
|
||||
n_target = max(10, min(40, len(idea_ids) // 12))
|
||||
try:
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
|
||||
labels = clustering.fit_predict(matrix_norm)
|
||||
except Exception:
|
||||
@@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict:
|
||||
# t-SNE for scatter
|
||||
scatter = []
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
perp = min(30, len(idea_ids) - 1)
|
||||
tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix_norm)
|
||||
@@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
animation frames. Each point carries a ``month`` field (YYYY-MM) so the
|
||||
front-end can build cumulative animation frames.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 5:
|
||||
@@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
matrix = np.array([embeddings[n] for n in names])
|
||||
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
|
||||
random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix)
|
||||
@@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def get_monitor_status(db: Database) -> dict:
|
||||
def get_monitor_status(db: Database) -> MonitorStatus:
|
||||
"""Return monitoring status data for dashboard."""
|
||||
runs = db.get_monitor_runs(limit=20)
|
||||
last = runs[0] if runs else None
|
||||
@@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def get_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
|
||||
"""Return citation graph (cached for 5 min)."""
|
||||
return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs))
|
||||
|
||||
|
||||
def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph:
|
||||
"""Return citation network data for force-directed graph.
|
||||
|
||||
Returns {nodes: [{id, type, title, influence, ...}],
|
||||
@@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def global_search(db: Database, query: str) -> dict:
|
||||
def global_search(db: Database, query: str) -> SearchResults:
|
||||
"""Search across drafts (FTS5), ideas, authors, and gaps.
|
||||
|
||||
Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}.
|
||||
@@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict:
|
||||
|
||||
# 1. Drafts via FTS5
|
||||
try:
|
||||
import re
|
||||
fts_query = re.sub(r'[^\w\s]', '', q)
|
||||
fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE)
|
||||
fts_query = re.sub(r'\s+', ' ', fts_query).strip()
|
||||
@@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]:
|
||||
|
||||
def _compute_landscape_tsne(db: Database) -> list[dict]:
|
||||
"""Compute t-SNE from embeddings, return [{name, title, x, y, category, score}]."""
|
||||
import numpy as np
|
||||
|
||||
|
||||
embeddings = db.all_embeddings()
|
||||
if len(embeddings) < 5:
|
||||
@@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]:
|
||||
matrix = np.array([embeddings[n] for n in names])
|
||||
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1),
|
||||
random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix)
|
||||
@@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
|
||||
comparison_text: str | None,
|
||||
}
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
drafts_data = []
|
||||
all_ideas: dict[str, list[dict]] = {}
|
||||
@@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None:
|
||||
|
||||
def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
    """Search-only (free) — returns sources + cached answer if available.

    Args:
        db: Open database handle.
        question: Free-text question to search for.
        top_k: Maximum number of source documents to return.
    """
    # Config and HybridSearch are imported at module level; the redundant
    # in-function imports have been removed.
    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.search_only(question, top_k=top_k)
|
||||
@@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict:
|
||||
|
||||
def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict:
    """Run Claude synthesis (costs tokens, result is cached permanently).

    Args:
        db: Open database handle.
        question: Question to answer.
        top_k: Number of source documents fed into synthesis.
        cheap: When True, use the cheaper model variant.
    """
    # Config and HybridSearch are imported at module level; the redundant
    # in-function imports have been removed.
    config = Config.load()
    searcher = HybridSearch(config, db)
    return searcher.ask(question, top_k=top_k, cheap=cheap)
|
||||
|
||||
Reference in New Issue
Block a user