diff --git a/src/webui/app.py b/src/webui/app.py index 9878cb6..0b2f565 100644 --- a/src/webui/app.py +++ b/src/webui/app.py @@ -5,935 +5,81 @@ Run with: python src/webui/app.py from __future__ import annotations +import logging +import os import sys +import time from pathlib import Path # Ensure project src is on path _project_root = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(_project_root / "src")) -import csv -import io -import json -import time -import functools -from collections import defaultdict +from flask import Flask, g, render_template, request -from flask import Flask, render_template, request, jsonify, abort, g, Response, redirect, url_for +from webui.auth import init_auth +from webui.analytics import init_analytics +from webui.blueprints import register_blueprints +from webui.data import get_db -from webui.auth import admin_required, init_auth -from webui.analytics import init_analytics, get_analytics_data -from webui.obsidian_export import build_obsidian_vault -from webui.data import ( - get_db, - get_overview_stats, - get_category_counts, - get_drafts_page, - get_draft_detail, - get_rating_distributions, - get_timeline_data, - get_ideas_by_type, - get_all_gaps, - get_gap_detail, - get_generated_drafts, - read_generated_draft, - get_top_authors, - get_org_data, - get_category_radar_data, - get_score_histogram, - get_coauthor_network, - get_cross_org_data, - get_landscape_tsne, - get_similarity_graph, - get_timeline_animation_data, - get_idea_clusters, - get_monitor_status, - get_author_network_full, - get_citation_graph, - get_comparison_data, - get_ask_search, - get_ask_synthesize, - get_category_summary, - global_search, - get_architecture, - get_source_comparison, - get_false_positive_profile, - get_citation_influence, - get_bcp_analysis, - get_trends_data, - get_complexity_data, - get_idea_analysis, - get_all_proposals, - get_proposal_detail, - get_proposals_for_gap, -) - -app = Flask( - __name__, - 
template_folder=str(Path(__file__).parent / "templates"), - static_folder=str(Path(__file__).parent / "static"), - static_url_path="/static", -) -import os -app.config["SECRET_KEY"] = os.environ.get("FLASK_SECRET_KEY", os.urandom(24).hex()) -# Auth is initialized at startup — see __main__ block and create_app() -# Default: production mode (admin disabled) -init_auth(app, dev=False) - -# Analytics (GDPR-compliant, no cookies) -_analytics_db = str(_project_root / "data" / "analytics.db") -init_analytics(app, db_path=_analytics_db) - - -# --- Rate limiting for Claude-calling endpoints --- - -_rate_limit_store: dict[str, list[float]] = defaultdict(list) -_RATE_LIMIT_MAX = 10 # max requests -_RATE_LIMIT_WINDOW = 60 # per 60 seconds - - -def rate_limit(f): - """Simple in-memory rate limiter: max 10 requests per minute per IP.""" - @functools.wraps(f) - def wrapper(*args, **kwargs): - ip = request.remote_addr or "unknown" - now = time.time() - # Prune timestamps outside the sliding window - timestamps = _rate_limit_store[ip] - _rate_limit_store[ip] = [t for t in timestamps if now - t < _RATE_LIMIT_WINDOW] - if len(_rate_limit_store[ip]) >= _RATE_LIMIT_MAX: - return jsonify({"error": "Rate limit exceeded. 
Try again later."}), 429 - _rate_limit_store[ip].append(now) - return f(*args, **kwargs) - return wrapper - - -# --- Database lifecycle (per-request to avoid SQLite threading issues) --- - - -def db(): - if "db" not in g: - g.db = get_db() - return g.db - - -@app.teardown_appcontext -def close_db(exception=None): - database = g.pop("db", None) - if database is not None: - database.close() - - -# --- Routes --- - - -@app.route("/") -def overview(): - stats = get_overview_stats(db()) - categories = get_category_counts(db()) - timeline = get_timeline_data(db()) - scores = get_score_histogram(db()) - radar = get_category_radar_data(db()) - return render_template( - "overview.html", - stats=stats, - categories=categories, - timeline=timeline, - scores=scores, - radar=radar, - ) - - -@app.route("/drafts") -def drafts(): - page = request.args.get("page", 1, type=int) - search = request.args.get("q", "") - category = request.args.get("cat", "") - source = request.args.get("source", "") - min_score = request.args.get("min_score", 0.0, type=float) - sort = request.args.get("sort", "score") - sort_dir = request.args.get("dir", "desc") - - result = get_drafts_page( - db(), - page=page, - search=search, - category=category, - min_score=min_score, - sort=sort, - sort_dir=sort_dir, - source=source, - ) - categories = get_category_counts(db()) - cat_summary = get_category_summary(db(), category) if category else None - return render_template( - "drafts.html", - result=result, - categories=categories, - cat_summary=cat_summary, - search=search, - current_cat=category, - current_source=source, - min_score=min_score, - sort=sort, - sort_dir=sort_dir, - ) - - -@app.route("/drafts/") -def draft_detail(name: str): - database = db() - detail = get_draft_detail(database, name) - if not detail: - abort(404) - # Build set of draft ref IDs that exist in our DB for internal linking - ref_draft_ids = [r["id"] for r in detail.get("refs", []) if r["type"] == "draft"] - known_drafts = set() - if 
ref_draft_ids: - placeholders = ",".join("?" * len(ref_draft_ids)) - rows = database.conn.execute( - f"SELECT name FROM drafts WHERE name IN ({placeholders})", ref_draft_ids - ).fetchall() - known_drafts = {r["name"] for r in rows} - return render_template("draft_detail.html", draft=detail, known_drafts=known_drafts) - - -@app.route("/ideas") -def ideas(): - data = get_ideas_by_type(db()) - return render_template("ideas.html", data=data) - - -@app.route("/gaps") -@admin_required -def gaps(): - gap_list = get_all_gaps(db()) - generated = get_generated_drafts() - return render_template("gaps.html", gaps=gap_list, generated_drafts=generated) - - -@app.route("/gaps/demo") -@admin_required -def gaps_demo(): - """Show a pre-generated example draft so users can see output without API calls.""" - generated = get_generated_drafts() - # Default to the first generated draft, or allow selection via query param - selected = request.args.get("file", "") - draft_text = None - draft_info = None - if selected: - draft_text = read_generated_draft(selected) - for gd in generated: - if gd["filename"] == selected: - draft_info = gd - break - elif generated: - draft_info = generated[0] - draft_text = read_generated_draft(draft_info["filename"]) - return render_template( - "gap_demo.html", - generated_drafts=generated, - draft_text=draft_text, - draft_info=draft_info, - selected=selected, - ) - - -@app.route("/gaps/") -@admin_required -def gap_detail(gap_id: int): - gap = get_gap_detail(db(), gap_id) - if not gap: - abort(404) - generated = get_generated_drafts() - gap_proposals = get_proposals_for_gap(db(), gap_id) - return render_template("gap_detail.html", gap=gap, generated_drafts=generated, proposals=gap_proposals) - - -@app.route("/gaps//generate", methods=["POST"]) -@admin_required -def gap_generate(gap_id: int): - """Trigger draft generation for a gap. 
Returns JSON with the generated text.""" - gap = get_gap_detail(db(), gap_id) - if not gap: - return jsonify({"error": "Gap not found"}), 404 - - try: - from ietf_analyzer.config import Config - from ietf_analyzer.analyzer import Analyzer - from ietf_analyzer.draftgen import DraftGenerator - - cfg = Config.load() - database = db() - analyzer = Analyzer(cfg, database) - generator = DraftGenerator(cfg, database, analyzer) - - # Generate into a file named after the gap - slug = gap["topic"].lower().replace(" ", "-")[:40] - output_path = str(Path(_project_root) / "data" / "reports" / "generated-drafts" / f"draft-gap-{gap_id}-{slug}.txt") - path = generator.generate(gap["topic"], output_path=output_path) - draft_text = Path(path).read_text(errors="replace") - - return jsonify({ - "success": True, - "text": draft_text, - "filename": Path(path).name, - "path": path, - }) - except Exception as e: - return jsonify({"error": str(e)}), 500 - - -@app.route("/ratings") -def ratings(): - distributions = get_rating_distributions(db()) - radar = get_category_radar_data(db()) - return render_template( - "ratings.html", - dist=distributions, - radar=radar, - ) - - -@app.route("/landscape") -@admin_required -def landscape(): - distributions = get_rating_distributions(db()) - tsne_data = get_landscape_tsne(db()) - return render_template( - "landscape.html", - dist=distributions, - tsne_data=tsne_data, - ) - - -@app.route("/timeline") -def timeline_animation(): - data = get_timeline_animation_data(db()) - return render_template("timeline.html", animation=data) - - -@app.route("/idea-clusters") -def idea_clusters(): - data = get_idea_clusters(db()) - return render_template("idea_clusters.html", clusters=data) - - -@app.route("/architecture") -def architecture(): - data = get_architecture(db()) - return render_template("architecture.html", arch=data) - - -@app.route("/api/architecture") -def api_architecture(): - return jsonify(get_architecture(db())) - - -@app.route("/similarity") 
-@admin_required -def similarity(): - network = get_similarity_graph(db()) - return render_template("similarity.html", network=network) - - -@app.route("/authors") -def authors(): - top = get_top_authors(db(), limit=50) - orgs = get_org_data(db(), limit=20) - network = get_author_network_full(db()) - cross_org = get_cross_org_data(db(), limit=20) - return render_template( - "authors.html", - authors=top, - orgs=orgs, - orgs_data=orgs, - network=network, - cross_org=cross_org, - ) - - -@app.route("/citations") -def citations(): - from webui.auth import is_admin as check_admin - graph = get_citation_graph(db()) - influence = get_citation_influence(db()) if check_admin() else None - bcp = get_bcp_analysis(db()) if check_admin() else None - return render_template("citations.html", graph=graph, influence=influence, bcp=bcp) - - -@app.route("/monitor") -@admin_required -def monitor_page(): - status = get_monitor_status(db()) - return render_template("monitor.html", status=status) - - -@app.route("/admin/analytics") -@admin_required -def analytics_dashboard(): - data = get_analytics_data(_analytics_db) - return render_template("analytics.html", data=data) - - -@app.route("/about") -def about(): - from ietf_analyzer.config import Config - cfg = Config.load() - stats = get_overview_stats(db()) - return render_template("about.html", stats=stats, search_keywords=cfg.search_keywords, - fetch_since=cfg.fetch_since) - - -@app.route("/impressum") -def impressum(): - return render_template("impressum.html") - - -@app.route("/datenschutz") -def datenschutz(): - return render_template("datenschutz.html") - - -@app.route("/search") -def search(): - q = request.args.get("q", "").strip() - results = global_search(db(), q) if q else {"drafts": [], "ideas": [], "authors": [], "gaps": []} - total = sum(len(v) for v in results.values()) - return render_template("search_results.html", query=q, results=results, total=total) - - -@app.route("/ask") -def ask_page(): - question = 
request.args.get("q", "") - result = None - if question: - top_k = request.args.get("top", 5, type=int) - # Search only (free) — returns sources + cached answer if available - result = get_ask_search(db(), question, top_k=top_k) - return render_template("ask.html", question=question, result=result) - - -@app.route("/api/ask/synthesize", methods=["POST"]) -@admin_required -@rate_limit -def api_ask_synthesize(): - """Synthesize an answer via Claude (costs tokens, cached permanently). Returns JSON.""" - data = request.get_json(force=True, silent=True) - if not data or "question" not in data: - return jsonify({"error": "Missing 'question' in request body"}), 400 - question = data["question"] - top_k = data.get("top_k", 5) - result = get_ask_synthesize(db(), question, top_k=top_k, cheap=True) - return jsonify(result) - - -@app.route("/api/ask", methods=["POST"]) -def api_ask(): - """Search only (free). Returns JSON with sources + cached answer if available.""" - data = request.get_json(force=True, silent=True) - if not data or "question" not in data: - return jsonify({"error": "Missing 'question' in request body"}), 400 - question = data["question"] - top_k = data.get("top_k", 5) - result = get_ask_search(db(), question, top_k=top_k) - return jsonify(result) - - -@app.route("/compare") -@admin_required -def compare_page(): - draft_names = request.args.get("drafts", "") - names = [n.strip() for n in draft_names.split(",") if n.strip()] if draft_names else [] - data = None - if len(names) >= 2: - data = get_comparison_data(db(), names) - return render_template("comparison.html", names=names, data=data) - - -@app.route("/api/compare", methods=["POST"]) -@admin_required -@rate_limit -def api_compare(): - """Run Claude comparison for drafts. 
Returns JSON with comparison text."""
-    req_data = request.get_json(force=True, silent=True)
-    if not req_data or "drafts" not in req_data:
-        return jsonify({"error": "Missing 'drafts' in request body"}), 400
-
-    names = req_data["drafts"]
-    if len(names) < 2:
-        return jsonify({"error": "Need at least 2 drafts to compare"}), 400
-
-    try:
-        from ietf_analyzer.config import Config
-        from ietf_analyzer.analyzer import Analyzer
-
-        cfg = Config.load()
-        database = db()
-        analyzer = Analyzer(cfg, database)
-        result = analyzer.compare_drafts(names)
-        return jsonify(result)
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
-
-
-# --- API endpoints for AJAX (used by client-side charts) ---
-
-
-def _to_csv_response(rows: list[dict], filename: str = "export.csv") -> Response:
-    """Convert a list of dicts to a CSV download response."""
-    if not rows:
-        return Response("", mimetype="text/csv",
-                        headers={"Content-Disposition": f"attachment; filename={filename}"})
-    si = io.StringIO()
-    writer = csv.DictWriter(si, fieldnames=rows[0].keys())
-    writer.writeheader()
-    for row in rows:
-        # Flatten any list/dict values to JSON strings
-        flat = {}
-        for k, v in row.items():
-            if isinstance(v, (list, dict)):
-                flat[k] = json.dumps(v)
-            else:
-                flat[k] = v
-        writer.writerow(flat)
-    return Response(si.getvalue(), mimetype="text/csv",
-                    headers={"Content-Disposition": f"attachment; filename={filename}"})
-
-
-def _results_to_csv(results: dict) -> Response:
-    """Convert global search results (multi-category) to a single CSV."""
-    rows = []
-    for category, items in results.items():
-        for item in items:
-            row = {"_category": category}
-            row.update(item)
-            rows.append(row)
-    return _to_csv_response(rows, "search_results.csv")
-
-
-@app.route("/api/drafts")
-def api_drafts():
-    page = request.args.get("page", 1, type=int)
-    search = request.args.get("q", "")
-    category = request.args.get("cat", "")
-    source = request.args.get("source", "")
-    min_score = 
request.args.get("min_score", 0.0, type=float) - sort = request.args.get("sort", "score") - sort_dir = request.args.get("dir", "desc") - data = get_drafts_page(db(), page=page, search=search, category=category, - min_score=min_score, sort=sort, sort_dir=sort_dir, - source=source) - if request.args.get("format") == "csv": - return _to_csv_response(data.get("drafts", []), "drafts.csv") - return jsonify(data) - - -@app.route("/api/stats") -def api_stats(): - return jsonify(get_overview_stats(db())) - - -@app.route("/api/authors/network") -def api_author_network(): - return jsonify(get_author_network_full(db())) - - -@app.route("/api/citations") -def api_citations(): - min_refs = request.args.get("min_refs", 2, type=int) - return jsonify(get_citation_graph(db(), min_refs=min_refs)) - - -@app.route("/api/search") -def api_search(): - q = request.args.get("q", "").strip() - results = global_search(db(), q) if q else {"drafts": [], "ideas": [], "authors": [], "gaps": []} - if request.args.get("format") == "csv": - return _results_to_csv(results) - return jsonify(results) - - -@app.route("/api/ideas") -def api_ideas(): - data = get_ideas_by_type(db()) - if request.args.get("format") == "csv": - return _to_csv_response(data.get("ideas", []), "ideas.csv") - return jsonify(data) - - -@app.route("/api/gaps") -@admin_required -def api_gaps(): - data = get_all_gaps(db()) - if request.args.get("format") == "csv": - return _to_csv_response(data, "gaps.csv") - return jsonify(data) - - -@app.route("/api/gaps/") -@admin_required -def api_gap_detail(gap_id: int): - gap = get_gap_detail(db(), gap_id) - if not gap: - return jsonify({"error": "Gap not found"}), 404 - return jsonify(gap) - - -@app.route("/api/ratings") -def api_ratings(): - data = get_rating_distributions(db()) - if request.args.get("format") == "csv": - # Transpose columnar data to rows - rows = [] - for i in range(len(data.get("names", []))): - rows.append({ - "name": data["names"][i], - "score": data["scores"][i], - 
"novelty": data["novelty"][i], - "maturity": data["maturity"][i], - "overlap": data["overlap"][i], - "momentum": data["momentum"][i], - "relevance": data["relevance"][i], - "category": data["categories"][i], - }) - return _to_csv_response(rows, "ratings.csv") - return jsonify(data) - - -@app.route("/api/timeline") -def api_timeline(): - data = get_timeline_data(db()) - return jsonify(data) - - -@app.route("/api/landscape") -@admin_required -def api_landscape(): - data = get_landscape_tsne(db()) - if request.args.get("format") == "csv": - return _to_csv_response(data, "landscape.csv") - return jsonify(data) - - -@app.route("/api/similarity") -@admin_required -def api_similarity(): - data = get_similarity_graph(db()) - return jsonify(data) - - -@app.route("/api/idea-clusters") -def api_idea_clusters(): - data = get_idea_clusters(db()) - return jsonify(data) - - -@app.route("/api/monitor") -@admin_required -def api_monitor(): - data = get_monitor_status(db()) - return jsonify(data) - - -@app.route("/api/drafts/") -def api_draft_detail(name: str): - detail = get_draft_detail(db(), name) - if not detail: - return jsonify({"error": "Draft not found"}), 404 - return jsonify(detail) - - -@app.route("/api/categories") -def api_categories(): - data = get_category_counts(db()) - if request.args.get("format") == "csv": - rows = [{"category": k, "count": v} for k, v in data.items()] - return _to_csv_response(rows, "categories.csv") - return jsonify(data) - - -@app.route("/api/drafts//annotate", methods=["POST"]) -@admin_required -def api_annotate(name: str): - """Add or update annotation for a draft.""" - import json as _json - database = db() - draft = database.get_draft(name) - if not draft: - return jsonify({"error": "Draft not found"}), 404 - - data = request.get_json(force=True, silent=True) - if not data: - return jsonify({"error": "Invalid JSON body"}), 400 - - note = data.get("note") - tags = data.get("tags") - add_tag = data.get("add_tag") - remove_tag = 
data.get("remove_tag") - - # Handle add/remove tag operations - if add_tag or remove_tag: - existing = database.get_annotation(name) - current_tags = existing["tags"] if existing else [] - if add_tag and add_tag not in current_tags: - current_tags.append(add_tag) - if remove_tag and remove_tag in current_tags: - current_tags.remove(remove_tag) - tags = current_tags - - database.upsert_annotation(name, note=note, tags=tags) - annotation = database.get_annotation(name) - return jsonify({"success": True, "annotation": annotation}) - - -@app.route("/export/obsidian") -@admin_required -def export_obsidian(): - """Download the entire research corpus as an Obsidian vault (ZIP).""" - data = build_obsidian_vault(db()) - return Response( - data, - mimetype="application/zip", - headers={"Content-Disposition": "attachment; filename=IETF-AI-Agent-Drafts.zip"}, - ) +# --- App factory --- def create_app(dev: bool = False) -> Flask: - """Re-initialize auth mode. Call before run() if needed.""" - init_auth(app, dev=dev) - return app + """Create and configure the Flask application.""" + application = Flask( + __name__, + template_folder=str(Path(__file__).parent / "templates"), + static_folder=str(Path(__file__).parent / "static"), + static_url_path="/static", + ) + application.config["SECRET_KEY"] = os.environ.get("FLASK_SECRET_KEY", os.urandom(24).hex()) + + # Auth + init_auth(application, dev=dev) + + # Analytics (GDPR-compliant, no cookies) + analytics_db = str(_project_root / "data" / "analytics.db") + init_analytics(application, db_path=analytics_db) + + # Register blueprints + register_blueprints(application) + + # Database lifecycle (per-request) + @application.teardown_appcontext + def close_db(exception=None): + database = g.pop("db", None) + if database is not None: + database.close() + + # Error handlers + @application.errorhandler(404) + def not_found(e): + return render_template("errors/404.html"), 404 + + @application.errorhandler(500) + def server_error(e): + return 
render_template("errors/500.html"), 500 + + # Request timing + @application.before_request + def start_timer(): + g.start_time = time.time() + + @application.after_request + def log_request(response): + if hasattr(g, "start_time"): + duration = (time.time() - g.start_time) * 1000 + logger = logging.getLogger("webui") + logger.info("%s %s %s %.1fms", request.method, request.path, + response.status_code, duration) + return response + + return application -# ── Sources & False Positives ──────────────────────────────────────────── - - -@app.route("/sources") -@admin_required -def sources_page(): - data = get_source_comparison(db()) - return render_template("sources.html", data=data) - - -@app.route("/false-positives") -@admin_required -def false_positives_page(): - data = get_false_positive_profile(db()) - return render_template("false_positives.html", data=data) - - -@app.route("/api/sources") -@admin_required -def api_sources(): - data = get_source_comparison(db()) - return jsonify(data) - - -@app.route("/api/false-positives") -@admin_required -def api_false_positives(): - data = get_false_positive_profile(db()) - return jsonify(data) - - -# ── Citation Influence & BCP ───────────────────────────────────────────── - - -@app.route("/api/citations/influence") -@admin_required -def api_citation_influence(): - return jsonify(get_citation_influence(db())) - - -@app.route("/api/citations/bcp") -@admin_required -def api_bcp_analysis(): - return jsonify(get_bcp_analysis(db())) - - -# ── Idea Analysis ──────────────────────────────────────────────────────── - - -@app.route("/idea-analysis") -@admin_required -def idea_analysis(): - data = get_idea_analysis(db()) - return render_template("idea_analysis.html", data=data) - - -@app.route("/api/idea-analysis") -@admin_required -def api_idea_analysis(): - data = get_idea_analysis(db()) - return jsonify(data) - - -# ── Trends & Complexity ────────────────────────────────────────────────── - - -@app.route("/trends") -@admin_required 
-def trends(): - data = get_trends_data(db()) - return render_template("trends_analysis.html", data=data) - - -@app.route("/complexity") -@admin_required -def complexity(): - data = get_complexity_data(db()) - return render_template("complexity.html", data=data) - - -@app.route("/api/trends") -@admin_required -def api_trends(): - return jsonify(get_trends_data(db())) - - -@app.route("/api/complexity") -@admin_required -def api_complexity(): - return jsonify(get_complexity_data(db())) - - -# ── Proposals (dev-only) ──────────────────────────────────────────────── - - -@app.route("/proposals") -@admin_required -def proposals(): - proposal_list = get_all_proposals(db()) - gap_list = get_all_gaps(db()) - return render_template("proposals.html", proposals=proposal_list, gaps=gap_list) - - -@app.route("/proposals/new", methods=["GET", "POST"]) -@admin_required -def proposal_new(): - if request.method == "POST": - data = request.form - slug = data.get("slug", "").strip() - if not slug: - import re - slug = re.sub(r'[^a-z0-9]+', '-', data["title"].lower()).strip('-') - gap_ids = [int(g) for g in request.form.getlist("gap_ids") if g] - proposal = { - "title": data["title"], - "slug": slug, - "status": data.get("status", "idea"), - "description": data.get("description", ""), - "content_md": data.get("content_md", ""), - "source_paper": data.get("source_paper", ""), - "source_url": data.get("source_url", ""), - "intended_wg": data.get("intended_wg", ""), - "draft_name": data.get("draft_name", ""), - "gap_ids": gap_ids, - } - pid = db().upsert_proposal(proposal) - return redirect(url_for("proposal_detail", proposal_id=pid)) - gap_list = get_all_gaps(db()) - return render_template("proposal_edit.html", proposal=None, gaps=gap_list) - - -@app.route("/proposals/") -@admin_required -def proposal_detail(proposal_id): - proposal = get_proposal_detail(db(), proposal_id) - if not proposal: - abort(404) - return render_template("proposal_detail.html", proposal=proposal) - - 
-@app.route("/proposals//edit", methods=["GET", "POST"]) -@admin_required -def proposal_edit(proposal_id): - if request.method == "POST": - data = request.form - slug = data.get("slug", "").strip() - if not slug: - import re - slug = re.sub(r'[^a-z0-9]+', '-', data["title"].lower()).strip('-') - gap_ids = [int(g) for g in request.form.getlist("gap_ids") if g] - proposal = { - "id": proposal_id, - "title": data["title"], - "slug": slug, - "status": data.get("status", "idea"), - "description": data.get("description", ""), - "content_md": data.get("content_md", ""), - "source_paper": data.get("source_paper", ""), - "source_url": data.get("source_url", ""), - "intended_wg": data.get("intended_wg", ""), - "draft_name": data.get("draft_name", ""), - "gap_ids": gap_ids, - } - db().upsert_proposal(proposal) - return redirect(url_for("proposal_detail", proposal_id=proposal_id)) - proposal = get_proposal_detail(db(), proposal_id) - if not proposal: - abort(404) - gap_list = get_all_gaps(db()) - return render_template("proposal_edit.html", proposal=proposal, gaps=gap_list) - - -@app.route("/proposals//delete", methods=["POST"]) -@admin_required -def proposal_delete(proposal_id): - db().delete_proposal(proposal_id) - return redirect(url_for("proposals")) - - -@app.route("/api/proposals") -@admin_required -def api_proposals(): - data = get_all_proposals(db()) - return jsonify(data) - - -@app.route("/api/proposals/") -@admin_required -def api_proposal_detail(proposal_id): - p = get_proposal_detail(db(), proposal_id) - if not p: - return jsonify({"error": "Proposal not found"}), 404 - return jsonify(p) - - -@app.route("/proposals/intake", methods=["GET", "POST"]) -@admin_required -def proposal_intake(): - """Paste text/URLs → Claude generates proposals automatically.""" - if request.method == "POST": - raw_input = request.form.get("input_text", "").strip() - if not raw_input: - return jsonify({"error": "No input provided"}), 400 - - try: - from ietf_analyzer.config import Config 
- from ietf_analyzer.proposal_intake import ProposalIntake - - cfg = Config.load() - intake = ProposalIntake(cfg, db()) - proposals, usage = intake.process(raw_input, cheap=True) - - return jsonify({ - "success": True, - "count": len(proposals), - "proposals": [ - {"id": p.get("id"), "title": p.get("title"), "slug": p.get("slug"), - "gap_ids": p.get("gap_ids", []), "description": p.get("description", ""), - "content_md": p.get("content_md", ""), - "intended_wg": p.get("intended_wg", ""), "draft_name": p.get("draft_name", ""), - "source_paper": p.get("source_paper", ""), "source_url": p.get("source_url", "")} - for p in proposals - ], - "usage": usage, - }) - except Exception as e: - return jsonify({"error": str(e)}), 500 - - return render_template("proposal_intake.html") +# Module-level app instance for backward compatibility (import from webui.app import app) +app = create_app(dev=False) if __name__ == "__main__": @@ -946,7 +92,7 @@ if __name__ == "__main__": parser.add_argument("--port", type=int, default=5000) args = parser.parse_args() - init_auth(app, dev=args.dev) + app = create_app(dev=args.dev) mode = "\033[33mDEV\033[0m (admin enabled)" if args.dev else "\033[32mPRODUCTION\033[0m (admin disabled)" print(f"Starting IETF Draft Analyzer — {mode}") diff --git a/src/webui/blueprints/__init__.py b/src/webui/blueprints/__init__.py new file mode 100644 index 0000000..f7d42fb --- /dev/null +++ b/src/webui/blueprints/__init__.py @@ -0,0 +1,15 @@ +"""Flask blueprints for the IETF Draft Analyzer web UI.""" +from __future__ import annotations + +from flask import Flask + +from webui.blueprints.pages import pages_bp +from webui.blueprints.api import api_bp +from webui.blueprints.admin import admin_bp + + +def register_blueprints(app: Flask) -> None: + """Register all blueprints with the Flask app.""" + app.register_blueprint(pages_bp) + app.register_blueprint(api_bp) + app.register_blueprint(admin_bp) diff --git a/src/webui/blueprints/admin.py 
b/src/webui/blueprints/admin.py new file mode 100644 index 0000000..de02bf7 --- /dev/null +++ b/src/webui/blueprints/admin.py @@ -0,0 +1,562 @@ +"""Admin-only routes (require @admin_required).""" +from __future__ import annotations + +import functools +import time +from collections import defaultdict +from pathlib import Path + +from flask import Blueprint, render_template, request, jsonify, abort, g, Response, redirect, url_for + +from webui.auth import admin_required +from webui.analytics import get_analytics_data +from webui.obsidian_export import build_obsidian_vault +from webui.data import ( + get_db, + get_overview_stats, + get_rating_distributions, + get_all_gaps, + get_gap_detail, + get_generated_drafts, + read_generated_draft, + get_monitor_status, + get_landscape_tsne, + get_similarity_graph, + get_comparison_data, + get_ask_search, + get_ask_synthesize, + get_source_comparison, + get_false_positive_profile, + get_citation_influence, + get_bcp_analysis, + get_idea_analysis, + get_trends_data, + get_complexity_data, + get_all_proposals, + get_proposal_detail, + get_proposals_for_gap, +) + +admin_bp = Blueprint("admin", __name__) + +_project_root = Path(__file__).resolve().parent.parent.parent.parent + +# --- Rate limiting for Claude-calling endpoints --- + +_rate_limit_store: dict[str, list[float]] = defaultdict(list) +_RATE_LIMIT_MAX = 10 # max requests +_RATE_LIMIT_WINDOW = 60 # per 60 seconds + + +def rate_limit(f): + """Simple in-memory rate limiter: max 10 requests per minute per IP.""" + @functools.wraps(f) + def wrapper(*args, **kwargs): + ip = request.remote_addr or "unknown" + now = time.time() + timestamps = _rate_limit_store[ip] + _rate_limit_store[ip] = [t for t in timestamps if now - t < _RATE_LIMIT_WINDOW] + if len(_rate_limit_store[ip]) >= _RATE_LIMIT_MAX: + return jsonify({"error": "Rate limit exceeded. 
Try again later."}), 429 + _rate_limit_store[ip].append(now) + return f(*args, **kwargs) + return wrapper + + +def db(): + if "db" not in g: + g.db = get_db() + return g.db + + +# ── Gap pages ──────────────────────────────────────────────────────────── + +@admin_bp.route("/gaps") +@admin_required +def gaps(): + gap_list = get_all_gaps(db()) + generated = get_generated_drafts() + return render_template("gaps.html", gaps=gap_list, generated_drafts=generated) + + +@admin_bp.route("/gaps/demo") +@admin_required +def gaps_demo(): + """Show a pre-generated example draft so users can see output without API calls.""" + generated = get_generated_drafts() + selected = request.args.get("file", "") + draft_text = None + draft_info = None + if selected: + draft_text = read_generated_draft(selected) + for gd in generated: + if gd["filename"] == selected: + draft_info = gd + break + elif generated: + draft_info = generated[0] + draft_text = read_generated_draft(draft_info["filename"]) + return render_template( + "gap_demo.html", + generated_drafts=generated, + draft_text=draft_text, + draft_info=draft_info, + selected=selected, + ) + + +@admin_bp.route("/gaps/") +@admin_required +def gap_detail(gap_id: int): + gap = get_gap_detail(db(), gap_id) + if not gap: + abort(404) + generated = get_generated_drafts() + gap_proposals = get_proposals_for_gap(db(), gap_id) + return render_template("gap_detail.html", gap=gap, generated_drafts=generated, proposals=gap_proposals) + + +@admin_bp.route("/gaps//generate", methods=["POST"]) +@admin_required +def gap_generate(gap_id: int): + """Trigger draft generation for a gap. 
Returns JSON with the generated text.""" + gap = get_gap_detail(db(), gap_id) + if not gap: + return jsonify({"error": "Gap not found"}), 404 + + try: + from ietf_analyzer.config import Config + from ietf_analyzer.analyzer import Analyzer + from ietf_analyzer.draftgen import DraftGenerator + + cfg = Config.load() + database = db() + analyzer = Analyzer(cfg, database) + generator = DraftGenerator(cfg, database, analyzer) + + slug = gap["topic"].lower().replace(" ", "-")[:40] + output_path = str(Path(_project_root) / "data" / "reports" / "generated-drafts" / f"draft-gap-{gap_id}-{slug}.txt") + path = generator.generate(gap["topic"], output_path=output_path) + draft_text = Path(path).read_text(errors="replace") + + return jsonify({ + "success": True, + "text": draft_text, + "filename": Path(path).name, + "path": path, + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@admin_bp.route("/api/gaps") +@admin_required +def api_gaps(): + from webui.blueprints.api import _to_csv_response + data = get_all_gaps(db()) + if request.args.get("format") == "csv": + return _to_csv_response(data, "gaps.csv") + return jsonify(data) + + +@admin_bp.route("/api/gaps/<int:gap_id>") +@admin_required +def api_gap_detail(gap_id: int): + gap = get_gap_detail(db(), gap_id) + if not gap: + return jsonify({"error": "Gap not found"}), 404 + return jsonify(gap) + + +# ── Monitor ────────────────────────────────────────────────────────────── + +@admin_bp.route("/monitor") +@admin_required +def monitor_page(): + status = get_monitor_status(db()) + return render_template("monitor.html", status=status) + + +@admin_bp.route("/api/monitor") +@admin_required +def api_monitor(): + data = get_monitor_status(db()) + return jsonify(data) + + +# ── Analytics ──────────────────────────────────────────────────────────── + +@admin_bp.route("/admin/analytics") +@admin_required +def analytics_dashboard(): + analytics_db = str(_project_root / "data" / "analytics.db") + data = 
get_analytics_data(analytics_db) + return render_template("analytics.html", data=data) + + +# ── Landscape & Similarity ─────────────────────────────────────────────── + +@admin_bp.route("/landscape") +@admin_required +def landscape(): + distributions = get_rating_distributions(db()) + tsne_data = get_landscape_tsne(db()) + return render_template( + "landscape.html", + dist=distributions, + tsne_data=tsne_data, + ) + + +@admin_bp.route("/api/landscape") +@admin_required +def api_landscape(): + from webui.blueprints.api import _to_csv_response + data = get_landscape_tsne(db()) + if request.args.get("format") == "csv": + return _to_csv_response(data, "landscape.csv") + return jsonify(data) + + +@admin_bp.route("/similarity") +@admin_required +def similarity(): + network = get_similarity_graph(db()) + return render_template("similarity.html", network=network) + + +@admin_bp.route("/api/similarity") +@admin_required +def api_similarity(): + data = get_similarity_graph(db()) + return jsonify(data) + + +# ── Compare ────────────────────────────────────────────────────────────── + +@admin_bp.route("/compare") +@admin_required +def compare_page(): + draft_names = request.args.get("drafts", "") + names = [n.strip() for n in draft_names.split(",") if n.strip()] if draft_names else [] + data = None + if len(names) >= 2: + data = get_comparison_data(db(), names) + return render_template("comparison.html", names=names, data=data) + + +@admin_bp.route("/api/compare", methods=["POST"]) +@admin_required +@rate_limit +def api_compare(): + """Run Claude comparison for drafts. 
Returns JSON with comparison text.""" + req_data = request.get_json(force=True, silent=True) + if not req_data or "drafts" not in req_data: + return jsonify({"error": "Missing 'drafts' in request body"}), 400 + + names = req_data["drafts"] + if len(names) < 2: + return jsonify({"error": "Need at least 2 drafts to compare"}), 400 + + try: + from ietf_analyzer.config import Config + from ietf_analyzer.analyzer import Analyzer + + cfg = Config.load() + database = db() + analyzer = Analyzer(cfg, database) + result = analyzer.compare_drafts(names) + return jsonify(result) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +# ── Annotations ────────────────────────────────────────────────────────── + +@admin_bp.route("/api/drafts/<name>/annotate", methods=["POST"]) +@admin_required +def api_annotate(name: str): + """Add or update annotation for a draft.""" + import json as _json + database = db() + draft = database.get_draft(name) + if not draft: + return jsonify({"error": "Draft not found"}), 404 + + data = request.get_json(force=True, silent=True) + if not data: + return jsonify({"error": "Invalid JSON body"}), 400 + + note = data.get("note") + tags = data.get("tags") + add_tag = data.get("add_tag") + remove_tag = data.get("remove_tag") + + if add_tag or remove_tag: + existing = database.get_annotation(name) + current_tags = existing["tags"] if existing else [] + if add_tag and add_tag not in current_tags: + current_tags.append(add_tag) + if remove_tag and remove_tag in current_tags: + current_tags.remove(remove_tag) + tags = current_tags + + database.upsert_annotation(name, note=note, tags=tags) + annotation = database.get_annotation(name) + return jsonify({"success": True, "annotation": annotation}) + + +# ── Ask/Synthesize (Claude-powered) ────────────────────────────────────── + +@admin_bp.route("/api/ask/synthesize", methods=["POST"]) +@admin_required +@rate_limit +def api_ask_synthesize(): + """Synthesize an answer via Claude (costs tokens, cached 
permanently). Returns JSON.""" + data = request.get_json(force=True, silent=True) + if not data or "question" not in data: + return jsonify({"error": "Missing 'question' in request body"}), 400 + question = data["question"] + top_k = data.get("top_k", 5) + result = get_ask_synthesize(db(), question, top_k=top_k, cheap=True) + return jsonify(result) + + +# ── Sources & False Positives ──────────────────────────────────────────── + +@admin_bp.route("/sources") +@admin_required +def sources_page(): + data = get_source_comparison(db()) + return render_template("sources.html", data=data) + + +@admin_bp.route("/false-positives") +@admin_required +def false_positives_page(): + data = get_false_positive_profile(db()) + return render_template("false_positives.html", data=data) + + +@admin_bp.route("/api/sources") +@admin_required +def api_sources(): + data = get_source_comparison(db()) + return jsonify(data) + + +@admin_bp.route("/api/false-positives") +@admin_required +def api_false_positives(): + data = get_false_positive_profile(db()) + return jsonify(data) + + +# ── Citation Influence & BCP ───────────────────────────────────────────── + +@admin_bp.route("/api/citations/influence") +@admin_required +def api_citation_influence(): + return jsonify(get_citation_influence(db())) + + +@admin_bp.route("/api/citations/bcp") +@admin_required +def api_bcp_analysis(): + return jsonify(get_bcp_analysis(db())) + + +# ── Idea Analysis ──────────────────────────────────────────────────────── + +@admin_bp.route("/idea-analysis") +@admin_required +def idea_analysis(): + data = get_idea_analysis(db()) + return render_template("idea_analysis.html", data=data) + + +@admin_bp.route("/api/idea-analysis") +@admin_required +def api_idea_analysis(): + data = get_idea_analysis(db()) + return jsonify(data) + + +# ── Trends & Complexity ────────────────────────────────────────────────── + +@admin_bp.route("/trends") +@admin_required +def trends(): + data = get_trends_data(db()) + return 
render_template("trends_analysis.html", data=data) + + +@admin_bp.route("/complexity") +@admin_required +def complexity(): + data = get_complexity_data(db()) + return render_template("complexity.html", data=data) + + +@admin_bp.route("/api/trends") +@admin_required +def api_trends(): + return jsonify(get_trends_data(db())) + + +@admin_bp.route("/api/complexity") +@admin_required +def api_complexity(): + return jsonify(get_complexity_data(db())) + + +# ── Proposals ──────────────────────────────────────────────────────────── + +@admin_bp.route("/proposals") +@admin_required +def proposals(): + proposal_list = get_all_proposals(db()) + gap_list = get_all_gaps(db()) + return render_template("proposals.html", proposals=proposal_list, gaps=gap_list) + + +@admin_bp.route("/proposals/new", methods=["GET", "POST"]) +@admin_required +def proposal_new(): + if request.method == "POST": + data = request.form + slug = data.get("slug", "").strip() + if not slug: + import re + slug = re.sub(r'[^a-z0-9]+', '-', data["title"].lower()).strip('-') + gap_ids = [int(g_val) for g_val in request.form.getlist("gap_ids") if g_val] + proposal = { + "title": data["title"], + "slug": slug, + "status": data.get("status", "idea"), + "description": data.get("description", ""), + "content_md": data.get("content_md", ""), + "source_paper": data.get("source_paper", ""), + "source_url": data.get("source_url", ""), + "intended_wg": data.get("intended_wg", ""), + "draft_name": data.get("draft_name", ""), + "gap_ids": gap_ids, + } + pid = db().upsert_proposal(proposal) + return redirect(url_for("admin.proposal_detail", proposal_id=pid)) + gap_list = get_all_gaps(db()) + return render_template("proposal_edit.html", proposal=None, gaps=gap_list) + + +@admin_bp.route("/proposals/<int:proposal_id>") +@admin_required +def proposal_detail(proposal_id): + proposal = get_proposal_detail(db(), proposal_id) + if not proposal: + abort(404) + return render_template("proposal_detail.html", proposal=proposal) + + 
+@admin_bp.route("/proposals/<int:proposal_id>/edit", methods=["GET", "POST"]) +@admin_required +def proposal_edit(proposal_id): + if request.method == "POST": + data = request.form + slug = data.get("slug", "").strip() + if not slug: + import re + slug = re.sub(r'[^a-z0-9]+', '-', data["title"].lower()).strip('-') + gap_ids = [int(g_val) for g_val in request.form.getlist("gap_ids") if g_val] + proposal = { + "id": proposal_id, + "title": data["title"], + "slug": slug, + "status": data.get("status", "idea"), + "description": data.get("description", ""), + "content_md": data.get("content_md", ""), + "source_paper": data.get("source_paper", ""), + "source_url": data.get("source_url", ""), + "intended_wg": data.get("intended_wg", ""), + "draft_name": data.get("draft_name", ""), + "gap_ids": gap_ids, + } + db().upsert_proposal(proposal) + return redirect(url_for("admin.proposal_detail", proposal_id=proposal_id)) + proposal = get_proposal_detail(db(), proposal_id) + if not proposal: + abort(404) + gap_list = get_all_gaps(db()) + return render_template("proposal_edit.html", proposal=proposal, gaps=gap_list) + + +@admin_bp.route("/proposals/<int:proposal_id>/delete", methods=["POST"]) +@admin_required +def proposal_delete(proposal_id): + db().delete_proposal(proposal_id) + return redirect(url_for("admin.proposals")) + + +@admin_bp.route("/api/proposals") +@admin_required +def api_proposals(): + data = get_all_proposals(db()) + return jsonify(data) + + +@admin_bp.route("/api/proposals/<int:proposal_id>") +@admin_required +def api_proposal_detail(proposal_id): + p = get_proposal_detail(db(), proposal_id) + if not p: + return jsonify({"error": "Proposal not found"}), 404 + return jsonify(p) + + +@admin_bp.route("/proposals/intake", methods=["GET", "POST"]) +@admin_required +def proposal_intake(): + """Paste text/URLs -> Claude generates proposals automatically.""" + if request.method == "POST": + raw_input = request.form.get("input_text", "").strip() + if not raw_input: + return jsonify({"error": "No input provided"}), 400 + 
+ try: + from ietf_analyzer.config import Config + from ietf_analyzer.proposal_intake import ProposalIntake + + cfg = Config.load() + intake = ProposalIntake(cfg, db()) + proposals_result, usage = intake.process(raw_input, cheap=True) + + return jsonify({ + "success": True, + "count": len(proposals_result), + "proposals": [ + {"id": p.get("id"), "title": p.get("title"), "slug": p.get("slug"), + "gap_ids": p.get("gap_ids", []), "description": p.get("description", ""), + "content_md": p.get("content_md", ""), + "intended_wg": p.get("intended_wg", ""), "draft_name": p.get("draft_name", ""), + "source_paper": p.get("source_paper", ""), "source_url": p.get("source_url", "")} + for p in proposals_result + ], + "usage": usage, + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + return render_template("proposal_intake.html") + + +# ── Obsidian Export ────────────────────────────────────────────────────── + +@admin_bp.route("/export/obsidian") +@admin_required +def export_obsidian(): + """Download the entire research corpus as an Obsidian vault (ZIP).""" + data = build_obsidian_vault(db()) + return Response( + data, + mimetype="application/zip", + headers={"Content-Disposition": "attachment; filename=IETF-AI-Agent-Drafts.zip"}, + ) diff --git a/src/webui/blueprints/api.py b/src/webui/blueprints/api.py new file mode 100644 index 0000000..96f51d8 --- /dev/null +++ b/src/webui/blueprints/api.py @@ -0,0 +1,180 @@ +"""Public API endpoints (JSON responses).""" +from __future__ import annotations + +import csv +import io +import json + +from flask import Blueprint, request, jsonify, g, Response + +from webui.data import ( + get_db, + get_overview_stats, + get_drafts_page, + get_draft_detail, + get_rating_distributions, + get_timeline_data, + get_ideas_by_type, + get_category_counts, + get_author_network_full, + get_citation_graph, + get_idea_clusters, + global_search, + get_architecture, + get_ask_search, +) + +api_bp = Blueprint("api", __name__) + + +def 
db(): + if "db" not in g: + g.db = get_db() + return g.db + + +def _to_csv_response(rows: list[dict], filename: str = "export.csv") -> Response: + """Convert a list of dicts to a CSV download response.""" + if not rows: + return Response("", mimetype="text/csv", + headers={"Content-Disposition": f"attachment; filename={filename}"}) + si = io.StringIO() + writer = csv.DictWriter(si, fieldnames=rows[0].keys()) + writer.writeheader() + for row in rows: + flat = {} + for k, v in row.items(): + if isinstance(v, (list, dict)): + flat[k] = json.dumps(v) + else: + flat[k] = v + writer.writerow(flat) + return Response(si.getvalue(), mimetype="text/csv", + headers={"Content-Disposition": f"attachment; filename={filename}"}) + + +def _results_to_csv(results: dict) -> Response: + """Convert global search results (multi-category) to a single CSV.""" + rows = [] + for category, items in results.items(): + for item in items: + row = {"_category": category} + row.update(item) + rows.append(row) + return _to_csv_response(rows, "search_results.csv") + + +@api_bp.route("/api/drafts") +def api_drafts(): + page = request.args.get("page", 1, type=int) + search = request.args.get("q", "") + category = request.args.get("cat", "") + source = request.args.get("source", "") + min_score = request.args.get("min_score", 0.0, type=float) + sort = request.args.get("sort", "score") + sort_dir = request.args.get("dir", "desc") + data = get_drafts_page(db(), page=page, search=search, category=category, + min_score=min_score, sort=sort, sort_dir=sort_dir, + source=source) + if request.args.get("format") == "csv": + return _to_csv_response(data.get("drafts", []), "drafts.csv") + return jsonify(data) + + +@api_bp.route("/api/stats") +def api_stats(): + return jsonify(get_overview_stats(db())) + + +@api_bp.route("/api/authors/network") +def api_author_network(): + return jsonify(get_author_network_full(db())) + + +@api_bp.route("/api/citations") +def api_citations(): + min_refs = 
request.args.get("min_refs", 2, type=int) + return jsonify(get_citation_graph(db(), min_refs=min_refs)) + + +@api_bp.route("/api/search") +def api_search(): + q = request.args.get("q", "").strip() + results = global_search(db(), q) if q else {"drafts": [], "ideas": [], "authors": [], "gaps": []} + if request.args.get("format") == "csv": + return _results_to_csv(results) + return jsonify(results) + + +@api_bp.route("/api/ideas") +def api_ideas(): + data = get_ideas_by_type(db()) + if request.args.get("format") == "csv": + return _to_csv_response(data.get("ideas", []), "ideas.csv") + return jsonify(data) + + +@api_bp.route("/api/ratings") +def api_ratings(): + data = get_rating_distributions(db()) + if request.args.get("format") == "csv": + rows = [] + for i in range(len(data.get("names", []))): + rows.append({ + "name": data["names"][i], + "score": data["scores"][i], + "novelty": data["novelty"][i], + "maturity": data["maturity"][i], + "overlap": data["overlap"][i], + "momentum": data["momentum"][i], + "relevance": data["relevance"][i], + "category": data["categories"][i], + }) + return _to_csv_response(rows, "ratings.csv") + return jsonify(data) + + +@api_bp.route("/api/timeline") +def api_timeline(): + data = get_timeline_data(db()) + return jsonify(data) + + +@api_bp.route("/api/idea-clusters") +def api_idea_clusters(): + data = get_idea_clusters(db()) + return jsonify(data) + + +@api_bp.route("/api/categories") +def api_categories(): + data = get_category_counts(db()) + if request.args.get("format") == "csv": + rows = [{"category": k, "count": v} for k, v in data.items()] + return _to_csv_response(rows, "categories.csv") + return jsonify(data) + + +@api_bp.route("/api/drafts/<name>") +def api_draft_detail(name: str): + detail = get_draft_detail(db(), name) + if not detail: + return jsonify({"error": "Draft not found"}), 404 + return jsonify(detail) + + +@api_bp.route("/api/architecture") +def api_architecture(): + return jsonify(get_architecture(db())) + + 
+@api_bp.route("/api/ask", methods=["POST"]) +def api_ask(): + """Search only (free). Returns JSON with sources + cached answer if available.""" + data = request.get_json(force=True, silent=True) + if not data or "question" not in data: + return jsonify({"error": "Missing 'question' in request body"}), 400 + question = data["question"] + top_k = data.get("top_k", 5) + result = get_ask_search(db(), question, top_k=top_k) + return jsonify(result) diff --git a/src/webui/blueprints/pages.py b/src/webui/blueprints/pages.py new file mode 100644 index 0000000..45636d2 --- /dev/null +++ b/src/webui/blueprints/pages.py @@ -0,0 +1,206 @@ +"""Public page routes (no admin required).""" +from __future__ import annotations + +from flask import Blueprint, render_template, request, abort, g + +from webui.data import ( + get_db, + get_overview_stats, + get_category_counts, + get_drafts_page, + get_draft_detail, + get_rating_distributions, + get_timeline_data, + get_timeline_animation_data, + get_ideas_by_type, + get_top_authors, + get_org_data, + get_category_radar_data, + get_score_histogram, + get_author_network_full, + get_cross_org_data, + get_citation_graph, + get_idea_clusters, + get_category_summary, + global_search, + get_architecture, + get_ask_search, + get_citation_influence, + get_bcp_analysis, +) + +pages_bp = Blueprint("pages", __name__) + + +def db(): + if "db" not in g: + g.db = get_db() + return g.db + + +@pages_bp.route("/") +def overview(): + stats = get_overview_stats(db()) + categories = get_category_counts(db()) + timeline = get_timeline_data(db()) + scores = get_score_histogram(db()) + radar = get_category_radar_data(db()) + return render_template( + "overview.html", + stats=stats, + categories=categories, + timeline=timeline, + scores=scores, + radar=radar, + ) + + +@pages_bp.route("/drafts") +def drafts(): + page = request.args.get("page", 1, type=int) + search = request.args.get("q", "") + category = request.args.get("cat", "") + source = 
request.args.get("source", "") + min_score = request.args.get("min_score", 0.0, type=float) + sort = request.args.get("sort", "score") + sort_dir = request.args.get("dir", "desc") + + result = get_drafts_page( + db(), + page=page, + search=search, + category=category, + min_score=min_score, + sort=sort, + sort_dir=sort_dir, + source=source, + ) + categories = get_category_counts(db()) + cat_summary = get_category_summary(db(), category) if category else None + return render_template( + "drafts.html", + result=result, + categories=categories, + cat_summary=cat_summary, + search=search, + current_cat=category, + current_source=source, + min_score=min_score, + sort=sort, + sort_dir=sort_dir, + ) + + +@pages_bp.route("/drafts/<name>") +def draft_detail(name: str): + database = db() + detail = get_draft_detail(database, name) + if not detail: + abort(404) + # Build set of draft ref IDs that exist in our DB for internal linking + ref_draft_ids = [r["id"] for r in detail.get("refs", []) if r["type"] == "draft"] + known_drafts = set() + if ref_draft_ids: + placeholders = ",".join("?" 
* len(ref_draft_ids)) + rows = database.conn.execute( + f"SELECT name FROM drafts WHERE name IN ({placeholders})", ref_draft_ids + ).fetchall() + known_drafts = {r["name"] for r in rows} + return render_template("draft_detail.html", draft=detail, known_drafts=known_drafts) + + +@pages_bp.route("/ideas") +def ideas(): + data = get_ideas_by_type(db()) + return render_template("ideas.html", data=data) + + +@pages_bp.route("/ratings") +def ratings(): + distributions = get_rating_distributions(db()) + radar = get_category_radar_data(db()) + return render_template( + "ratings.html", + dist=distributions, + radar=radar, + ) + + +@pages_bp.route("/timeline") +def timeline_animation(): + data = get_timeline_animation_data(db()) + return render_template("timeline.html", animation=data) + + +@pages_bp.route("/idea-clusters") +def idea_clusters(): + data = get_idea_clusters(db()) + return render_template("idea_clusters.html", clusters=data) + + +@pages_bp.route("/architecture") +def architecture(): + data = get_architecture(db()) + return render_template("architecture.html", arch=data) + + +@pages_bp.route("/authors") +def authors(): + top = get_top_authors(db(), limit=50) + orgs = get_org_data(db(), limit=20) + network = get_author_network_full(db()) + cross_org = get_cross_org_data(db(), limit=20) + return render_template( + "authors.html", + authors=top, + orgs=orgs, + orgs_data=orgs, + network=network, + cross_org=cross_org, + ) + + +@pages_bp.route("/citations") +def citations(): + from webui.auth import is_admin as check_admin + graph = get_citation_graph(db()) + influence = get_citation_influence(db()) if check_admin() else None + bcp = get_bcp_analysis(db()) if check_admin() else None + return render_template("citations.html", graph=graph, influence=influence, bcp=bcp) + + +@pages_bp.route("/about") +def about(): + from ietf_analyzer.config import Config + cfg = Config.load() + stats = get_overview_stats(db()) + return render_template("about.html", stats=stats, 
search_keywords=cfg.search_keywords, + fetch_since=cfg.fetch_since) + + +@pages_bp.route("/impressum") +def impressum(): + return render_template("impressum.html") + + +@pages_bp.route("/datenschutz") +def datenschutz(): + return render_template("datenschutz.html") + + +@pages_bp.route("/search") +def search(): + q = request.args.get("q", "").strip() + results = global_search(db(), q) if q else {"drafts": [], "ideas": [], "authors": [], "gaps": []} + total = sum(len(v) for v in results.values()) + return render_template("search_results.html", query=q, results=results, total=total) + + +@pages_bp.route("/ask") +def ask_page(): + question = request.args.get("q", "") + result = None + if question: + top_k = request.args.get("top", 5, type=int) + result = get_ask_search(db(), question, top_k=top_k) + return render_template("ask.html", question=question, result=result) diff --git a/src/webui/data.py b/src/webui/data.py deleted file mode 100644 index 6a55eea..0000000 --- a/src/webui/data.py +++ /dev/null @@ -1,4254 +0,0 @@ -"""Data access layer for the web dashboard. - -Thin wrapper around ietf_analyzer.db.Database that returns plain dicts -ready for JSON serialization or Jinja2 template rendering. 
-""" - -from __future__ import annotations - -import json -import re -import sys -import time -from collections import Counter, defaultdict -from functools import lru_cache -from pathlib import Path -from typing import TypedDict - -import numpy as np -from sklearn.cluster import AgglomerativeClustering -from sklearn.manifold import TSNE -from sklearn.preprocessing import normalize as sk_normalize - - -# --------------------------------------------------------------------------- -# TypedDicts for common return shapes -# --------------------------------------------------------------------------- - -class OverviewStats(TypedDict): - """High-level dashboard statistics from :func:`get_overview_stats`.""" - total_drafts: int - rated_count: int - author_count: int - idea_count: int - gap_count: int - input_tokens: int - output_tokens: int - false_positive_count: int - - -class DraftListItem(TypedDict): - """Single draft in the paginated listing from :func:`get_drafts_page`.""" - name: str - title: str - date: str | None - url: str - pages: int - group: str - source: str - score: float - novelty: float - maturity: float - overlap: float - momentum: float - relevance: float - categories: list[str] - summary: str - readiness: float - - -class DraftsPage(TypedDict): - """Paginated draft listing from :func:`get_drafts_page`.""" - drafts: list[DraftListItem] - total: int - page: int - per_page: int - pages: int - - -class AuthorInfo(TypedDict): - """Author entry from :func:`get_top_authors`.""" - name: str - affiliation: str - draft_count: int - drafts: list[str] - - -class AuthorNetworkNode(TypedDict): - """Node in the author network graph.""" - id: str - name: str - org: str - draft_count: int - avg_score: float - drafts: list[str] - - -class AuthorNetworkEdge(TypedDict): - """Edge in the author network graph.""" - source: str - target: str - weight: int - - -class AuthorCluster(TypedDict): - """Cluster in the author network.""" - id: int - members: list[str] - org_mix: 
dict[str, int] - size: int - drafts: list[dict[str, str]] - draft_count: int - - -class AuthorNetwork(TypedDict): - """Full author network from :func:`get_author_network_full`.""" - nodes: list[AuthorNetworkNode] - edges: list[AuthorNetworkEdge] - clusters: list[AuthorCluster] - - -class SimilarityGraphStats(TypedDict): - """Stats sub-dict in similarity graph.""" - node_count: int - edge_count: int - avg_similarity: float - - -class SimilarityGraph(TypedDict): - """Draft similarity network from :func:`get_similarity_graph`.""" - nodes: list[dict] - edges: list[dict] - stats: SimilarityGraphStats - - -class TimelineData(TypedDict): - """Monthly category counts from :func:`get_timeline_data`.""" - months: list[str] - series: dict[str, list[int]] - categories: list[str] - - -class MonitorCost(TypedDict): - """Cost sub-dict in monitor status.""" - input_tokens: int - output_tokens: int - estimated_usd: float - - -class MonitorPipeline(TypedDict): - """Pipeline sub-dict in monitor status.""" - total_drafts: int - rated: int - embedded: int - with_ideas: int - idea_total: int - gap_count: int - - -class MonitorStatus(TypedDict): - """Monitor status from :func:`get_monitor_status`.""" - last_run: dict | None - runs: list[dict] - unprocessed: dict[str, int] - total_runs: int - pipeline: MonitorPipeline - cost: MonitorCost - - -class SearchResults(TypedDict): - """Global search results from :func:`global_search`.""" - drafts: list[dict] - ideas: list[dict] - authors: list[dict] - gaps: list[dict] - - -class CitationGraphStats(TypedDict): - """Stats sub-dict in citation graph.""" - node_count: int - edge_count: int - rfc_count: int - draft_count: int - - -class CitationGraph(TypedDict): - """Citation network from :func:`get_citation_graph`.""" - nodes: list[dict] - edges: list[dict] - stats: CitationGraphStats - -# Add project root to path so we can import ietf_analyzer -_project_root = Path(__file__).resolve().parent.parent.parent -if str(_project_root) not in sys.path: - 
sys.path.insert(0, str(_project_root / "src")) - -from ietf_analyzer.config import Config -from ietf_analyzer.db import Database -from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch -from ietf_analyzer.search import HybridSearch - -def _extract_month(time_str: str | None) -> str: - """Normalize a date string to YYYY-MM format.""" - if not time_str: - return "unknown" - if len(time_str) >= 7 and time_str[4] == '-': - return time_str[:7] # Already YYYY-MM-DD - if len(time_str) >= 6 and time_str[:4].isdigit(): - return time_str[:4] + '-' + time_str[4:6] # YYYYMMDD → YYYY-MM - return time_str[:7] - - -# Simple TTL cache for expensive computations (t-SNE, clustering, similarity) -_cache: dict[str, tuple[float, object]] = {} -_CACHE_TTL = 300 # 5 minutes - - -def _cached(key: str, fn, ttl: float = _CACHE_TTL): - """Return cached result or compute and cache it.""" - now = time.monotonic() - if key in _cache: - ts, val = _cache[key] - if now - ts < ttl: - return val - val = fn() - _cache[key] = (now, val) - return val - - -def get_db() -> Database: - """Get a Database instance using default config.""" - config = Config.load() - return Database(config) - - -def get_overview_stats(db: Database) -> OverviewStats: - """Return high-level stats for the dashboard home page. - - Excludes drafts flagged as false positives from rated counts. 
- """ - total_drafts = db.count_drafts(include_false_positives=False) - rated_pairs = db.drafts_with_ratings(limit=1000) # already excludes FPs - rated_count = len(rated_pairs) - author_count = db.author_count() - idea_count = db.idea_count() - gaps = db.all_gaps() - input_tok, output_tok = db.total_tokens_used() - - # Count false positives separately for transparency - total_all = db.count_drafts(include_false_positives=True) - false_positive_count = total_all - total_drafts - - return { - "total_drafts": total_drafts, - "rated_count": rated_count, - "author_count": author_count, - "idea_count": idea_count, - "gap_count": len(gaps), - "input_tokens": input_tok, - "output_tokens": output_tok, - "false_positive_count": false_positive_count, - } - - -def get_category_counts(db: Database) -> dict[str, int]: - """Return {category: draft_count} for all categories.""" - return db.category_counts() - - -def get_category_summary(db: Database, category: str) -> dict | None: - """Build a data-driven summary for a category. 
Returns None if category not found.""" - pairs = db.drafts_with_ratings(limit=2000) - all_authors = db.top_authors(limit=500) - - # Filter to drafts in this category - cat_pairs = [(d, r) for d, r in pairs if category in r.categories] - if not cat_pairs: - return None - - # Author lookup: draft_name -> [author names] - author_drafts_map: dict[str, list[str]] = defaultdict(list) - for name, aff, cnt, drafts in all_authors: - for dn in drafts: - author_drafts_map[dn].append(name) - - # Dimension averages - n = len(cat_pairs) - avg = lambda vals: round(sum(vals) / len(vals), 1) if vals else 0 - novelty_vals = [r.novelty for _, r in cat_pairs] - maturity_vals = [r.maturity for _, r in cat_pairs] - overlap_vals = [r.overlap for _, r in cat_pairs] - momentum_vals = [r.momentum for _, r in cat_pairs] - relevance_vals = [r.relevance for _, r in cat_pairs] - scores = [r.composite_score for _, r in cat_pairs] - - # Top drafts - sorted_pairs = sorted(cat_pairs, key=lambda p: p[1].composite_score, reverse=True) - top_3 = [(d.name, d.title, round(r.composite_score, 1)) for d, r in sorted_pairs[:3]] - - # Top authors in this category - author_counter: Counter = Counter() - org_counter: Counter = Counter() - author_aff: dict[str, str] = {} - for name, aff, cnt, drafts in all_authors: - author_aff[name] = aff or "" - for d, r in cat_pairs: - for a in author_drafts_map.get(d.name, []): - author_counter[a] += 1 - if author_aff.get(a): - org_counter[author_aff[a]] += 1 - top_authors = author_counter.most_common(5) - top_orgs = org_counter.most_common(5) - - # Strongest and weakest dimensions - dim_avgs = { - "Novelty": avg(novelty_vals), - "Maturity": avg(maturity_vals), - "Overlap": avg(overlap_vals), - "Momentum": avg(momentum_vals), - "Relevance": avg(relevance_vals), - } - strongest = max(dim_avgs, key=dim_avgs.get) - weakest = min(dim_avgs, key=dim_avgs.get) - - # Activity trend: how many are recent (last 6 months)? 
- recent = sum(1 for d, _ in cat_pairs if d.time and d.time >= "2025-09") - total_all = len(pairs) - - # Build text summary - lines = [] - lines.append(f"**{n} drafts** ({n * 100 // total_all}% of all rated drafts) " - f"with an average composite score of **{avg(scores):.1f}/5.0**.") - - # Dimension profile - lines.append(f"Strongest dimension: **{strongest}** ({dim_avgs[strongest]}), " - f"weakest: **{weakest}** ({dim_avgs[weakest]}).") - - # Maturity vs novelty insight - if dim_avgs["Maturity"] < 2.5 and dim_avgs["Novelty"] >= 3.0: - lines.append("This category has **high novelty but low maturity** — many early-stage proposals with fresh ideas that haven't been fully developed yet.") - elif dim_avgs["Maturity"] >= 3.0 and dim_avgs["Novelty"] < 2.5: - lines.append("This category is **mature but less novel** — established approaches being refined rather than introducing fundamentally new concepts.") - elif dim_avgs["Maturity"] >= 3.0 and dim_avgs["Novelty"] >= 3.0: - lines.append("This category shows **both high novelty and maturity** — well-developed proposals with genuinely new contributions.") - - # Overlap insight - if dim_avgs["Overlap"] >= 3.5: - lines.append(f"High overlap ({dim_avgs['Overlap']}) suggests **significant duplication** — multiple drafts cover similar ground, which may indicate convergence or fragmentation.") - elif dim_avgs["Overlap"] <= 2.0: - lines.append(f"Low overlap ({dim_avgs['Overlap']}) indicates **diverse approaches** — drafts in this category tackle distinct problems with little redundancy.") - - # Activity - if recent > 0: - lines.append(f"**{recent} draft{'s' if recent != 1 else ''}** submitted in the last 6 months, " - f"suggesting {'active' if recent >= 3 else 'moderate'} development.") - - return { - "text": " ".join(lines), - "count": n, - "avg_score": avg(scores), - "dimensions": dim_avgs, - "top_drafts": top_3, - "top_authors": top_authors, - "top_orgs": top_orgs, - "strongest": strongest, - "weakest": weakest, - } - - -def 
get_drafts_page( - db: Database, - page: int = 1, - per_page: int = 50, - search: str = "", - category: str = "", - min_score: float = 0.0, - sort: str = "score", - sort_dir: str = "desc", - source: str = "", -) -> DraftsPage: - """Return a paginated, filtered list of drafts with ratings. - - Returns dict with keys: drafts, total, page, per_page, pages. - """ - pairs = db.drafts_with_ratings(limit=1000) - - # Build author lookup for search (draft_name -> "author1 author2 ...") - author_text_by_draft: dict[str, str] = {} - if search: - rows = db.conn.execute( - """SELECT da.draft_name, GROUP_CONCAT(a.name, ' ') as names - FROM draft_authors da JOIN authors a ON da.person_id = a.person_id - GROUP BY da.draft_name""" - ).fetchall() - for r in rows: - author_text_by_draft[r[0]] = r[1] or "" - - # Filter - filtered = [] - for draft, rating in pairs: - if min_score > 0 and rating.composite_score < min_score: - continue - if category and category not in rating.categories: - continue - if source and draft.source != source: - continue - if search: - author_names = author_text_by_draft.get(draft.name, "") - haystack = f"{draft.name} {draft.title} {rating.summary} {author_names}".lower() - if not all(w in haystack for w in search.lower().split()): - continue - filtered.append((draft, rating)) - - # Sort - sort_keys = { - "score": lambda p: p[1].composite_score, - "name": lambda p: p[0].name, - "date": lambda p: p[0].time or "", - "novelty": lambda p: p[1].novelty, - "maturity": lambda p: p[1].maturity, - "relevance": lambda p: p[1].relevance, - "overlap": lambda p: p[1].overlap, - "momentum": lambda p: p[1].momentum, - "readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 + - min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 + - ((p[1].momentum - 1) / 4.0) * 0.15, - } - key_fn = sort_keys.get(sort, sort_keys["score"]) - reverse = sort_dir == "desc" - filtered.sort(key=key_fn, reverse=reverse) - - total = len(filtered) - pages = max(1, (total + per_page 
- 1) // per_page) - page = max(1, min(page, pages)) - start = (page - 1) * per_page - page_items = filtered[start : start + per_page] - - # Pre-compute readiness in batch (~6 queries total instead of ~200) - - readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items]) - - drafts = [] - for draft, rating in page_items: - r_score = readiness_cache.get(draft.name, {}).get("score", 0) - drafts.append({ - "name": draft.name, - "title": draft.title, - "date": draft.date, - "url": draft.source_url if draft.source != "ietf" else draft.datatracker_url, - "pages": draft.pages or 0, - "group": draft.group or "individual", - "source": draft.source or "ietf", - "score": round(rating.composite_score, 2), - "novelty": rating.novelty, - "maturity": rating.maturity, - "overlap": rating.overlap, - "momentum": rating.momentum, - "relevance": rating.relevance, - "categories": rating.categories, - "summary": rating.summary, - "readiness": r_score, - }) - - return { - "drafts": drafts, - "total": total, - "page": page, - "per_page": per_page, - "pages": pages, - } - - -def get_draft_detail(db: Database, name: str) -> dict | None: - """Return full detail for a single draft.""" - draft = db.get_draft(name) - if not draft: - return None - - rating = db.get_rating(name) - authors = db.get_authors_for_draft(name) - ideas = db.get_ideas_for_draft(name) - refs = db.get_refs_for_draft(name) - - result = { - "name": draft.name, - "title": draft.title, - "rev": draft.rev, - "abstract": draft.abstract, - "date": draft.date, - "time": draft.time, - "url": draft.datatracker_url, - "text_url": draft.text_url, - "pages": draft.pages, - "words": draft.words, - "group": draft.group or "individual", - "categories": draft.categories, - "tags": draft.tags, - "authors": [ - {"name": a.name, "affiliation": a.affiliation, "person_id": a.person_id} - for a in authors - ], - "ideas": ideas, - "refs": [{"type": t, "id": rid} for t, rid in refs], - } - - if rating: - result["rating"] = { - 
"score": round(rating.composite_score, 2), - "novelty": rating.novelty, - "maturity": rating.maturity, - "overlap": rating.overlap, - "momentum": rating.momentum, - "relevance": rating.relevance, - "summary": rating.summary, - "novelty_note": rating.novelty_note, - "maturity_note": rating.maturity_note, - "overlap_note": rating.overlap_note, - "momentum_note": rating.momentum_note, - "relevance_note": rating.relevance_note, - "categories": rating.categories, - } - - # Readiness score - - result["readiness"] = compute_readiness(db, name) - - # Annotation - annotation = db.get_annotation(name) - result["annotation"] = annotation - - return result - - -def get_rating_distributions(db: Database) -> dict: - """Return arrays for each rating dimension, suitable for Plotly.""" - pairs = db.drafts_with_ratings(limit=1000) - dims = { - "novelty": [], - "maturity": [], - "overlap": [], - "momentum": [], - "relevance": [], - "scores": [], - "categories": [], - "names": [], - "sources": [], - } - for draft, rating in pairs: - dims["novelty"].append(rating.novelty) - dims["maturity"].append(rating.maturity) - dims["overlap"].append(rating.overlap) - dims["momentum"].append(rating.momentum) - dims["relevance"].append(rating.relevance) - dims["scores"].append(round(rating.composite_score, 2)) - dims["categories"].append(rating.categories[0] if rating.categories else "Other") - dims["names"].append(draft.name) - dims["sources"].append(getattr(draft, "source", "ietf") or "ietf") - return dims - - -def get_timeline_data(db: Database) -> TimelineData: - """Return monthly counts by category for timeline chart.""" - pairs = db.drafts_with_ratings(limit=1000) - all_drafts = db.list_drafts(limit=1000, order_by="time ASC") - rating_map = {d.name: r for d, r in pairs} - - month_cat: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) - for d in all_drafts: - month = _extract_month(d.time) - r = rating_map.get(d.name) - if r: - cat = r.categories[0] if r.categories else "Other" 
- month_cat[month][cat] += 1 - - months = sorted(month_cat.keys()) - cat_totals: Counter = Counter() - for mc in month_cat.values(): - for c, cnt in mc.items(): - cat_totals[c] += cnt - top_cats = [c for c, _ in cat_totals.most_common(10)] - - series = {} - for cat in top_cats: - series[cat] = [month_cat[m].get(cat, 0) for m in months] - - return {"months": months, "series": series, "categories": top_cats} - - -def get_ideas_by_type(db: Database) -> dict: - """Return ideas grouped by type with counts.""" - all_ideas = db.all_ideas() - type_counts = Counter(i.get("type", "other") or "other" for i in all_ideas) - return { - "total": len(all_ideas), - "by_type": dict(type_counts.most_common()), - "ideas": all_ideas, - } - - -def get_all_gaps(db: Database) -> list[dict]: - """Return all gap analysis results, sorted by severity (critical first).""" - _sev_order = {"critical": 0, "high": 1, "medium": 2, "low": 3} - gaps = db.all_gaps() - gaps.sort(key=lambda g: _sev_order.get(g.get("severity", "low"), 99)) - return gaps - - -def get_gap_detail(db: Database, gap_id: int) -> dict | None: - """Return a single gap by ID, or None if not found.""" - gaps = db.all_gaps() - for g in gaps: - if g["id"] == gap_id: - return g - return None - - -def get_generated_drafts() -> list[dict]: - """Return list of pre-generated draft files in data/reports/generated-drafts/.""" - drafts_dir = _project_root / "data" / "reports" / "generated-drafts" - if not drafts_dir.exists(): - return [] - results = [] - for f in sorted(drafts_dir.glob("draft-*.txt")): - # Extract title from first non-empty content line after header - title = f.stem - text = f.read_text(errors="replace") - for line in text.splitlines(): - stripped = line.strip() - if stripped and not stripped.startswith("Internet-Draft") and \ - not stripped.startswith("Intended status") and \ - not stripped.startswith("Expires:") and stripped != "": - title = stripped - break - results.append({ - "filename": f.name, - "stem": f.stem, - 
"title": title, - "size": f.stat().st_size, - "path": str(f), - }) - return results - - -def read_generated_draft(filename: str) -> str | None: - """Read a generated draft file by filename. Returns text or None.""" - drafts_dir = _project_root / "data" / "reports" / "generated-drafts" - path = drafts_dir / filename - if not path.exists() or not path.is_file(): - return None - # Safety: ensure we're not reading outside the directory - if not str(path.resolve()).startswith(str(drafts_dir.resolve())): - return None - return path.read_text(errors="replace") - - -def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]: - """Return top authors by draft count.""" - rows = db.top_authors(limit=limit) - return [ - {"name": name, "affiliation": aff, "draft_count": cnt, "drafts": drafts} - for name, aff, cnt, drafts in rows - ] - - -def get_org_data(db: Database, limit: int = 20) -> list[dict]: - """Return organization contribution data.""" - rows = db.top_orgs(limit=limit) - return [ - {"org": org, "author_count": authors, "draft_count": drafts} - for org, authors, drafts in rows - ] - - -def get_category_radar_data(db: Database) -> dict: - """Return average rating profiles per category for radar chart.""" - pairs = db.drafts_with_ratings(limit=1000) - cat_ratings: dict[str, list] = defaultdict(list) - for _, r in pairs: - for c in r.categories: - cat_ratings[c].append(r) - - top_cats = sorted(cat_ratings.keys(), key=lambda c: len(cat_ratings[c]), reverse=True)[:8] - result = {} - for cat in top_cats: - ratings = cat_ratings[cat] - n = len(ratings) - result[cat] = { - "count": n, - "novelty": round(sum(r.novelty for r in ratings) / n, 2), - "maturity": round(sum(r.maturity for r in ratings) / n, 2), - "relevance": round(sum(r.relevance for r in ratings) / n, 2), - "momentum": round(sum(r.momentum for r in ratings) / n, 2), - "low_overlap": round(sum(6 - r.overlap for r in ratings) / n, 2), - } - return result - - -def get_score_histogram(db: Database) -> 
list[float]: - """Return list of composite scores for histogram.""" - pairs = db.drafts_with_ratings(limit=1000) - return [round(r.composite_score, 2) for _, r in pairs] - - -def get_coauthor_network(db: Database, min_shared: int = 1) -> dict: - """Return co-authorship network data for force-directed graph. - - Returns {nodes: [{id, name, org, draft_count}], edges: [{source, target, weight}]} - """ - pairs = db.coauthor_pairs() - top = db.top_authors(limit=100) - - # Build node set from authors who have co-authorships - author_info = {name: {"org": aff, "draft_count": cnt} for name, aff, cnt, _ in top} - node_set = set() - edges = [] - for a, b, shared in pairs: - if shared >= min_shared: - node_set.add(a) - node_set.add(b) - edges.append({"source": a, "target": b, "weight": shared}) - - nodes = [] - for name in node_set: - info = author_info.get(name, {"org": "", "draft_count": 1}) - nodes.append({ - "id": name, - "name": name, - "org": info["org"], - "draft_count": info["draft_count"], - }) - - return {"nodes": nodes, "edges": edges} - - -def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: - """Return draft similarity network (cached).""" - return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold)) - - -def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: - """Return draft similarity network for force-directed graph. 
- - Returns {nodes: [{name, title, category, score}], - edges: [{source, target, similarity}], - stats: {node_count, edge_count, avg_similarity}} - """ - - - embeddings = db.all_embeddings() - if len(embeddings) < 2: - return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} - - pairs = db.drafts_with_ratings(limit=1000) - rating_map = {d.name: r for d, r in pairs} - draft_map = {d.name: d for d, _ in pairs} - - # Filter to drafts with both embeddings and ratings - names = [n for n in embeddings if n in rating_map] - if len(names) < 2: - return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} - - matrix = np.array([embeddings[n] for n in names]) - - # L2-normalize and compute cosine similarity - norms = np.linalg.norm(matrix, axis=1, keepdims=True) - norms[norms == 0] = 1.0 - normalized = matrix / norms - sim_matrix = normalized @ normalized.T - - # Find pairs above threshold (upper triangle only) - edges = [] - node_set = set() - for i in range(len(names)): - for j in range(i + 1, len(names)): - sim = float(sim_matrix[i, j]) - if sim >= threshold: - edges.append({"source": names[i], "target": names[j], "similarity": round(sim, 4)}) - node_set.add(names[i]) - node_set.add(names[j]) - - # Build nodes from connected drafts only - nodes = [] - for name in names: - if name not in node_set: - continue - r = rating_map[name] - d = draft_map.get(name) - nodes.append({ - "name": name, - "title": d.title if d else name, - "category": r.categories[0] if r.categories else "Other", - "score": round(r.composite_score, 2), - }) - - avg_sim = round(sum(e["similarity"] for e in edges) / max(len(edges), 1), 4) - - return { - "nodes": nodes, - "edges": edges, - "stats": {"node_count": len(nodes), "edge_count": len(edges), "avg_similarity": avg_sim}, - } - - -def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]: - """Return cross-org collaboration pairs.""" - rows = 
db.cross_org_collaborations(limit=limit) - return [ - {"org_a": a, "org_b": b, "shared_drafts": cnt} - for a, b, cnt in rows - ] - - -def get_author_network_full(db: Database) -> AuthorNetwork: - """Return author network (cached for 5 min).""" - return _cached("author_network", lambda: _compute_author_network_full(db)) - - -def _compute_author_network_full(db: Database) -> AuthorNetwork: - """Return enriched co-authorship network with avg scores and cluster info. - - Returns { - nodes: [{id, name, org, draft_count, avg_score, drafts: [name,...]}], - edges: [{source, target, weight}], - clusters: [{id, members: [name,...], org_mix: {org: count}, size}], - } - """ - pairs = db.coauthor_pairs() - top = db.top_authors(limit=500) - - # Build rating lookup for avg scores - rated = db.drafts_with_ratings(limit=2000) - draft_score = {d.name: r.composite_score for d, r in rated} - - # Author info map - author_info = {} - for name, aff, cnt, drafts in top: - scores = [draft_score[dn] for dn in drafts if dn in draft_score] - avg = round(sum(scores) / len(scores), 2) if scores else 0 - author_info[name] = { - "org": aff, "draft_count": cnt, "drafts": drafts, "avg_score": avg - } - - # Build node set: authors with meaningful collaboration (2+ shared drafts) - node_set = set() - edges = [] - for a, b, shared in pairs: - if shared >= 2: - node_set.add(a) - node_set.add(b) - edges.append({"source": a, "target": b, "weight": shared}) - - # Also include authors with 3+ drafts even if no co-authorships - for name, info in author_info.items(): - if info["draft_count"] >= 3: - node_set.add(name) - - nodes = [] - for name in node_set: - info = author_info.get(name, {"org": "", "draft_count": 1, "drafts": [], "avg_score": 0}) - nodes.append({ - "id": name, - "name": name, - "org": info["org"], - "draft_count": info["draft_count"], - "avg_score": info["avg_score"], - "drafts": info["drafts"][:8], # cap for JSON size - }) - - # Cluster detection via connected components (BFS) - adjacency: 
dict[str, set[str]] = defaultdict(set) - for e in edges: - adjacency[e["source"]].add(e["target"]) - adjacency[e["target"]].add(e["source"]) - - visited: set[str] = set() - clusters = [] - - # Batch-load all drafts referenced by authors (avoid N+1 in cluster loop) - _all_dn = set() - for _ai in author_info.values(): - _all_dn.update(_ai.get("drafts", [])) - _all_drafts_map = db.get_drafts_by_names(list(_all_dn)) - - for node in sorted(node_set): - if node in visited: - continue - component: list[str] = [] - queue = [node] - while queue: - current = queue.pop(0) - if current in visited: - continue - visited.add(current) - component.append(current) - for neighbor in adjacency.get(current, []): - if neighbor not in visited: - queue.append(neighbor) - - if len(component) >= 2: - org_mix: dict[str, int] = Counter() - member_orgs: dict[str, str] = {} - cluster_drafts: dict[str, str] = {} # name -> title - for m in component: - org = author_info.get(m, {}).get("org", "") - if org: - org_mix[org] += 1 - member_orgs[m] = org - for dn in author_info.get(m, {}).get("drafts", []): - if dn not in cluster_drafts: - d = _all_drafts_map.get(dn) - cluster_drafts[dn] = d.title[:80] if d else dn - clusters.append({ - "id": len(clusters), - "members": component, - "member_orgs": member_orgs, - "org_mix": dict(org_mix.most_common()), - "size": len(component), - "drafts": [{"name": n, "title": t} for n, t in list(cluster_drafts.items())], - "draft_count": len(cluster_drafts), - }) - - clusters.sort(key=lambda c: c["size"], reverse=True) - - # Generate meaningful names for clusters - for cl in clusters: - cl["name"] = _author_cluster_name(cl) - - return {"nodes": nodes, "edges": edges, "clusters": clusters} - - -def _normalize_org(name: str) -> str: - """Shorten verbose org names for display.""" - # Remove common suffixes - for suffix in (", Inc.", " Inc.", ", Ltd.", " Ltd.", " Co.", " Technologies", - " Corporation", " Corp.", " Limited", " GmbH", " AG", - " Europe Ltd", " Research", " 
Systems"): - name = name.replace(suffix, "") - return name.strip().rstrip(",").rstrip("&").rstrip() - - -def _author_cluster_name(cluster: dict) -> str: - """Derive a meaningful name for an author cluster from orgs and draft titles.""" - # Org part: top 1-2 orgs, normalized - raw_orgs = list(cluster.get("org_mix", {}).keys()) - orgs = [] - seen_short: set[str] = set() - for o in raw_orgs: - short = _normalize_org(o) - if short.lower() not in seen_short: - seen_short.add(short.lower()) - orgs.append(short) - if len(orgs) >= 2: - org_label = f"{orgs[0]} + {orgs[1]}" - elif orgs: - org_label = orgs[0] - else: - # Fall back to first member's last name - members = cluster.get("members", []) - org_label = members[0].split()[-1] if members else "Unknown" - - # Topic part: extract common keywords from draft titles - stopwords = { - "a", "an", "the", "of", "for", "in", "to", "and", "on", "with", - "using", "based", "draft", "internet", "ietf", "protocol", "framework", - "requirements", "architecture", "considerations", "use", "cases", "via", - "towards", "over", "from", "into", "between", "specification", "extension", - "extensions", "mechanisms", "mechanism", "version", "new", "general", - } - word_counts: Counter = Counter() - for d in cluster.get("drafts", []): - title = d.get("title", "") - words = re.findall(r"[A-Za-z]{3,}", title) - for w in words: - wl = w.lower() - if wl not in stopwords: - word_counts[wl] += 1 - - # Pick top keyword(s) that appear in multiple drafts - top_words = [w for w, c in word_counts.most_common(3) if c >= 2] - if not top_words: - top_words = [w for w, _ in word_counts.most_common(1)] - - if top_words: - topic = " ".join(w.capitalize() for w in top_words[:2]) - name = f"{org_label} — {topic}" - else: - name = org_label - # Truncate if too long for display - return name if len(name) <= 50 else name[:47] + "…" - - -def get_idea_clusters(db: Database) -> dict: - """Cluster ideas (cached for 5 min).""" - return _cached("idea_clusters", lambda: 
_compute_idea_clusters(db)) - - -def _compute_idea_clusters(db: Database) -> dict: - """Cluster ideas by embedding similarity, return clusters + t-SNE scatter. - - Uses Ward linkage on L2-normalized embeddings (approximates cosine) with - a target of ~30 clusters for readable groupings. Enriches each cluster - with WG info and category breakdown. - """ - - - embeddings = db.all_idea_embeddings() - if not embeddings: - return {"clusters": [], "scatter": [], "stats": {"total": 0, "clustered": 0, "num_clusters": 0}, "empty": True} - - # Exclude ideas from false-positive drafts - fp_names = db.false_positive_names() - - # Fetch ideas with IDs for metadata lookup - rows = db.conn.execute("SELECT id, title, description, idea_type, draft_name FROM ideas").fetchall() - idea_map = {r["id"]: {"title": r["title"], "description": r["description"], - "type": r["idea_type"], "draft_name": r["draft_name"]} - for r in rows if r["draft_name"] not in fp_names} - - # Remove FP ideas from embeddings too - embeddings = {k: v for k, v in embeddings.items() if k in idea_map} - - # Draft -> WG and category lookup - draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall() - draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows} - draft_title_map = {r["name"]: r["title"] for r in draft_rows} - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings WHERE COALESCE(false_positive, 0) = 0").fetchall() - draft_cats: dict[str, list[str]] = {} - for r in rating_rows: - try: - draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else [] - except (json.JSONDecodeError, TypeError): - draft_cats[r["draft_name"]] = [] - - # Build matrix from embeddings that have matching ideas - idea_ids = [iid for iid in embeddings if iid in idea_map] - if len(idea_ids) < 5: - return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} - - matrix = np.array([embeddings[iid] for 
iid in idea_ids]) - matrix_norm = sk_normalize(matrix) - - # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size - n_target = max(10, min(40, len(idea_ids) // 12)) - try: - clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward') - labels = clustering.fit_predict(matrix_norm) - except Exception: - return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} - - # Build cluster data - cluster_ideas_map: dict[int, list] = defaultdict(list) - for idx, iid in enumerate(idea_ids): - cluster_ideas_map[labels[idx]].append(iid) - - stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", - "on", "by", "is", "as", "at", "from", "that", "this", "it", - "based", "using", "protocol", "mechanism", "framework", "system", - "network", "agent", "agents"} - clusters = [] - for cid in sorted(cluster_ideas_map.keys()): - members = cluster_ideas_map[cid] - ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map] - if len(ideas_in_cluster) < 2: - continue - - # Theme: most common significant words in titles - words = Counter() - for idea in ideas_in_cluster: - for w in idea["title"].lower().split(): - w_clean = w.strip("()[].,;:-\"'") - if len(w_clean) > 2 and w_clean not in stop: - words[w_clean] += 1 - top_words = [w for w, _ in words.most_common(4)] - theme = " ".join(top_words).title() if top_words else f"Cluster {cid}" - - drafts = list({idea["draft_name"] for idea in ideas_in_cluster}) - - # Enrich: WG breakdown - wg_counts: dict[str, int] = Counter() - cat_counts: dict[str, int] = Counter() - for dname in drafts: - wg = draft_wg.get(dname, "none") - wg_counts[wg] += 1 - for cat in draft_cats.get(dname, []): - cat_counts[cat] += 1 - - wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)] - cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)] - cross_wg = len([w for w in wg_counts if w != 
"none"]) >= 2 - - clusters.append({ - "id": len(clusters), - "theme": theme, - "size": len(ideas_in_cluster), - "ideas": ideas_in_cluster[:20], - "drafts": drafts, - "wgs": wg_list, - "categories": cat_list, - "cross_wg": cross_wg, - "wg_count": len(wg_counts), - }) - - clusters.sort(key=lambda c: c["size"], reverse=True) - - # Build mapping: original cluster label -> sorted index - # Each cluster remembers which original label it came from via its member ids - old_label_to_new: dict[int, int] = {} - for new_idx, c in enumerate(clusters): - c["id"] = new_idx - # Find original label for any member of this cluster - for old_cid, members in cluster_ideas_map.items(): - if members and members[0] in [iid for iid in members if iid in idea_map]: - member_titles = {idea_map[m]["title"] for m in members if m in idea_map} - c_titles = {idea["title"] for idea in c["ideas"]} - if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]): - old_label_to_new[old_cid] = new_idx - break - - # Fallback: build from idea_id -> label mapping - iid_to_new: dict[int, int] = {} - for old_cid, members in cluster_ideas_map.items(): - new_idx = old_label_to_new.get(old_cid, old_cid) - for iid in members: - iid_to_new[iid] = new_idx - - # t-SNE for scatter - scatter = [] - try: - perp = min(30, len(idea_ids) - 1) - tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix_norm) - - for idx, iid in enumerate(idea_ids): - info = idea_map.get(iid, {}) - scatter.append({ - "x": round(float(coords[idx, 0]), 3), - "y": round(float(coords[idx, 1]), 3), - "cluster_id": iid_to_new.get(iid, int(labels[idx])), - "title": info.get("title", ""), - "draft_name": info.get("draft_name", ""), - "wg": draft_wg.get(info.get("draft_name", ""), ""), - }) - except Exception: - pass - - # --- Cross-cluster links --- - # Find pairs of clusters whose ideas are semantically related - # Use centroid similarity + best idea-pair links - 
links = [] - if len(clusters) >= 2: - # Build cluster centroids from normalized embeddings - cluster_centroids = {} - cluster_member_indices: dict[int, list[int]] = defaultdict(list) - for idx, iid in enumerate(idea_ids): - cid = iid_to_new.get(iid, int(labels[idx])) - cluster_member_indices[cid].append(idx) - - for cid, indices in cluster_member_indices.items(): - if indices: - centroid = matrix_norm[indices].mean(axis=0) - norm = np.linalg.norm(centroid) - if norm > 0: - cluster_centroids[cid] = centroid / norm - - # Compute pairwise centroid similarity for all cluster pairs - cids_sorted = sorted(cluster_centroids.keys()) - for ci_idx, ci in enumerate(cids_sorted): - for cj in cids_sorted[ci_idx + 1:]: - sim = float(np.dot(cluster_centroids[ci], cluster_centroids[cj])) - if sim < 0.45: - continue - - # Find the best idea pair across these two clusters - best_sim = 0.0 - best_pair = (None, None) - # Sample up to 20 ideas per cluster to keep it fast - ci_members = cluster_member_indices[ci][:20] - cj_members = cluster_member_indices[cj][:20] - for mi in ci_members: - for mj in cj_members: - pair_sim = float(np.dot(matrix_norm[mi], matrix_norm[mj])) - if pair_sim > best_sim: - best_sim = pair_sim - best_pair = (idea_ids[mi], idea_ids[mj]) - - if best_sim < 0.5: - continue - - # Get theme names - ci_theme = next((c["theme"] for c in clusters if c["id"] == ci), f"Cluster {ci}") - cj_theme = next((c["theme"] for c in clusters if c["id"] == cj), f"Cluster {cj}") - - idea_a = idea_map.get(best_pair[0], {}) - idea_b = idea_map.get(best_pair[1], {}) - - links.append({ - "source": ci, - "target": cj, - "source_theme": ci_theme, - "target_theme": cj_theme, - "similarity": round(sim, 3), - "best_pair_sim": round(best_sim, 3), - "idea_a": idea_a.get("title", ""), - "idea_a_draft": idea_a.get("draft_name", ""), - "idea_b": idea_b.get("title", ""), - "idea_b_draft": idea_b.get("draft_name", ""), - }) - - links.sort(key=lambda l: l["best_pair_sim"], reverse=True) - links = 
links[:50] # cap at top 50 links - - total = len(idea_ids) - clustered = sum(c["size"] for c in clusters) - return { - "clusters": clusters, - "scatter": scatter, - "links": links, - "stats": {"total": total, "clustered": clustered, "num_clusters": len(clusters)}, - "empty": False, - } - - -def get_timeline_animation_data(db: Database) -> dict: - """Timeline animation (cached for 5 min).""" - return _cached("timeline_animation", lambda: _compute_timeline_animation_data(db)) - - -def _compute_timeline_animation_data(db: Database) -> dict: - """Compute t-SNE on all drafts, return points with month info + category_monthly. - - t-SNE is computed once on ALL drafts so coordinates are stable across - animation frames. Each point carries a ``month`` field (YYYY-MM) so the - front-end can build cumulative animation frames. - """ - - - embeddings = db.all_embeddings() - if len(embeddings) < 5: - return {"points": [], "months": [], "category_monthly": {}} - - pairs = db.drafts_with_ratings(limit=1000) - rating_map = {d.name: r for d, r in pairs} - draft_map = {d.name: d for d, _ in pairs} - - # Filter to drafts that have both embeddings and ratings - names = [n for n in embeddings if n in rating_map] - if len(names) < 5: - return {"points": [], "months": [], "category_monthly": {}} - - matrix = np.array([embeddings[n] for n in names]) - - try: - tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), - random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix) - except Exception: - return {"points": [], "months": [], "category_monthly": {}} - - # Build points with month - points = [] - month_set: set[str] = set() - category_monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) - - for i, name in enumerate(names): - r = rating_map[name] - d = draft_map.get(name) - month = _extract_month(d.time if d else None) - cat = r.categories[0] if r.categories else "Other" - month_set.add(month) - category_monthly[month][cat] += 1 - points.append({ 
- "name": name, - "title": d.title if d else name, - "x": round(float(coords[i, 0]), 3), - "y": round(float(coords[i, 1]), 3), - "category": cat, - "score": round(r.composite_score, 2), - "month": month, - }) - - months = sorted(month_set) - # Convert defaultdict to plain dict for JSON - cat_monthly_plain = {m: dict(cats) for m, cats in category_monthly.items()} - - return { - "points": points, - "months": months, - "category_monthly": cat_monthly_plain, - } - - -def get_monitor_status(db: Database) -> MonitorStatus: - """Return monitoring status data for dashboard.""" - runs = db.get_monitor_runs(limit=20) - last = runs[0] if runs else None - total_drafts = db.count_drafts() - rated_count = len(db.drafts_with_ratings(limit=10000)) - unrated = len(db.unrated_drafts(limit=9999)) - unembedded = len(db.drafts_without_embeddings(limit=9999)) - embedded_count = total_drafts - unembedded - no_ideas = len(db.drafts_without_ideas(limit=9999)) - ideas_count = total_drafts - no_ideas - idea_total = db.idea_count() - gap_count = len(db.all_gaps()) - input_tok, output_tok = db.total_tokens_used() - - # Estimate cost (Sonnet pricing: $3/M input, $15/M output) - est_cost = (input_tok * 3.0 / 1_000_000) + (output_tok * 15.0 / 1_000_000) - - return { - "last_run": last, - "runs": runs, - "unprocessed": {"unrated": unrated, "unembedded": unembedded, "no_ideas": no_ideas}, - "total_runs": len(runs), - "pipeline": { - "total_drafts": total_drafts, - "rated": rated_count, - "embedded": embedded_count, - "with_ideas": ideas_count, - "idea_total": idea_total, - "gap_count": gap_count, - }, - "cost": { - "input_tokens": input_tok, - "output_tokens": output_tok, - "estimated_usd": round(est_cost, 2), - }, - } - - -def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: - """Return citation graph (cached for 5 min).""" - return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs)) - - -def _compute_citation_graph(db: Database, min_refs: int 
= 2) -> CitationGraph: - """Return citation network data for force-directed graph. - - Returns {nodes: [{id, type, title, influence, ...}], - edges: [{source, target}], - stats: {node_count, edge_count, ...}} - """ - # Get all references - rows = db.conn.execute( - "SELECT draft_name, ref_type, ref_id FROM draft_refs" - ).fetchall() - - # Count in-degree for each referenced item - in_degree: dict[str, int] = Counter() - edges_raw = [] - for r in rows: - ref_key = f"{r['ref_type']}:{r['ref_id']}" - in_degree[ref_key] += 1 - edges_raw.append((r["draft_name"], ref_key)) - - # Also count drafts as source nodes - draft_out: dict[str, int] = Counter() - for draft_name, _ in edges_raw: - draft_out[draft_name] += 1 - - # Get draft titles for labeling - draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() - draft_titles = {r["name"]: r["title"] for r in draft_rows} - - # Get rating categories for draft coloring - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() - draft_cats = {} - for r in rating_rows: - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - draft_cats[r["draft_name"]] = cats[0] if cats else "Other" - except Exception: - draft_cats[r["draft_name"]] = "Other" - - # Filter: keep RFCs with min_refs+ references and all drafts that reference them - top_refs = {k: v for k, v in in_degree.items() if v >= min_refs} - - # Build node set - node_set = set() - filtered_edges = [] - for draft_name, ref_key in edges_raw: - if ref_key in top_refs: - node_set.add(draft_name) - node_set.add(ref_key) - filtered_edges.append({"source": draft_name, "target": ref_key}) - - # Limit to ~200 nodes max for readability - if len(node_set) > 250: - # Keep only refs with higher in-degree - sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True) - keep_refs = set(k for k, _ in sorted_refs[:80]) - node_set = set() - filtered_edges = [] - for draft_name, ref_key in edges_raw: - if ref_key in 
keep_refs: - node_set.add(draft_name) - node_set.add(ref_key) - filtered_edges.append({"source": draft_name, "target": ref_key}) - - # Build nodes - nodes = [] - for nid in node_set: - if ":" in nid and not nid.startswith("draft-"): - # It's a reference node (rfc:1234, bcp:14, etc.) - ref_type, ref_id = nid.split(":", 1) - influence = in_degree.get(nid, 0) - if ref_type == "rfc": - try: - title = f"RFC {int(ref_id)}" - except ValueError: - title = f"RFC {ref_id}" - else: - title = f"{ref_type.upper()} {ref_id}" - nodes.append({ - "id": nid, - "type": ref_type, - "title": title, - "influence": influence, - "ref_id": ref_id, - }) - else: - # It's a draft node - influence = in_degree.get(nid, 0) + draft_out.get(nid, 0) - nodes.append({ - "id": nid, - "type": "draft", - "title": draft_titles.get(nid, nid), - "influence": draft_out.get(nid, 0), - "category": draft_cats.get(nid, "Other"), - }) - - # Stats - rfc_count = sum(1 for n in nodes if n["type"] == "rfc") - draft_count = sum(1 for n in nodes if n["type"] == "draft") - - return { - "nodes": nodes, - "edges": filtered_edges, - "stats": { - "node_count": len(nodes), - "edge_count": len(filtered_edges), - "rfc_count": rfc_count, - "draft_count": draft_count, - }, - } - - -def global_search(db: Database, query: str) -> SearchResults: - """Search across drafts (FTS5), ideas, authors, and gaps. - - Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}. - """ - results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []} - if not query or not query.strip(): - return results - - q = query.strip() - - # 1. 
Drafts via FTS5 - try: - fts_query = re.sub(r'[^\w\s]', '', q) - fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE) - fts_query = re.sub(r'\s+', ' ', fts_query).strip() - if not fts_query: - raise ValueError("empty query after sanitization") - rows = db.conn.execute( - """SELECT d.name, d.title, d.abstract, d.time, d."group" - FROM drafts d - JOIN drafts_fts f ON d.rowid = f.rowid - WHERE drafts_fts MATCH ? - ORDER BY rank - LIMIT 50""", - (fts_query,), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - except Exception: - # FTS5 match can fail on certain query syntax; fall back to LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT name, title, abstract, time, "group" FROM drafts - WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ? - LIMIT 50""", - (like, like, like), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - - # 2. Ideas via LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT id, title, description, idea_type, draft_name FROM ideas - WHERE (title LIKE ? OR description LIKE ?) - AND draft_name NOT IN (SELECT draft_name FROM ratings WHERE false_positive = 1) - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["ideas"].append({ - "id": r["id"], - "title": r["title"], - "description": (r["description"] or "")[:200], - "type": r["idea_type"], - "draft_name": r["draft_name"], - }) - - # 3. Authors via LIKE - results["authors"] = db.search_authors(q, limit=50) - - # 4. 
Gaps via LIKE - results["gaps"] = db.search_gaps(q, limit=50) - - return results - - -def get_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE (cached for 5 min).""" - return _cached("landscape_tsne", lambda: _compute_landscape_tsne(db)) - - -def _compute_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" - - - embeddings = db.all_embeddings() - if len(embeddings) < 5: - return [] - - pairs = db.drafts_with_ratings(limit=1000) - rating_map = {d.name: r for d, r in pairs} - draft_map = {d.name: d for d, _ in pairs} - - # Filter to drafts that have both embeddings and ratings - names = [n for n in embeddings if n in rating_map] - if len(names) < 5: - return [] - - matrix = np.array([embeddings[n] for n in names]) - - try: - tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), - random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix) - except Exception: - return [] - - result = [] - for i, name in enumerate(names): - r = rating_map[name] - d = draft_map.get(name) - result.append({ - "name": name, - "title": d.title if d else name, - "x": round(float(coords[i, 0]), 3), - "y": round(float(coords[i, 1]), 3), - "category": r.categories[0] if r.categories else "Other", - "score": round(r.composite_score, 2), - }) - return result - - -def get_comparison_data(db: Database, names: list[str]) -> dict | None: - """Get comparison data for a list of drafts. 
- - Returns { - drafts: [{name, title, abstract, rating, ideas, refs, ...}], - shared_ideas: [{title, drafts: [name,...]}], - unique_ideas: {name: [{title, description}]}, - shared_refs: [{type, id, drafts: [name,...]}], - unique_refs: {name: [{type, id}]}, - similarities: [{a, b, similarity}], - comparison_text: str | None, - } - """ - - - drafts_data = [] - all_ideas: dict[str, list[dict]] = {} - all_refs: dict[str, list[tuple[str, str]]] = {} - - for name in names: - detail = get_draft_detail(db, name) - if not detail: - continue - drafts_data.append(detail) - all_ideas[name] = detail.get("ideas", []) - all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])] - - if len(drafts_data) < 2: - return None - - # Find shared vs unique ideas (by title similarity) - idea_title_drafts: dict[str, list[str]] = {} - for name, ideas in all_ideas.items(): - for idea in ideas: - title_lower = idea["title"].lower().strip() - if title_lower not in idea_title_drafts: - idea_title_drafts[title_lower] = [] - idea_title_drafts[title_lower].append(name) - - shared_ideas = [ - {"title": title, "drafts": draft_list} - for title, draft_list in idea_title_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_ideas: dict[str, list[dict]] = {} - for name, ideas in all_ideas.items(): - unique = [] - for idea in ideas: - title_lower = idea["title"].lower().strip() - if len(set(idea_title_drafts.get(title_lower, []))) <= 1: - unique.append({"title": idea["title"], "description": idea.get("description", "")}) - unique_ideas[name] = unique - - # Find shared vs unique references - ref_drafts: dict[tuple[str, str], list[str]] = {} - for name, refs in all_refs.items(): - for ref in refs: - if ref not in ref_drafts: - ref_drafts[ref] = [] - ref_drafts[ref].append(name) - - shared_refs = [ - {"type": ref[0], "id": ref[1], "drafts": draft_list} - for ref, draft_list in ref_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_refs: dict[str, list[dict]] = {} - for name, refs in 
all_refs.items(): - unique = [] - for ref in refs: - if len(set(ref_drafts.get(ref, []))) <= 1: - unique.append({"type": ref[0], "id": ref[1]}) - unique_refs[name] = unique - - # Pairwise embedding similarities - embeddings = db.all_embeddings() - similarities = [] - valid_names = [d["name"] for d in drafts_data] - for i in range(len(valid_names)): - for j in range(i + 1, len(valid_names)): - a, b = valid_names[i], valid_names[j] - if a in embeddings and b in embeddings: - vec_a = embeddings[a] - vec_b = embeddings[b] - dot = np.dot(vec_a, vec_b) - norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) - sim = float(dot / norm) if norm > 0 else 0.0 - similarities.append({"a": a, "b": b, "similarity": round(sim, 4)}) - - return { - "drafts": drafts_data, - "shared_ideas": shared_ideas, - "unique_ideas": unique_ideas, - "shared_refs": shared_refs, - "unique_refs": unique_refs, - "similarities": similarities, - "comparison_text": None, - } - - -# --------------------------------------------------------------------------- -# Architecture Designer — System-of-Systems view -# --------------------------------------------------------------------------- - -# Architectural layers (bottom-up stack) -_ARCH_LAYERS = [ - {"id": "transport", "label": "Transport & Networking", "order": 0, - "keywords": {"transport", "network", "routing", "tunnel", "packet", "flow", "traffic", "qos", "sdwan", "mpls", "bgp", "ospf", "segment", "srv6", "quic", "http", "grpc", "mqtt", "yang", "snmp", "netconf", "restconf"}}, - {"id": "identity", "label": "Identity & Trust", "order": 1, - "keywords": {"identity", "auth", "authentication", "authorization", "credential", "certificate", "trust", "attestation", "oauth", "token", "signing", "verification", "verifiable", "did", "vc", "pki", "spiffe", "acl"}}, - {"id": "discovery", "label": "Discovery & Registration", "order": 2, - "keywords": {"discovery", "registration", "registry", "catalog", "advertisement", "announce", "capability", "service", "lookup", 
"resolution", "dns", "directory"}}, - {"id": "communication", "label": "Agent Communication", "order": 3, - "keywords": {"a2a", "agent", "communication", "message", "messaging", "protocol", "exchange", "negotiation", "handshake", "session", "dialogue", "interaction", "mcp", "interop"}}, - {"id": "coordination", "label": "Task & Coordination", "order": 4, - "keywords": {"task", "delegation", "orchestration", "workflow", "planning", "coordination", "consensus", "collaboration", "multi-agent", "swarm", "composition", "scheduling"}}, - {"id": "intelligence", "label": "AI & Inference", "order": 5, - "keywords": {"model", "inference", "learning", "training", "ml", "neural", "llm", "embedding", "reasoning", "decision", "prediction", "classification", "generative", "rag", "fine-tuning"}}, - {"id": "safety", "label": "Safety & Governance", "order": 6, - "keywords": {"safety", "ethical", "governance", "policy", "audit", "explainability", "transparency", "accountability", "bias", "fairness", "compliance", "regulation", "risk", "shutdown", "alignment", "adversarial", "privacy", "consent"}}, - {"id": "application", "label": "Application Domains", "order": 7, - "keywords": {"healthcare", "autonomous", "vehicle", "robotics", "iot", "digital twin", "supply chain", "finance", "manufacturing", "energy", "smart", "edge", "cloud", "sensing"}}, -] - -_LAYER_KEYWORDS = {l["id"]: l["keywords"] for l in _ARCH_LAYERS} - - -def _classify_to_layer(text: str) -> str: - """Classify a piece of text to the best-matching architectural layer.""" - text_lower = text.lower() - words = set(re.findall(r"[a-z][a-z0-9-]+", text_lower)) - scores: dict[str, int] = {} - for layer_id, kws in _LAYER_KEYWORDS.items(): - scores[layer_id] = len(words & kws) - # Also check for multi-word keywords as substrings - for kw in kws: - if len(kw) > 4 and kw in text_lower: - scores[layer_id] += 1 - best = max(scores, key=lambda k: scores[k]) - return best if scores[best] > 0 else "communication" # default - - -def 
get_architecture(db: Database) -> dict: - """Build system-of-systems architecture from idea clusters, gaps, and source coverage.""" - return _cached("architecture", lambda: _compute_architecture(db), ttl=600) - - -def _compute_architecture(db: Database) -> dict: - """Compute the architecture view. - - Returns: - { - "components": [...], # architectural building blocks - "dependencies": [...], # edges between components - "gaps": [...], # gaps mapped to layers - "layers": [...], # layer definitions - "source_coverage": {...}, # per-layer source coverage - "stats": {...} - } - """ - # --- Gather raw data --- - cluster_data = get_idea_clusters(db) - clusters = cluster_data.get("clusters", []) - links = cluster_data.get("links", []) - all_gaps = db.all_gaps() - - # Source coverage: count drafts per source per layer - draft_rows = db.conn.execute( - "SELECT d.name, d.title, d.abstract, d.source, r.categories " - "FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name " - "WHERE COALESCE(r.false_positive, 0) = 0" - ).fetchall() - - # Build components from idea clusters - components = [] - cluster_to_component: dict[int, int] = {} # cluster_id -> component index - - for cl in clusters: - if cl["size"] < 3: - continue # skip tiny clusters - - # Determine layer from cluster theme + idea titles - text_blob = cl.get("theme", "") - for idea in cl.get("ideas", [])[:10]: - text_blob += " " + idea.get("title", "") + " " + idea.get("description", "") - layer = _classify_to_layer(text_blob) - - # Source coverage for this component's drafts - draft_names = set(cl.get("drafts", [])) - sources: Counter = Counter() - comp_drafts: list[dict] = [] - for dr in draft_rows: - if dr["name"] in draft_names: - sources[dr["source"] or "ietf"] += 1 - comp_drafts.append({"name": dr["name"], "title": (dr["title"] or dr["name"])[:80], "source": dr["source"] or "ietf"}) - - # Idea type breakdown - type_counts: Counter = Counter() - for idea in cl.get("ideas", []): - t = idea.get("type", "") - if 
t: - type_counts[t] += 1 - - # Maturity: rough proxy from idea count and source diversity - maturity = min(5, 1 + len(sources) + (1 if cl["size"] >= 10 else 0) + (1 if cl.get("cross_wg") else 0)) - - comp = { - "id": len(components), - "cluster_id": cl["id"], - "name": cl.get("theme", f"Component {cl['id']}"), - "layer": layer, - "size": cl["size"], - "draft_count": len(draft_names), - "drafts": comp_drafts[:20], - "sources": dict(sources.most_common()), - "type_breakdown": dict(type_counts.most_common(5)), - "maturity": maturity, - "wgs": cl.get("wgs", [])[:3], - "top_ideas": [{"title": i["title"], "type": i.get("type", ""), "draft_name": i.get("draft_name", "")} - for i in cl.get("ideas", [])[:5]], - "categories": cl.get("categories", []), - } - cluster_to_component[cl["id"]] = comp["id"] - components.append(comp) - - # Build dependencies from cross-cluster links - dependencies = [] - for link in links: - src_comp = cluster_to_component.get(link["source"]) - tgt_comp = cluster_to_component.get(link["target"]) - if src_comp is not None and tgt_comp is not None and src_comp != tgt_comp: - dependencies.append({ - "source": src_comp, - "target": tgt_comp, - "similarity": link.get("best_pair_sim", link.get("similarity", 0)), - "idea_a": link.get("idea_a", ""), - "idea_b": link.get("idea_b", ""), - }) - - # Map gaps to layers - gap_items = [] - for gap in all_gaps: - text = gap["topic"] + " " + gap.get("description", "") + " " + gap.get("category", "") - layer = _classify_to_layer(text) - gap_items.append({ - "id": gap["id"], - "topic": gap["topic"], - "description": gap["description"], - "evidence": gap.get("evidence", ""), - "severity": gap.get("severity", "medium"), - "category": gap.get("category", ""), - "layer": layer, - }) - - # Source coverage per layer - source_coverage: dict[str, dict[str, int]] = {l["id"]: Counter() for l in _ARCH_LAYERS} - for dr in draft_rows: - text = (dr["title"] or "") + " " + (dr["abstract"] or "")[:200] - layer = 
_classify_to_layer(text) - source_coverage[layer][dr["source"] or "ietf"] += 1 - # Convert Counters to dicts - source_coverage = {k: dict(v) for k, v in source_coverage.items()} - - # Layer summary stats - layer_info = [] - for l in _ARCH_LAYERS: - lid = l["id"] - comp_count = sum(1 for c in components if c["layer"] == lid) - idea_count = sum(c["size"] for c in components if c["layer"] == lid) - gap_count = sum(1 for g in gap_items if g["layer"] == lid) - layer_info.append({ - "id": l["id"], - "label": l["label"], - "order": l["order"], - "component_count": comp_count, - "idea_count": idea_count, - "gap_count": gap_count, - "coverage": source_coverage.get(lid, {}), - "total_drafts": sum(source_coverage.get(lid, {}).values()), - }) - - return { - "components": components, - "dependencies": dependencies, - "gaps": gap_items, - "layers": layer_info, - "stats": { - "total_components": len(components), - "total_dependencies": len(dependencies), - "total_gaps": len(gap_items), - "layers_with_gaps": len(set(g["layer"] for g in gap_items)), - }, - } - - -def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: - """Search-only (free) — returns sources + cached answer if available.""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.search_only(question, top_k=top_k) - - -def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: - """Run Claude synthesis (costs tokens, result is cached permanently).""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.ask(question, top_k=top_k, cheap=cheap) - - -# ── New Analysis Functions ────────────────────────────────────────────── - -def get_idea_analysis(db: Database) -> dict: - """Return comprehensive idea analysis data for the idea-analysis page. 
- - Includes novelty distribution, type breakdown with avg novelty, - top novel ideas, ideas-per-draft distribution, cross-tab of type x source, - shared ideas across drafts, and idea novelty vs draft rating correlation. - """ - from collections import Counter, defaultdict - from difflib import SequenceMatcher - - # Fetch raw data - all_ideas = db.conn.execute( - """SELECT i.id, i.draft_name, i.title, i.description, i.idea_type, - i.novelty_score - FROM ideas i ORDER BY i.novelty_score DESC NULLS LAST""" - ).fetchall() - all_ideas = [dict(r) for r in all_ideas] - - # Draft ratings lookup - ratings_rows = db.conn.execute( - """SELECT d.name, d.title as draft_title, d.source, - r.novelty AS r_novelty, r.maturity, r.overlap, r.momentum, r.relevance - FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name""" - ).fetchall() - draft_info = {} - for r in ratings_rows: - row = dict(r) - # Compute composite score (average of 5 dimensions) - dims = [row.get("r_novelty"), row.get("maturity"), row.get("overlap"), - row.get("momentum"), row.get("relevance")] - valid = [d for d in dims if d is not None] - row["composite_score"] = sum(valid) / len(valid) if valid else None - draft_info[row["name"]] = row - - total = len(all_ideas) - scored = [i for i in all_ideas if i.get("novelty_score") is not None] - unscored = total - len(scored) - avg_novelty = sum(i["novelty_score"] for i in scored) / len(scored) if scored else 0 - - # Embedding coverage - embed_count = db.conn.execute("SELECT COUNT(*) FROM idea_embeddings").fetchone()[0] - - # --- Novelty score distribution (histogram) --- - novelty_dist = Counter(i["novelty_score"] for i in scored) - novelty_histogram = { - "labels": [1, 2, 3, 4, 5], - "values": [novelty_dist.get(s, 0) for s in [1, 2, 3, 4, 5]], - } - - # --- Ideas by type with counts and avg novelty --- - type_data = defaultdict(lambda: {"count": 0, "novelty_sum": 0, "novelty_n": 0}) - for idea in all_ideas: - t = idea.get("idea_type") or "other" - 
type_data[t]["count"] += 1 - if idea.get("novelty_score") is not None: - type_data[t]["novelty_sum"] += idea["novelty_score"] - type_data[t]["novelty_n"] += 1 - - by_type = [] - for t, d in sorted(type_data.items(), key=lambda x: x[1]["count"], reverse=True): - avg = d["novelty_sum"] / d["novelty_n"] if d["novelty_n"] > 0 else 0 - by_type.append({"type": t, "count": d["count"], "avg_novelty": round(avg, 2)}) - - type_names = [t["type"] for t in by_type] - - # --- Top 20 most novel ideas (score 4-5) --- - top_novel = [] - for idea in all_ideas: - if idea.get("novelty_score") and idea["novelty_score"] >= 4: - di = draft_info.get(idea["draft_name"], {}) - top_novel.append({ - "title": idea["title"], - "description": idea["description"], - "type": idea.get("idea_type", "other"), - "novelty_score": idea["novelty_score"], - "draft_name": idea["draft_name"], - "draft_title": di.get("draft_title", ""), - "draft_score": di.get("composite_score"), - }) - top_novel.sort(key=lambda x: (x["novelty_score"], x.get("draft_score") or 0), reverse=True) - top_novel = top_novel[:20] - - # --- Ideas per draft distribution --- - ideas_per_draft = Counter(i["draft_name"] for i in all_ideas) - ipd_dist = Counter(ideas_per_draft.values()) - ideas_per_draft_hist = { - "labels": sorted(ipd_dist.keys()), - "values": [ipd_dist[k] for k in sorted(ipd_dist.keys())], - } - # Also top drafts by idea count - top_idea_drafts = [] - for name, count in ideas_per_draft.most_common(10): - di = draft_info.get(name, {}) - top_idea_drafts.append({ - "name": name, - "draft_title": di.get("draft_title", ""), - "idea_count": count, - "score": di.get("composite_score"), - }) - - # --- Cross-tabulation: idea_type x source --- - type_source = defaultdict(lambda: defaultdict(int)) - for idea in all_ideas: - t = idea.get("idea_type") or "other" - di = draft_info.get(idea["draft_name"], {}) - source = di.get("source", "ietf") or "ietf" - type_source[t][source] += 1 - - sources = sorted(set( - di.get("source", 
"ietf") or "ietf" for di in draft_info.values() - )) - cross_tab = [] - for t in type_names: - row = {"type": t} - for s in sources: - row[s] = type_source[t].get(s, 0) - cross_tab.append(row) - - # --- Shared ideas across drafts --- - idea_groups: list[dict] = [] - for idea in all_ideas: - title_lower = idea["title"].lower().strip() - matched = False - for group in idea_groups: - ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() - if ratio >= 0.75: - group["ideas"].append(idea) - group["drafts"].add(idea["draft_name"]) - matched = True - break - if not matched: - idea_groups.append({ - "canonical": title_lower, - "title": idea["title"], - "ideas": [idea], - "drafts": {idea["draft_name"]}, - }) - - shared_ideas = [] - for g in sorted(idea_groups, key=lambda x: len(x["drafts"]), reverse=True): - if len(g["drafts"]) < 2: - break - shared_ideas.append({ - "title": g["title"], - "appearances": len(g["drafts"]), - "drafts": sorted(g["drafts"])[:8], - "types": list(set(i.get("idea_type", "other") for i in g["ideas"])), - }) - - # --- Scatter: draft avg idea novelty vs draft relevance --- - draft_idea_novelty = defaultdict(list) - for idea in scored: - draft_idea_novelty[idea["draft_name"]].append(idea["novelty_score"]) - - scatter_data = [] - for name, scores in draft_idea_novelty.items(): - di = draft_info.get(name, {}) - if di.get("relevance") is not None and di.get("composite_score") is not None: - scatter_data.append({ - "name": name, - "avg_idea_novelty": round(sum(scores) / len(scores), 2), - "relevance": di["relevance"], - "score": di["composite_score"], - "idea_count": len(scores), - "source": di.get("source", "ietf") or "ietf", - }) - - # --- Sunburst data: type -> novelty band --- - sunburst_labels = [] - sunburst_parents = [] - sunburst_values = [] - # Root - sunburst_labels.append("All Ideas") - sunburst_parents.append("") - sunburst_values.append(total) - - novelty_bands = {"High (4-5)": lambda s: s is not None and s >= 4, - "Medium 
(3)": lambda s: s is not None and s == 3, - "Low (1-2)": lambda s: s is not None and s <= 2, - "Unscored": lambda s: s is None} - - for t_info in by_type: - t = t_info["type"] - sunburst_labels.append(t) - sunburst_parents.append("All Ideas") - sunburst_values.append(t_info["count"]) - # Sub-bands - type_ideas = [i for i in all_ideas if (i.get("idea_type") or "other") == t] - for band, fn in novelty_bands.items(): - cnt = sum(1 for i in type_ideas if fn(i.get("novelty_score"))) - if cnt > 0: - sunburst_labels.append(f"{t} - {band}") - sunburst_parents.append(t) - sunburst_values.append(cnt) - - return { - "total": total, - "scored": len(scored), - "unscored": unscored, - "avg_novelty": round(avg_novelty, 2), - "embed_count": embed_count, - "embed_pct": round(embed_count / total * 100, 1) if total > 0 else 0, - "type_count": len(by_type), - "novelty_histogram": novelty_histogram, - "by_type": by_type, - "top_novel": top_novel, - "ideas_per_draft_hist": ideas_per_draft_hist, - "top_idea_drafts": top_idea_drafts, - "cross_tab": cross_tab, - "sources": sources, - "shared_ideas": shared_ideas, - "scatter_data": scatter_data, - "sunburst": { - "labels": sunburst_labels, - "parents": sunburst_parents, - "values": sunburst_values, - }, - } - - - - -def get_source_comparison(db: Database) -> dict: - """Cross-source comparison: ratings, categories, counts by standards body.""" - pairs_all = db.drafts_with_ratings(limit=2000) - # Also include false positives for completeness of source counts - pairs_fp = db.drafts_with_ratings(limit=2000, include_false_positives=True) - - # Build per-source data - source_stats: dict[str, dict] = {} - source_categories: dict[str, Counter] = defaultdict(Counter) - source_ratings: dict[str, dict[str, list]] = defaultdict(lambda: { - "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [], "scores": [], - }) - # Collect author counts per source - all_authors_by_source: dict[str, set] = defaultdict(set) - - for draft, rating 
in pairs_all: - src = getattr(draft, "source", "ietf") or "ietf" - source_ratings[src]["novelty"].append(rating.novelty) - source_ratings[src]["maturity"].append(rating.maturity) - source_ratings[src]["overlap"].append(rating.overlap) - source_ratings[src]["momentum"].append(rating.momentum) - source_ratings[src]["relevance"].append(rating.relevance) - source_ratings[src]["scores"].append(round(rating.composite_score, 2)) - for cat in rating.categories: - source_categories[src][cat] += 1 - - # Get all drafts (including unrated) for draft counts - all_drafts = db.list_drafts(limit=5000) - source_draft_counts: Counter = Counter() - for d in all_drafts: - src = getattr(d, "source", "ietf") or "ietf" - source_draft_counts[src] += 1 - - # Author counts by source - try: - rows = db.conn.execute( - """SELECT d.source, COUNT(DISTINCT da.person_id) as author_count - FROM drafts d - JOIN draft_authors da ON d.name = da.draft_name - GROUP BY d.source""" - ).fetchall() - for r in rows: - src = r["source"] or "ietf" - all_authors_by_source[src] = r["author_count"] - except Exception: - pass - - # Idea counts by source - source_idea_counts: Counter = Counter() - try: - rows = db.conn.execute( - """SELECT d.source, COUNT(*) as idea_count - FROM ideas i - JOIN drafts d ON i.draft_name = d.name - GROUP BY d.source""" - ).fetchall() - for r in rows: - src = r["source"] or "ietf" - source_idea_counts[src] = r["idea_count"] - except Exception: - pass - - # Build summary table - all_sources = sorted(set(source_draft_counts.keys()) | set(source_ratings.keys())) - summary = [] - for src in all_sources: - rats = source_ratings.get(src, {"scores": []}) - cats = source_categories.get(src, Counter()) - top_cat = cats.most_common(1)[0][0] if cats else "N/A" - avg_score = round(sum(rats["scores"]) / len(rats["scores"]), 2) if rats["scores"] else 0.0 - summary.append({ - "source": src, - "drafts": source_draft_counts.get(src, 0), - "rated": len(rats["scores"]), - "authors": 
all_authors_by_source.get(src, 0), - "ideas": source_idea_counts.get(src, 0), - "avg_score": avg_score, - "top_category": top_cat, - }) - - # Radar data: average of each dimension per source - radar = {} - for src, rats in source_ratings.items(): - if not rats["scores"]: - continue - n = len(rats["scores"]) - radar[src] = { - "novelty": round(sum(rats["novelty"]) / n, 2), - "maturity": round(sum(rats["maturity"]) / n, 2), - "overlap": round(sum(rats["overlap"]) / n, 2), - "momentum": round(sum(rats["momentum"]) / n, 2), - "relevance": round(sum(rats["relevance"]) / n, 2), - "count": n, - } - - # Category distribution by source (for stacked bar / heatmap) - all_cats = sorted({cat for cats in source_categories.values() for cat in cats}) - heatmap = { - "sources": list(source_categories.keys()), - "categories": all_cats, - "values": [], - } - for src in heatmap["sources"]: - row = [source_categories[src].get(cat, 0) for cat in all_cats] - heatmap["values"].append(row) - - # Unique/shared categories analysis - source_cat_sets = {src: set(cats.keys()) for src, cats in source_categories.items()} - unique_cats = {} - for src, cats in source_cat_sets.items(): - others = set() - for s2, c2 in source_cat_sets.items(): - if s2 != src: - others |= c2 - unique_cats[src] = sorted(cats - others) - - shared_cats = set() - for src, cats in source_cat_sets.items(): - for s2, c2 in source_cat_sets.items(): - if s2 != src: - shared_cats |= (cats & c2) - shared_cats = sorted(shared_cats) - - return { - "summary": summary, - "radar": radar, - "heatmap": heatmap, - "unique_categories": unique_cats, - "shared_categories": shared_cats, - } - - -def get_false_positive_profile(db: Database) -> dict: - """Profile drafts flagged as false positives.""" - # Get false positives - fp_rows = db.false_positive_drafts_raw() - - # Get non-FP rated drafts for comparison - nonfp_rows = db.non_false_positive_ratings_raw() - - total_rated = db.rated_count() - total_drafts = 
db.count_drafts(include_false_positives=True) - - # Build FP list - fp_list = [] - fp_categories: Counter = Counter() - fp_sources: Counter = Counter() - fp_dims = {"novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": []} - - for row in fp_rows: - cats = json.loads(row["r_categories"]) if row["r_categories"] else [] - src = row["source"] or "ietf" - fp_list.append({ - "name": row["name"], - "title": row["title"], - "source": src, - "categories": cats, - "relevance": row["relevance"], - "novelty": row["novelty"], - "maturity": row["maturity"], - "overlap": row["overlap"], - "momentum": row["momentum"], - "summary": row["summary"] or "", - }) - for cat in cats: - fp_categories[cat] += 1 - fp_sources[src] += 1 - fp_dims["novelty"].append(row["novelty"]) - fp_dims["maturity"].append(row["maturity"]) - fp_dims["overlap"].append(row["overlap"]) - fp_dims["momentum"].append(row["momentum"]) - fp_dims["relevance"].append(row["relevance"]) - - # Non-FP dimensions for comparison - nonfp_dims = {"novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": []} - nonfp_categories: Counter = Counter() - for row in nonfp_rows: - nonfp_dims["novelty"].append(row["novelty"]) - nonfp_dims["maturity"].append(row["maturity"]) - nonfp_dims["overlap"].append(row["overlap"]) - nonfp_dims["momentum"].append(row["momentum"]) - nonfp_dims["relevance"].append(row["relevance"]) - cats = json.loads(row["r_categories"]) if row["r_categories"] else [] - for cat in cats: - nonfp_categories[cat] += 1 - - # Top terms from FP abstracts - from collections import Counter as _Counter - stop_words = { - "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", - "of", "with", "by", "from", "is", "it", "that", "this", "are", "was", - "be", "as", "can", "may", "will", "not", "has", "have", "been", "which", - "their", "its", "also", "such", "these", "would", "should", "could", - "more", "other", "than", "into", "about", "between", "over", "after", - "all", 
"one", "two", "new", "they", "we", "our", "each", "some", "any", - "there", "what", "when", "how", "where", "who", "does", "do", "did", - "no", "if", "so", "up", "out", "only", "used", "using", "use", "based", - "through", "both", "well", "within", "must", "while", "had", "were", - } - word_counter: Counter = Counter() - for row in fp_rows: - abstract = (row["abstract"] or "").lower() - title = (row["title"] or "").lower() - text = abstract + " " + title - words = re.findall(r'[a-z]{3,}', text) - for w in words: - if w not in stop_words: - word_counter[w] += 1 - top_terms = word_counter.most_common(30) - - return { - "count": len(fp_list), - "total_rated": total_rated, - "total_drafts": total_drafts, - "pct_of_total": round(100 * len(fp_list) / total_drafts, 1) if total_drafts else 0, - "pct_of_rated": round(100 * len(fp_list) / total_rated, 1) if total_rated else 0, - "fp_list": fp_list, - "fp_categories": dict(fp_categories.most_common()), - "fp_sources": dict(fp_sources.most_common()), - "fp_dims": fp_dims, - "nonfp_dims": nonfp_dims, - "top_terms": top_terms, - "nonfp_categories": dict(nonfp_categories.most_common(20)), - } - - -def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: - """Search-only (free) — returns sources + cached answer if available.""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.search_only(question, top_k=top_k) - - -def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: - """Run Claude synthesis (costs tokens, result is cached permanently).""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.ask(question, top_k=top_k, cheap=cheap) - - -def get_citation_influence(db: Database) -> dict: - """Return citation influence analysis data (cached for 5 min).""" - return _cached("citation_influence", lambda: _compute_citation_influence(db)) - - -def _compute_citation_influence(db: Database) -> dict: - """Compute citation 
influence metrics from the draft_refs table. - - Returns dict with: - - top_cited_rfcs: top 20 most-cited RFCs with citation counts and citing drafts - - top_citing_drafts: top 20 drafts that cite the most references - - citations_by_category: average citations per category - - stats: total citations, unique RFCs, avg refs per draft - - draft_network: draft-to-draft citation edges for visualization - """ - # Get all references - rows = db.conn.execute( - "SELECT draft_name, ref_type, ref_id FROM draft_refs" - ).fetchall() - - # Get draft titles and categories - draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() - draft_titles = {r["name"]: r["title"] for r in draft_rows} - - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() - draft_cats: dict[str, str] = {} - for r in rating_rows: - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - draft_cats[r["draft_name"]] = cats[0] if cats else "Other" - except Exception: - draft_cats[r["draft_name"]] = "Other" - - # Well-known RFC names - rfc_names = { - "2119": "Key words (MUST/SHALL/MAY)", "8174": "Key words update", - "8259": "JSON", "7519": "JWT", "6749": "OAuth 2.0", - "7540": "HTTP/2", "9110": "HTTP Semantics", "7525": "TLS Recommendations", - "8446": "TLS 1.3", "3986": "URIs", "7230": "HTTP/1.1 Syntax", - "7231": "HTTP/1.1 Semantics", "8288": "Web Linking", "6125": "TLS Server Identity", - "7515": "JWS", "7516": "JWE", "7517": "JWK", "7518": "JWA", - "9449": "DPoP", "6750": "OAuth Bearer", "8725": "JWT Best Practices", - "9396": "Rich Authorization Requests", "9101": "JAR", - "8414": "OAuth Server Metadata", "7591": "Dynamic Client Registration", - "8705": "mTLS for OAuth", "9068": "JWT Access Tokens", - "6819": "OAuth Threat Model", "9200": "ACE-OAuth", "9052": "COSE", - "8392": "CWT", "7252": "CoAP", - } - - # In-degree: how many times each RFC is cited - rfc_citations: dict[str, list[str]] = defaultdict(list) - draft_out_count: dict[str, 
int] = Counter() - draft_to_draft_edges = [] - total_citations = 0 - - for r in rows: - draft_name = r["draft_name"] - ref_type = r["ref_type"] - ref_id = r["ref_id"] - total_citations += 1 - draft_out_count[draft_name] += 1 - - if ref_type == "rfc": - rfc_citations[ref_id].append(draft_name) - elif ref_type == "draft": - draft_to_draft_edges.append({ - "source": draft_name, - "target": ref_id, - "source_title": draft_titles.get(draft_name, draft_name), - "target_title": draft_titles.get(ref_id, ref_id), - }) - - # Top 20 most-cited RFCs - rfc_sorted = sorted(rfc_citations.items(), key=lambda x: len(x[1]), reverse=True) - top_cited_rfcs = [] - for ref_id, citing_drafts in rfc_sorted[:20]: - top_cited_rfcs.append({ - "rfc_id": ref_id, - "name": rfc_names.get(ref_id, ""), - "count": len(citing_drafts), - "drafts": citing_drafts[:10], # Limit to first 10 for display - "total_drafts": len(citing_drafts), - }) - - # Top 20 most-citing drafts (out-degree) - draft_sorted = sorted(draft_out_count.items(), key=lambda x: x[1], reverse=True) - top_citing_drafts = [] - for draft_name, count in draft_sorted[:20]: - top_citing_drafts.append({ - "name": draft_name, - "title": draft_titles.get(draft_name, draft_name), - "count": count, - "category": draft_cats.get(draft_name, "Other"), - }) - - # Citation density by category - cat_totals: dict[str, int] = Counter() - cat_counts: dict[str, int] = Counter() - for draft_name, count in draft_out_count.items(): - cat = draft_cats.get(draft_name, "Other") - cat_totals[cat] += count - cat_counts[cat] += 1 - - citations_by_category = [] - for cat in sorted(cat_totals.keys()): - avg = cat_totals[cat] / cat_counts[cat] if cat_counts[cat] > 0 else 0 - citations_by_category.append({ - "category": cat, - "total_citations": cat_totals[cat], - "draft_count": cat_counts[cat], - "avg_citations": round(avg, 1), - }) - citations_by_category.sort(key=lambda x: x["avg_citations"], reverse=True) - - # PageRank-style influence: drafts that cite 
highly-cited RFCs - # Simple approximation: sum of (1 / citation_count) for each RFC cited - rfc_influence = {rid: len(drafts) for rid, drafts in rfc_citations.items()} - draft_pagerank: dict[str, float] = Counter() - for r in rows: - if r["ref_type"] == "rfc" and r["ref_id"] in rfc_influence: - # Higher score for citing highly-cited RFCs - draft_pagerank[r["draft_name"]] += rfc_influence[r["ref_id"]] - - pagerank_sorted = sorted(draft_pagerank.items(), key=lambda x: x[1], reverse=True) - top_pagerank = [] - for draft_name, score in pagerank_sorted[:20]: - top_pagerank.append({ - "name": draft_name, - "title": draft_titles.get(draft_name, draft_name), - "score": round(score, 1), - "category": draft_cats.get(draft_name, "Other"), - "out_degree": draft_out_count.get(draft_name, 0), - }) - - # Stats - unique_rfcs = len(rfc_citations) - drafts_with_refs = len(draft_out_count) - avg_refs = total_citations / drafts_with_refs if drafts_with_refs > 0 else 0 - - return { - "top_cited_rfcs": top_cited_rfcs, - "top_citing_drafts": top_citing_drafts, - "top_pagerank": top_pagerank, - "citations_by_category": citations_by_category, - "draft_network": draft_to_draft_edges[:200], # Limit for perf - "stats": { - "total_citations": total_citations, - "unique_rfcs": unique_rfcs, - "drafts_with_refs": drafts_with_refs, - "avg_refs_per_draft": round(avg_refs, 1), - }, - } - - -def get_bcp_analysis(db: Database) -> dict: - """Return BCP dependency analysis data (cached for 5 min).""" - return _cached("bcp_analysis", lambda: _compute_bcp_analysis(db)) - - -def _compute_bcp_analysis(db: Database) -> dict: - """Compute BCP dependency analysis. 
- - Returns dict with: - - bcps: all BCPs with citation counts and citing drafts - - co_citation: which BCPs tend to be co-cited - - by_category: BCP citation patterns by category - - coverage: what % of drafts cite at least one BCP - """ - # Get all BCP references - bcp_rows = db.conn.execute( - "SELECT draft_name, ref_id FROM draft_refs WHERE ref_type = 'bcp'" - ).fetchall() - - # Get draft titles and categories - draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() - draft_titles = {r["name"]: r["title"] for r in draft_rows} - total_drafts = len(draft_titles) - - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() - draft_cats: dict[str, str] = {} - for r in rating_rows: - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - draft_cats[r["draft_name"]] = cats[0] if cats else "Other" - except Exception: - draft_cats[r["draft_name"]] = "Other" - - # BCP citation counts - bcp_citations: dict[str, list[str]] = defaultdict(list) - draft_bcps: dict[str, list[str]] = defaultdict(list) - - for r in bcp_rows: - bcp_citations[r["ref_id"]].append(r["draft_name"]) - draft_bcps[r["draft_name"]].append(r["ref_id"]) - - # All BCPs with counts - bcps = [] - for bcp_id, citing_drafts in sorted(bcp_citations.items(), - key=lambda x: len(x[1]), reverse=True): - bcps.append({ - "bcp_id": bcp_id, - "count": len(citing_drafts), - "drafts": citing_drafts[:10], - "total_drafts": len(citing_drafts), - }) - - # Co-citation matrix: which BCPs appear together in the same draft - bcp_ids = sorted(bcp_citations.keys()) - co_citation = [] - for i, bcp_a in enumerate(bcp_ids): - drafts_a = set(bcp_citations[bcp_a]) - for j, bcp_b in enumerate(bcp_ids): - if j <= i: - continue - drafts_b = set(bcp_citations[bcp_b]) - shared = len(drafts_a & drafts_b) - if shared > 0: - co_citation.append({ - "bcp_a": bcp_a, - "bcp_b": bcp_b, - "count": shared, - }) - - # Heatmap data: full matrix for all BCPs (top 20 by citation count) - 
top_bcp_ids = [b["bcp_id"] for b in bcps[:20]] - heatmap_matrix = [] - for bcp_a in top_bcp_ids: - row = [] - drafts_a = set(bcp_citations.get(bcp_a, [])) - for bcp_b in top_bcp_ids: - drafts_b = set(bcp_citations.get(bcp_b, [])) - shared = len(drafts_a & drafts_b) - row.append(shared) - heatmap_matrix.append(row) - - # BCP citations by category - cat_bcp_count: dict[str, Counter] = defaultdict(Counter) - for draft_name, bcp_list in draft_bcps.items(): - cat = draft_cats.get(draft_name, "Other") - for bcp_id in bcp_list: - cat_bcp_count[cat][bcp_id] += 1 - - by_category = [] - for cat in sorted(cat_bcp_count.keys()): - top_bcps = cat_bcp_count[cat].most_common(5) - by_category.append({ - "category": cat, - "total_bcp_refs": sum(cat_bcp_count[cat].values()), - "unique_bcps": len(cat_bcp_count[cat]), - "top_bcps": [{"bcp_id": bid, "count": c} for bid, c in top_bcps], - }) - by_category.sort(key=lambda x: x["total_bcp_refs"], reverse=True) - - # Coverage - drafts_with_bcp = len(draft_bcps) - coverage_pct = (drafts_with_bcp / total_drafts * 100) if total_drafts > 0 else 0 - - return { - "bcps": bcps, - "co_citation": co_citation, - "heatmap_labels": top_bcp_ids, - "heatmap_matrix": heatmap_matrix, - "by_category": by_category, - "coverage": { - "total_drafts": total_drafts, - "drafts_with_bcp": drafts_with_bcp, - "coverage_pct": round(coverage_pct, 1), - "unique_bcps": len(bcp_citations), - "total_bcp_refs": len(bcp_rows), - }, - } - - -def global_search(db: Database, query: str) -> SearchResults: - """Search across drafts (FTS5), ideas, authors, and gaps. - - Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}. - """ - results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []} - if not query or not query.strip(): - return results - - q = query.strip() - - # 1. 
Drafts via FTS5 - try: - fts_query = re.sub(r'[^\w\s]', '', q) - fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE) - fts_query = re.sub(r'\s+', ' ', fts_query).strip() - if not fts_query: - raise ValueError("empty query after sanitization") - rows = db.conn.execute( - """SELECT d.name, d.title, d.abstract, d.time, d."group" - FROM drafts d - JOIN drafts_fts f ON d.rowid = f.rowid - WHERE drafts_fts MATCH ? - ORDER BY rank - LIMIT 50""", - (fts_query,), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - except Exception: - # FTS5 match can fail on certain query syntax; fall back to LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT name, title, abstract, time, "group" FROM drafts - WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ? - LIMIT 50""", - (like, like, like), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - - # 2. Ideas via LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT id, title, description, idea_type, draft_name FROM ideas - WHERE (title LIKE ? OR description LIKE ?) - AND draft_name NOT IN (SELECT draft_name FROM ratings WHERE false_positive = 1) - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["ideas"].append({ - "id": r["id"], - "title": r["title"], - "description": (r["description"] or "")[:200], - "type": r["idea_type"], - "draft_name": r["draft_name"], - }) - - # 3. Authors via LIKE - results["authors"] = db.search_authors(q, limit=50) - - # 4. 
Gaps via LIKE - results["gaps"] = db.search_gaps(q, limit=50) - - return results - - -def get_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE (cached for 5 min).""" - return _cached("landscape_tsne", lambda: _compute_landscape_tsne(db)) - - -def _compute_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" - - - embeddings = db.all_embeddings() - if len(embeddings) < 5: - return [] - - pairs = db.drafts_with_ratings(limit=1000) - rating_map = {d.name: r for d, r in pairs} - draft_map = {d.name: d for d, _ in pairs} - - # Filter to drafts that have both embeddings and ratings - names = [n for n in embeddings if n in rating_map] - if len(names) < 5: - return [] - - matrix = np.array([embeddings[n] for n in names]) - - try: - tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), - random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix) - except Exception: - return [] - - result = [] - for i, name in enumerate(names): - r = rating_map[name] - d = draft_map.get(name) - result.append({ - "name": name, - "title": d.title if d else name, - "x": round(float(coords[i, 0]), 3), - "y": round(float(coords[i, 1]), 3), - "category": r.categories[0] if r.categories else "Other", - "score": round(r.composite_score, 2), - }) - return result - - -def get_comparison_data(db: Database, names: list[str]) -> dict | None: - """Get comparison data for a list of drafts. 
- - Returns { - drafts: [{name, title, abstract, rating, ideas, refs, ...}], - shared_ideas: [{title, drafts: [name,...]}], - unique_ideas: {name: [{title, description}]}, - shared_refs: [{type, id, drafts: [name,...]}], - unique_refs: {name: [{type, id}]}, - similarities: [{a, b, similarity}], - comparison_text: str | None, - } - """ - - - drafts_data = [] - all_ideas: dict[str, list[dict]] = {} - all_refs: dict[str, list[tuple[str, str]]] = {} - - for name in names: - detail = get_draft_detail(db, name) - if not detail: - continue - drafts_data.append(detail) - all_ideas[name] = detail.get("ideas", []) - all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])] - - if len(drafts_data) < 2: - return None - - # Find shared vs unique ideas (by title similarity) - idea_title_drafts: dict[str, list[str]] = {} - for name, ideas in all_ideas.items(): - for idea in ideas: - title_lower = idea["title"].lower().strip() - if title_lower not in idea_title_drafts: - idea_title_drafts[title_lower] = [] - idea_title_drafts[title_lower].append(name) - - shared_ideas = [ - {"title": title, "drafts": draft_list} - for title, draft_list in idea_title_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_ideas: dict[str, list[dict]] = {} - for name, ideas in all_ideas.items(): - unique = [] - for idea in ideas: - title_lower = idea["title"].lower().strip() - if len(set(idea_title_drafts.get(title_lower, []))) <= 1: - unique.append({"title": idea["title"], "description": idea.get("description", "")}) - unique_ideas[name] = unique - - # Find shared vs unique references - ref_drafts: dict[tuple[str, str], list[str]] = {} - for name, refs in all_refs.items(): - for ref in refs: - if ref not in ref_drafts: - ref_drafts[ref] = [] - ref_drafts[ref].append(name) - - shared_refs = [ - {"type": ref[0], "id": ref[1], "drafts": draft_list} - for ref, draft_list in ref_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_refs: dict[str, list[dict]] = {} - for name, refs in 
all_refs.items(): - unique = [] - for ref in refs: - if len(set(ref_drafts.get(ref, []))) <= 1: - unique.append({"type": ref[0], "id": ref[1]}) - unique_refs[name] = unique - - # Pairwise embedding similarities - embeddings = db.all_embeddings() - similarities = [] - valid_names = [d["name"] for d in drafts_data] - for i in range(len(valid_names)): - for j in range(i + 1, len(valid_names)): - a, b = valid_names[i], valid_names[j] - if a in embeddings and b in embeddings: - vec_a = embeddings[a] - vec_b = embeddings[b] - dot = np.dot(vec_a, vec_b) - norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) - sim = float(dot / norm) if norm > 0 else 0.0 - similarities.append({"a": a, "b": b, "similarity": round(sim, 4)}) - - return { - "drafts": drafts_data, - "shared_ideas": shared_ideas, - "unique_ideas": unique_ideas, - "shared_refs": shared_refs, - "unique_refs": unique_refs, - "similarities": similarities, - "comparison_text": None, - } - - -def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: - """Search-only (free) — returns sources + cached answer if available.""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.search_only(question, top_k=top_k) - - -def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: - """Run Claude synthesis (costs tokens, result is cached permanently).""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.ask(question, top_k=top_k, cheap=cheap) - - -SAFETY_CATEGORIES = {"AI safety/alignment", "Agent identity/auth", "Policy/governance"} -CAPABILITY_CATEGORIES = {"A2A protocols", "Agent discovery/reg", "Autonomous netops", - "Data formats/interop", "Human-agent interaction", "Model serving/inference"} - - -def get_trends_data(db: Database) -> dict: - """Return temporal evolution data for the /trends page. - - Returns dict with: - - monthly_submissions: [{month, source, count}, ...] 
- - monthly_ratings: [{month, novelty, maturity, overlap, momentum, relevance}, ...] - - monthly_categories: [{month, category, count}, ...] - - safety_ratio: [{month, safety, capability, ratio}, ...] - - cumulative_ideas: [{month, total}, ...] - - monthly_new_authors: [{month, count}, ...] - - stats: {fastest_growing, newest_active} - - monthly_table: [{month, total, sources: {}, avg_score}, ...] - """ - conn = db.conn - - # 1. Monthly submissions by source - rows = conn.execute(""" - SELECT substr(time, 1, 7) AS month, source, COUNT(*) AS cnt - FROM drafts - WHERE time IS NOT NULL AND time != '' - GROUP BY month, source - ORDER BY month - """).fetchall() - monthly_submissions = [{"month": r["month"], "source": r["source"], "count": r["cnt"]} for r in rows] - - # 2. Monthly average ratings (all 5 dimensions) - rows = conn.execute(""" - SELECT substr(d.time, 1, 7) AS month, - AVG(r.novelty) AS novelty, AVG(r.maturity) AS maturity, - AVG(r.overlap) AS overlap, AVG(r.momentum) AS momentum, - AVG(r.relevance) AS relevance, - COUNT(*) AS cnt - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 - GROUP BY month - ORDER BY month - """).fetchall() - monthly_ratings = [{ - "month": r["month"], - "novelty": round(r["novelty"], 2), - "maturity": round(r["maturity"], 2), - "overlap": round(r["overlap"], 2), - "momentum": round(r["momentum"], 2), - "relevance": round(r["relevance"], 2), - "count": r["cnt"], - } for r in rows] - - # 3. 
Monthly category distribution - rows = conn.execute(""" - SELECT substr(d.time, 1, 7) AS month, r.categories - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 - """).fetchall() - cat_monthly: dict[str, Counter] = defaultdict(Counter) - all_cats: Counter = Counter() - for r in rows: - month = r["month"] - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - except (json.JSONDecodeError, TypeError): - cats = [] - for c in cats: - cat_monthly[month][c] += 1 - all_cats[c] += 1 - - # Top 8 categories - top_cats = [c for c, _ in all_cats.most_common(8)] - months_sorted = sorted(cat_monthly.keys()) - monthly_categories = [] - for month in months_sorted: - for cat in top_cats: - monthly_categories.append({ - "month": month, - "category": cat, - "count": cat_monthly[month].get(cat, 0), - }) - - # 4. Safety ratio over time - safety_ratio = [] - for month in months_sorted: - safety = sum(cat_monthly[month].get(c, 0) for c in SAFETY_CATEGORIES) - capability = sum(cat_monthly[month].get(c, 0) for c in CAPABILITY_CATEGORIES) - ratio = round(safety / capability, 2) if capability > 0 else 0 - safety_ratio.append({ - "month": month, - "safety": safety, - "capability": capability, - "ratio": ratio, - }) - - # 5. Cumulative idea count over time - rows = conn.execute(""" - SELECT substr(d.time, 1, 7) AS month, COUNT(i.id) AS cnt - FROM ideas i - JOIN drafts d ON i.draft_name = d.name - WHERE d.time IS NOT NULL AND d.time != '' - GROUP BY month - ORDER BY month - """).fetchall() - cumulative = 0 - cumulative_ideas = [] - for r in rows: - cumulative += r["cnt"] - cumulative_ideas.append({"month": r["month"], "total": cumulative}) - - # 6. 
Monthly new author count (first-time contributors) - rows = conn.execute(""" - SELECT da.person_id, MIN(substr(d.time, 1, 7)) AS first_month - FROM draft_authors da - JOIN drafts d ON da.draft_name = d.name - WHERE d.time IS NOT NULL AND d.time != '' - GROUP BY da.person_id - """).fetchall() - new_author_monthly: Counter = Counter() - for r in rows: - if r["first_month"]: - new_author_monthly[r["first_month"]] += 1 - monthly_new_authors = [ - {"month": m, "count": new_author_monthly.get(m, 0)} - for m in months_sorted - ] - - # 7. Stats: fastest growing category, newest active category - fastest_growing = "" - newest_active = "" - if len(months_sorted) >= 4: - mid = len(months_sorted) // 2 - early_months = months_sorted[:mid] - late_months = months_sorted[mid:] - best_growth = -999 - for cat in top_cats: - early = sum(cat_monthly[m].get(cat, 0) for m in early_months) - late = sum(cat_monthly[m].get(cat, 0) for m in late_months) - if early > 0: - growth = (late - early) / early - elif late > 0: - growth = float("inf") - else: - growth = 0 - if growth > best_growth: - best_growth = growth - fastest_growing = cat - - # Newest active: category with latest first appearance - cat_first_month: dict[str, str] = {} - for month in months_sorted: - for cat in all_cats: - if cat not in cat_first_month and cat_monthly[month].get(cat, 0) > 0: - cat_first_month[cat] = month - if cat_first_month: - newest_active = max(cat_first_month, key=lambda c: cat_first_month[c]) - - # 8. Monthly breakdown table - monthly_table = [] - for month in months_sorted: - # Get per-source counts - sources: dict[str, int] = {} - total = 0 - for s in monthly_submissions: - if s["month"] == month: - sources[s["source"]] = s["count"] - total += s["count"] - # Get avg score - avg_row = conn.execute(""" - SELECT AVG((r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0) AS avg_score - FROM drafts d JOIN ratings r ON d.name = r.draft_name - WHERE substr(d.time, 1, 7) = ? 
AND r.false_positive = 0 - """, (month,)).fetchone() - avg_score = round(avg_row["avg_score"], 2) if avg_row and avg_row["avg_score"] else 0 - monthly_table.append({ - "month": month, - "total": total, - "sources": sources, - "avg_score": avg_score, - }) - - return { - "monthly_submissions": monthly_submissions, - "monthly_ratings": monthly_ratings, - "monthly_categories": monthly_categories, - "safety_ratio": safety_ratio, - "cumulative_ideas": cumulative_ideas, - "monthly_new_authors": monthly_new_authors, - "top_categories": top_cats, - "months": months_sorted, - "stats": { - "fastest_growing": fastest_growing, - "newest_active": newest_active, - }, - "monthly_table": monthly_table, - } - - -# --------------------------------------------------------------------------- -# Draft Complexity Matrix -# --------------------------------------------------------------------------- - - -def get_complexity_data(db: Database) -> dict: - """Return draft complexity analysis data for the /complexity page. - - For each rated draft, compute structural complexity metrics and - correlate with rating dimensions. - - Returns dict with: - - drafts: [{name, title, pages, author_count, citation_count, idea_count, - category_count, novelty, maturity, overlap, momentum, relevance, - score, composite_complexity}, ...] - - correlations: {metric: {dimension: r_value}} - - top_complex: top 10 most complex drafts - - top_efficient: top 10 high-rating low-complexity drafts - - stats: {avg_pages, avg_authors, avg_citations, pages_coverage_pct} - - category_complexity: [{category, avg_pages, avg_authors, avg_citations, count}, ...] - - source_complexity: [{source, avg_pages, avg_authors, avg_citations, count}, ...] 
- """ - conn = db.conn - - # Build per-draft complexity data - rows = conn.execute(""" - SELECT d.name, d.title, d.pages, d.source, - r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.categories, - (r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0 AS score - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE r.false_positive = 0 - """).fetchall() - - # Author counts - author_counts = db.draft_author_count_map() - - # Citation counts (outgoing refs) - citation_counts = {} - for row in conn.execute(""" - SELECT draft_name, COUNT(*) AS cnt FROM draft_refs GROUP BY draft_name - """).fetchall(): - citation_counts[row["draft_name"]] = row["cnt"] - - # Idea counts - idea_counts = {} - for row in conn.execute(""" - SELECT draft_name, COUNT(*) AS cnt FROM ideas GROUP BY draft_name - """).fetchall(): - idea_counts[row["draft_name"]] = row["cnt"] - - drafts_data = [] - total_with_pages = 0 - total_drafts = 0 - for r in rows: - total_drafts += 1 - pages = r["pages"] - if pages is not None: - total_with_pages += 1 - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - except (json.JSONDecodeError, TypeError): - cats = [] - ac = author_counts.get(r["name"], 0) - cc = citation_counts.get(r["name"], 0) - ic = idea_counts.get(r["name"], 0) - cat_count = len(cats) - # Composite complexity: normalize each metric to 0-1 scale and average - # (raw values stored; composite calculated after we know max values) - drafts_data.append({ - "name": r["name"], - "title": r["title"], - "pages": pages, - "source": r["source"] or "ietf", - "author_count": ac, - "citation_count": cc, - "idea_count": ic, - "category_count": cat_count, - "categories": cats, - "novelty": r["novelty"], - "maturity": r["maturity"], - "overlap": r["overlap"], - "momentum": r["momentum"], - "relevance": r["relevance"], - "score": round(r["score"], 2), - }) - - # Compute composite complexity score (normalized 0-1 each, then averaged) - max_pages = 
max((d["pages"] for d in drafts_data if d["pages"] is not None), default=1) or 1 - max_authors = max((d["author_count"] for d in drafts_data), default=1) or 1 - max_citations = max((d["citation_count"] for d in drafts_data), default=1) or 1 - max_ideas = max((d["idea_count"] for d in drafts_data), default=1) or 1 - - for d in drafts_data: - p = (d["pages"] / max_pages) if d["pages"] is not None else 0.3 # default to median-ish - a = d["author_count"] / max_authors - c = d["citation_count"] / max_citations - i = d["idea_count"] / max_ideas - d["composite_complexity"] = round((p + a + c + i) / 4, 3) - - # Correlation matrix: complexity metrics vs rating dimensions - metrics = ["pages", "author_count", "citation_count", "idea_count", "category_count"] - dimensions = ["novelty", "maturity", "overlap", "momentum", "relevance"] - - def _pearson(xs: list[float], ys: list[float]) -> float: - """Compute Pearson correlation coefficient.""" - n = len(xs) - if n < 3: - return 0.0 - mean_x = sum(xs) / n - mean_y = sum(ys) / n - cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) - std_x = (sum((x - mean_x) ** 2 for x in xs)) ** 0.5 - std_y = (sum((y - mean_y) ** 2 for y in ys)) ** 0.5 - if std_x == 0 or std_y == 0: - return 0.0 - return round(cov / (std_x * std_y), 3) - - correlations: dict[str, dict[str, float]] = {} - for metric in metrics: - correlations[metric] = {} - for dim in dimensions: - if metric == "pages": - # Filter to drafts with pages data - pairs = [(d[metric], d[dim]) for d in drafts_data if d[metric] is not None] - else: - pairs = [(d[metric], d[dim]) for d in drafts_data] - if len(pairs) >= 3: - xs, ys = zip(*pairs) - correlations[metric][dim] = _pearson(list(xs), list(ys)) - else: - correlations[metric][dim] = 0.0 - - # Top 10 most complex - sorted_by_complexity = sorted(drafts_data, key=lambda d: d["composite_complexity"], reverse=True) - top_complex = sorted_by_complexity[:10] - - # Top 10 efficient: high score but low complexity - # Efficiency 
= score / (composite_complexity + 0.1) (avoid div by zero) - for d in drafts_data: - d["efficiency"] = round(d["score"] / (d["composite_complexity"] + 0.1), 2) - sorted_by_efficiency = sorted(drafts_data, key=lambda d: d["efficiency"], reverse=True) - top_efficient = sorted_by_efficiency[:10] - - # Stats - pages_vals = [d["pages"] for d in drafts_data if d["pages"] is not None] - avg_pages = round(sum(pages_vals) / len(pages_vals), 1) if pages_vals else 0 - avg_authors = round(sum(d["author_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 - avg_citations = round(sum(d["citation_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 - pages_coverage = round(total_with_pages / total_drafts * 100, 1) if total_drafts else 0 - - # Category complexity averages - cat_data: dict[str, list[dict]] = defaultdict(list) - for d in drafts_data: - for cat in d.get("categories", []): - cat_data[cat].append(d) - - category_complexity = [] - for cat, ds in sorted(cat_data.items(), key=lambda x: -len(x[1])): - p_vals = [d["pages"] for d in ds if d["pages"] is not None] - category_complexity.append({ - "category": cat, - "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, - "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), - "avg_citations": round(sum(d["citation_count"] for d in ds) / len(ds), 1), - "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), - "count": len(ds), - }) - - # Source complexity - source_data: dict[str, list[dict]] = defaultdict(list) - for d in drafts_data: - source_data[d["source"]].append(d) - - source_complexity = [] - for src, ds in sorted(source_data.items(), key=lambda x: -len(x[1])): - p_vals = [d["pages"] for d in ds if d["pages"] is not None] - source_complexity.append({ - "source": src, - "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, - "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), - "avg_citations": 
round(sum(d["citation_count"] for d in ds) / len(ds), 1), - "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), - "count": len(ds), - }) - - return { - "drafts": drafts_data, - "correlations": correlations, - "metrics": metrics, - "dimensions": dimensions, - "top_complex": top_complex, - "top_efficient": top_efficient, - "stats": { - "avg_pages": avg_pages, - "avg_authors": avg_authors, - "avg_citations": avg_citations, - "pages_coverage_pct": pages_coverage, - "total_drafts": total_drafts, - }, - "category_complexity": category_complexity, - "source_complexity": source_complexity, - } - - -# ── Additional Analysis Functions ──────────────────────────────────── - -def get_idea_analysis(db: Database) -> dict: - """Return comprehensive idea analysis data for the idea-analysis page. - - Includes novelty distribution, type breakdown with avg novelty, - top novel ideas, ideas-per-draft distribution, cross-tab of type x source, - shared ideas across drafts, and idea novelty vs draft rating correlation. 
- """ - from collections import Counter, defaultdict - from difflib import SequenceMatcher - - # Fetch raw data - all_ideas = db.conn.execute( - """SELECT i.id, i.draft_name, i.title, i.description, i.idea_type, - i.novelty_score - FROM ideas i ORDER BY i.novelty_score DESC NULLS LAST""" - ).fetchall() - all_ideas = [dict(r) for r in all_ideas] - - # Draft ratings lookup - ratings_rows = db.conn.execute( - """SELECT d.name, d.title as draft_title, d.source, - r.novelty AS r_novelty, r.maturity, r.overlap, r.momentum, r.relevance - FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name""" - ).fetchall() - draft_info = {} - for r in ratings_rows: - row = dict(r) - # Compute composite score (average of 5 dimensions) - dims = [row.get("r_novelty"), row.get("maturity"), row.get("overlap"), - row.get("momentum"), row.get("relevance")] - valid = [d for d in dims if d is not None] - row["composite_score"] = sum(valid) / len(valid) if valid else None - draft_info[row["name"]] = row - - total = len(all_ideas) - scored = [i for i in all_ideas if i.get("novelty_score") is not None] - unscored = total - len(scored) - avg_novelty = sum(i["novelty_score"] for i in scored) / len(scored) if scored else 0 - - # Embedding coverage - embed_count = db.conn.execute("SELECT COUNT(*) FROM idea_embeddings").fetchone()[0] - - # --- Novelty score distribution (histogram) --- - novelty_dist = Counter(i["novelty_score"] for i in scored) - novelty_histogram = { - "labels": [1, 2, 3, 4, 5], - "values": [novelty_dist.get(s, 0) for s in [1, 2, 3, 4, 5]], - } - - # --- Ideas by type with counts and avg novelty --- - type_data = defaultdict(lambda: {"count": 0, "novelty_sum": 0, "novelty_n": 0}) - for idea in all_ideas: - t = idea.get("idea_type") or "other" - type_data[t]["count"] += 1 - if idea.get("novelty_score") is not None: - type_data[t]["novelty_sum"] += idea["novelty_score"] - type_data[t]["novelty_n"] += 1 - - by_type = [] - for t, d in sorted(type_data.items(), key=lambda x: 
x[1]["count"], reverse=True): - avg = d["novelty_sum"] / d["novelty_n"] if d["novelty_n"] > 0 else 0 - by_type.append({"type": t, "count": d["count"], "avg_novelty": round(avg, 2)}) - - type_names = [t["type"] for t in by_type] - - # --- Top 20 most novel ideas (score 4-5) --- - top_novel = [] - for idea in all_ideas: - if idea.get("novelty_score") and idea["novelty_score"] >= 4: - di = draft_info.get(idea["draft_name"], {}) - top_novel.append({ - "title": idea["title"], - "description": idea["description"], - "type": idea.get("idea_type", "other"), - "novelty_score": idea["novelty_score"], - "draft_name": idea["draft_name"], - "draft_title": di.get("draft_title", ""), - "draft_score": di.get("composite_score"), - }) - top_novel.sort(key=lambda x: (x["novelty_score"], x.get("draft_score") or 0), reverse=True) - top_novel = top_novel[:20] - - # --- Ideas per draft distribution --- - ideas_per_draft = Counter(i["draft_name"] for i in all_ideas) - ipd_dist = Counter(ideas_per_draft.values()) - ideas_per_draft_hist = { - "labels": sorted(ipd_dist.keys()), - "values": [ipd_dist[k] for k in sorted(ipd_dist.keys())], - } - # Also top drafts by idea count - top_idea_drafts = [] - for name, count in ideas_per_draft.most_common(10): - di = draft_info.get(name, {}) - top_idea_drafts.append({ - "name": name, - "draft_title": di.get("draft_title", ""), - "idea_count": count, - "score": di.get("composite_score"), - }) - - # --- Cross-tabulation: idea_type x source --- - type_source = defaultdict(lambda: defaultdict(int)) - for idea in all_ideas: - t = idea.get("idea_type") or "other" - di = draft_info.get(idea["draft_name"], {}) - source = di.get("source", "ietf") or "ietf" - type_source[t][source] += 1 - - sources = sorted(set( - di.get("source", "ietf") or "ietf" for di in draft_info.values() - )) - cross_tab = [] - for t in type_names: - row = {"type": t} - for s in sources: - row[s] = type_source[t].get(s, 0) - cross_tab.append(row) - - # --- Shared ideas across drafts --- - 
idea_groups: list[dict] = [] - for idea in all_ideas: - title_lower = idea["title"].lower().strip() - matched = False - for group in idea_groups: - ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() - if ratio >= 0.75: - group["ideas"].append(idea) - group["drafts"].add(idea["draft_name"]) - matched = True - break - if not matched: - idea_groups.append({ - "canonical": title_lower, - "title": idea["title"], - "ideas": [idea], - "drafts": {idea["draft_name"]}, - }) - - shared_ideas = [] - for g in sorted(idea_groups, key=lambda x: len(x["drafts"]), reverse=True): - if len(g["drafts"]) < 2: - break - shared_ideas.append({ - "title": g["title"], - "appearances": len(g["drafts"]), - "drafts": sorted(g["drafts"])[:8], - "types": list(set(i.get("idea_type", "other") for i in g["ideas"])), - }) - - # --- Scatter: draft avg idea novelty vs draft relevance --- - draft_idea_novelty = defaultdict(list) - for idea in scored: - draft_idea_novelty[idea["draft_name"]].append(idea["novelty_score"]) - - scatter_data = [] - for name, scores in draft_idea_novelty.items(): - di = draft_info.get(name, {}) - if di.get("relevance") is not None and di.get("composite_score") is not None: - scatter_data.append({ - "name": name, - "avg_idea_novelty": round(sum(scores) / len(scores), 2), - "relevance": di["relevance"], - "score": di["composite_score"], - "idea_count": len(scores), - "source": di.get("source", "ietf") or "ietf", - }) - - # --- Sunburst data: type -> novelty band --- - sunburst_labels = [] - sunburst_parents = [] - sunburst_values = [] - # Root - sunburst_labels.append("All Ideas") - sunburst_parents.append("") - sunburst_values.append(total) - - novelty_bands = {"High (4-5)": lambda s: s is not None and s >= 4, - "Medium (3)": lambda s: s is not None and s == 3, - "Low (1-2)": lambda s: s is not None and s <= 2, - "Unscored": lambda s: s is None} - - for t_info in by_type: - t = t_info["type"] - sunburst_labels.append(t) - sunburst_parents.append("All 
Ideas") - sunburst_values.append(t_info["count"]) - # Sub-bands - type_ideas = [i for i in all_ideas if (i.get("idea_type") or "other") == t] - for band, fn in novelty_bands.items(): - cnt = sum(1 for i in type_ideas if fn(i.get("novelty_score"))) - if cnt > 0: - sunburst_labels.append(f"{t} - {band}") - sunburst_parents.append(t) - sunburst_values.append(cnt) - - return { - "total": total, - "scored": len(scored), - "unscored": unscored, - "avg_novelty": round(avg_novelty, 2), - "embed_count": embed_count, - "embed_pct": round(embed_count / total * 100, 1) if total > 0 else 0, - "type_count": len(by_type), - "novelty_histogram": novelty_histogram, - "by_type": by_type, - "top_novel": top_novel, - "ideas_per_draft_hist": ideas_per_draft_hist, - "top_idea_drafts": top_idea_drafts, - "cross_tab": cross_tab, - "sources": sources, - "shared_ideas": shared_ideas, - "scatter_data": scatter_data, - "sunburst": { - "labels": sunburst_labels, - "parents": sunburst_parents, - "values": sunburst_values, - }, - } - - - - -def get_source_comparison(db: Database) -> dict: - """Cross-source comparison: ratings, categories, counts by standards body.""" - pairs_all = db.drafts_with_ratings(limit=2000) - # Also include false positives for completeness of source counts - pairs_fp = db.drafts_with_ratings(limit=2000, include_false_positives=True) - - # Build per-source data - source_stats: dict[str, dict] = {} - source_categories: dict[str, Counter] = defaultdict(Counter) - source_ratings: dict[str, dict[str, list]] = defaultdict(lambda: { - "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [], "scores": [], - }) - # Collect author counts per source - all_authors_by_source: dict[str, set] = defaultdict(set) - - for draft, rating in pairs_all: - src = getattr(draft, "source", "ietf") or "ietf" - source_ratings[src]["novelty"].append(rating.novelty) - source_ratings[src]["maturity"].append(rating.maturity) - source_ratings[src]["overlap"].append(rating.overlap) 
- source_ratings[src]["momentum"].append(rating.momentum) - source_ratings[src]["relevance"].append(rating.relevance) - source_ratings[src]["scores"].append(round(rating.composite_score, 2)) - for cat in rating.categories: - source_categories[src][cat] += 1 - - # Get all drafts (including unrated) for draft counts - all_drafts = db.list_drafts(limit=5000) - source_draft_counts: Counter = Counter() - for d in all_drafts: - src = getattr(d, "source", "ietf") or "ietf" - source_draft_counts[src] += 1 - - # Author counts by source - try: - rows = db.conn.execute( - """SELECT d.source, COUNT(DISTINCT da.person_id) as author_count - FROM drafts d - JOIN draft_authors da ON d.name = da.draft_name - GROUP BY d.source""" - ).fetchall() - for r in rows: - src = r["source"] or "ietf" - all_authors_by_source[src] = r["author_count"] - except Exception: - pass - - # Idea counts by source - source_idea_counts: Counter = Counter() - try: - rows = db.conn.execute( - """SELECT d.source, COUNT(*) as idea_count - FROM ideas i - JOIN drafts d ON i.draft_name = d.name - GROUP BY d.source""" - ).fetchall() - for r in rows: - src = r["source"] or "ietf" - source_idea_counts[src] = r["idea_count"] - except Exception: - pass - - # Build summary table - all_sources = sorted(set(source_draft_counts.keys()) | set(source_ratings.keys())) - summary = [] - for src in all_sources: - rats = source_ratings.get(src, {"scores": []}) - cats = source_categories.get(src, Counter()) - top_cat = cats.most_common(1)[0][0] if cats else "N/A" - avg_score = round(sum(rats["scores"]) / len(rats["scores"]), 2) if rats["scores"] else 0.0 - summary.append({ - "source": src, - "drafts": source_draft_counts.get(src, 0), - "rated": len(rats["scores"]), - "authors": all_authors_by_source.get(src, 0), - "ideas": source_idea_counts.get(src, 0), - "avg_score": avg_score, - "top_category": top_cat, - }) - - # Radar data: average of each dimension per source - radar = {} - for src, rats in source_ratings.items(): - if not 
rats["scores"]: - continue - n = len(rats["scores"]) - radar[src] = { - "novelty": round(sum(rats["novelty"]) / n, 2), - "maturity": round(sum(rats["maturity"]) / n, 2), - "overlap": round(sum(rats["overlap"]) / n, 2), - "momentum": round(sum(rats["momentum"]) / n, 2), - "relevance": round(sum(rats["relevance"]) / n, 2), - "count": n, - } - - # Category distribution by source (for stacked bar / heatmap) - all_cats = sorted({cat for cats in source_categories.values() for cat in cats}) - heatmap = { - "sources": list(source_categories.keys()), - "categories": all_cats, - "values": [], - } - for src in heatmap["sources"]: - row = [source_categories[src].get(cat, 0) for cat in all_cats] - heatmap["values"].append(row) - - # Unique/shared categories analysis - source_cat_sets = {src: set(cats.keys()) for src, cats in source_categories.items()} - unique_cats = {} - for src, cats in source_cat_sets.items(): - others = set() - for s2, c2 in source_cat_sets.items(): - if s2 != src: - others |= c2 - unique_cats[src] = sorted(cats - others) - - shared_cats = set() - for src, cats in source_cat_sets.items(): - for s2, c2 in source_cat_sets.items(): - if s2 != src: - shared_cats |= (cats & c2) - shared_cats = sorted(shared_cats) - - return { - "summary": summary, - "radar": radar, - "heatmap": heatmap, - "unique_categories": unique_cats, - "shared_categories": shared_cats, - } - - -def get_false_positive_profile(db: Database) -> dict: - """Profile drafts flagged as false positives.""" - # Get false positives - fp_rows = db.false_positive_drafts_raw() - - # Get non-FP rated drafts for comparison - nonfp_rows = db.non_false_positive_ratings_raw() - - total_rated = db.rated_count() - total_drafts = db.count_drafts(include_false_positives=True) - - # Build FP list - fp_list = [] - fp_categories: Counter = Counter() - fp_sources: Counter = Counter() - fp_dims = {"novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": []} - - for row in fp_rows: - cats = 
json.loads(row["r_categories"]) if row["r_categories"] else [] - src = row["source"] or "ietf" - fp_list.append({ - "name": row["name"], - "title": row["title"], - "source": src, - "categories": cats, - "relevance": row["relevance"], - "novelty": row["novelty"], - "maturity": row["maturity"], - "overlap": row["overlap"], - "momentum": row["momentum"], - "summary": row["summary"] or "", - }) - for cat in cats: - fp_categories[cat] += 1 - fp_sources[src] += 1 - fp_dims["novelty"].append(row["novelty"]) - fp_dims["maturity"].append(row["maturity"]) - fp_dims["overlap"].append(row["overlap"]) - fp_dims["momentum"].append(row["momentum"]) - fp_dims["relevance"].append(row["relevance"]) - - # Non-FP dimensions for comparison - nonfp_dims = {"novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": []} - nonfp_categories: Counter = Counter() - for row in nonfp_rows: - nonfp_dims["novelty"].append(row["novelty"]) - nonfp_dims["maturity"].append(row["maturity"]) - nonfp_dims["overlap"].append(row["overlap"]) - nonfp_dims["momentum"].append(row["momentum"]) - nonfp_dims["relevance"].append(row["relevance"]) - cats = json.loads(row["r_categories"]) if row["r_categories"] else [] - for cat in cats: - nonfp_categories[cat] += 1 - - # Top terms from FP abstracts - from collections import Counter as _Counter - stop_words = { - "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", - "of", "with", "by", "from", "is", "it", "that", "this", "are", "was", - "be", "as", "can", "may", "will", "not", "has", "have", "been", "which", - "their", "its", "also", "such", "these", "would", "should", "could", - "more", "other", "than", "into", "about", "between", "over", "after", - "all", "one", "two", "new", "they", "we", "our", "each", "some", "any", - "there", "what", "when", "how", "where", "who", "does", "do", "did", - "no", "if", "so", "up", "out", "only", "used", "using", "use", "based", - "through", "both", "well", "within", "must", "while", "had", "were", 
- } - word_counter: Counter = Counter() - for row in fp_rows: - abstract = (row["abstract"] or "").lower() - title = (row["title"] or "").lower() - text = abstract + " " + title - words = re.findall(r'[a-z]{3,}', text) - for w in words: - if w not in stop_words: - word_counter[w] += 1 - top_terms = word_counter.most_common(30) - - return { - "count": len(fp_list), - "total_rated": total_rated, - "total_drafts": total_drafts, - "pct_of_total": round(100 * len(fp_list) / total_drafts, 1) if total_drafts else 0, - "pct_of_rated": round(100 * len(fp_list) / total_rated, 1) if total_rated else 0, - "fp_list": fp_list, - "fp_categories": dict(fp_categories.most_common()), - "fp_sources": dict(fp_sources.most_common()), - "fp_dims": fp_dims, - "nonfp_dims": nonfp_dims, - "top_terms": top_terms, - "nonfp_categories": dict(nonfp_categories.most_common(20)), - } - - -def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: - """Search-only (free) — returns sources + cached answer if available.""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.search_only(question, top_k=top_k) - - -def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: - """Run Claude synthesis (costs tokens, result is cached permanently).""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.ask(question, top_k=top_k, cheap=cheap) - - -def get_citation_influence(db: Database) -> dict: - """Return citation influence analysis data (cached for 5 min).""" - return _cached("citation_influence", lambda: _compute_citation_influence(db)) - - -def _compute_citation_influence(db: Database) -> dict: - """Compute citation influence metrics from the draft_refs table. 
- - Returns dict with: - - top_cited_rfcs: top 20 most-cited RFCs with citation counts and citing drafts - - top_citing_drafts: top 20 drafts that cite the most references - - citations_by_category: average citations per category - - stats: total citations, unique RFCs, avg refs per draft - - draft_network: draft-to-draft citation edges for visualization - """ - # Get all references - rows = db.conn.execute( - "SELECT draft_name, ref_type, ref_id FROM draft_refs" - ).fetchall() - - # Get draft titles and categories - draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() - draft_titles = {r["name"]: r["title"] for r in draft_rows} - - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() - draft_cats: dict[str, str] = {} - for r in rating_rows: - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - draft_cats[r["draft_name"]] = cats[0] if cats else "Other" - except Exception: - draft_cats[r["draft_name"]] = "Other" - - # Well-known RFC names - rfc_names = { - "2119": "Key words (MUST/SHALL/MAY)", "8174": "Key words update", - "8259": "JSON", "7519": "JWT", "6749": "OAuth 2.0", - "7540": "HTTP/2", "9110": "HTTP Semantics", "7525": "TLS Recommendations", - "8446": "TLS 1.3", "3986": "URIs", "7230": "HTTP/1.1 Syntax", - "7231": "HTTP/1.1 Semantics", "8288": "Web Linking", "6125": "TLS Server Identity", - "7515": "JWS", "7516": "JWE", "7517": "JWK", "7518": "JWA", - "9449": "DPoP", "6750": "OAuth Bearer", "8725": "JWT Best Practices", - "9396": "Rich Authorization Requests", "9101": "JAR", - "8414": "OAuth Server Metadata", "7591": "Dynamic Client Registration", - "8705": "mTLS for OAuth", "9068": "JWT Access Tokens", - "6819": "OAuth Threat Model", "9200": "ACE-OAuth", "9052": "COSE", - "8392": "CWT", "7252": "CoAP", - } - - # In-degree: how many times each RFC is cited - rfc_citations: dict[str, list[str]] = defaultdict(list) - draft_out_count: dict[str, int] = Counter() - draft_to_draft_edges = [] 
- total_citations = 0 - - for r in rows: - draft_name = r["draft_name"] - ref_type = r["ref_type"] - ref_id = r["ref_id"] - total_citations += 1 - draft_out_count[draft_name] += 1 - - if ref_type == "rfc": - rfc_citations[ref_id].append(draft_name) - elif ref_type == "draft": - draft_to_draft_edges.append({ - "source": draft_name, - "target": ref_id, - "source_title": draft_titles.get(draft_name, draft_name), - "target_title": draft_titles.get(ref_id, ref_id), - }) - - # Top 20 most-cited RFCs - rfc_sorted = sorted(rfc_citations.items(), key=lambda x: len(x[1]), reverse=True) - top_cited_rfcs = [] - for ref_id, citing_drafts in rfc_sorted[:20]: - top_cited_rfcs.append({ - "rfc_id": ref_id, - "name": rfc_names.get(ref_id, ""), - "count": len(citing_drafts), - "drafts": citing_drafts[:10], # Limit to first 10 for display - "total_drafts": len(citing_drafts), - }) - - # Top 20 most-citing drafts (out-degree) - draft_sorted = sorted(draft_out_count.items(), key=lambda x: x[1], reverse=True) - top_citing_drafts = [] - for draft_name, count in draft_sorted[:20]: - top_citing_drafts.append({ - "name": draft_name, - "title": draft_titles.get(draft_name, draft_name), - "count": count, - "category": draft_cats.get(draft_name, "Other"), - }) - - # Citation density by category - cat_totals: dict[str, int] = Counter() - cat_counts: dict[str, int] = Counter() - for draft_name, count in draft_out_count.items(): - cat = draft_cats.get(draft_name, "Other") - cat_totals[cat] += count - cat_counts[cat] += 1 - - citations_by_category = [] - for cat in sorted(cat_totals.keys()): - avg = cat_totals[cat] / cat_counts[cat] if cat_counts[cat] > 0 else 0 - citations_by_category.append({ - "category": cat, - "total_citations": cat_totals[cat], - "draft_count": cat_counts[cat], - "avg_citations": round(avg, 1), - }) - citations_by_category.sort(key=lambda x: x["avg_citations"], reverse=True) - - # PageRank-style influence: drafts that cite highly-cited RFCs - # Simple approximation: sum of (1 
/ citation_count) for each RFC cited - rfc_influence = {rid: len(drafts) for rid, drafts in rfc_citations.items()} - draft_pagerank: dict[str, float] = Counter() - for r in rows: - if r["ref_type"] == "rfc" and r["ref_id"] in rfc_influence: - # Higher score for citing highly-cited RFCs - draft_pagerank[r["draft_name"]] += rfc_influence[r["ref_id"]] - - pagerank_sorted = sorted(draft_pagerank.items(), key=lambda x: x[1], reverse=True) - top_pagerank = [] - for draft_name, score in pagerank_sorted[:20]: - top_pagerank.append({ - "name": draft_name, - "title": draft_titles.get(draft_name, draft_name), - "score": round(score, 1), - "category": draft_cats.get(draft_name, "Other"), - "out_degree": draft_out_count.get(draft_name, 0), - }) - - # Stats - unique_rfcs = len(rfc_citations) - drafts_with_refs = len(draft_out_count) - avg_refs = total_citations / drafts_with_refs if drafts_with_refs > 0 else 0 - - return { - "top_cited_rfcs": top_cited_rfcs, - "top_citing_drafts": top_citing_drafts, - "top_pagerank": top_pagerank, - "citations_by_category": citations_by_category, - "draft_network": draft_to_draft_edges[:200], # Limit for perf - "stats": { - "total_citations": total_citations, - "unique_rfcs": unique_rfcs, - "drafts_with_refs": drafts_with_refs, - "avg_refs_per_draft": round(avg_refs, 1), - }, - } - - -def get_bcp_analysis(db: Database) -> dict: - """Return BCP dependency analysis data (cached for 5 min).""" - return _cached("bcp_analysis", lambda: _compute_bcp_analysis(db)) - - -def _compute_bcp_analysis(db: Database) -> dict: - """Compute BCP dependency analysis. 
- - Returns dict with: - - bcps: all BCPs with citation counts and citing drafts - - co_citation: which BCPs tend to be co-cited - - by_category: BCP citation patterns by category - - coverage: what % of drafts cite at least one BCP - """ - # Get all BCP references - bcp_rows = db.conn.execute( - "SELECT draft_name, ref_id FROM draft_refs WHERE ref_type = 'bcp'" - ).fetchall() - - # Get draft titles and categories - draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() - draft_titles = {r["name"]: r["title"] for r in draft_rows} - total_drafts = len(draft_titles) - - rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() - draft_cats: dict[str, str] = {} - for r in rating_rows: - try: - cats = json.loads(r["categories"]) if r["categories"] else [] - draft_cats[r["draft_name"]] = cats[0] if cats else "Other" - except Exception: - draft_cats[r["draft_name"]] = "Other" - - # BCP citation counts - bcp_citations: dict[str, list[str]] = defaultdict(list) - draft_bcps: dict[str, list[str]] = defaultdict(list) - - for r in bcp_rows: - bcp_citations[r["ref_id"]].append(r["draft_name"]) - draft_bcps[r["draft_name"]].append(r["ref_id"]) - - # All BCPs with counts - bcps = [] - for bcp_id, citing_drafts in sorted(bcp_citations.items(), - key=lambda x: len(x[1]), reverse=True): - bcps.append({ - "bcp_id": bcp_id, - "count": len(citing_drafts), - "drafts": citing_drafts[:10], - "total_drafts": len(citing_drafts), - }) - - # Co-citation matrix: which BCPs appear together in the same draft - bcp_ids = sorted(bcp_citations.keys()) - co_citation = [] - for i, bcp_a in enumerate(bcp_ids): - drafts_a = set(bcp_citations[bcp_a]) - for j, bcp_b in enumerate(bcp_ids): - if j <= i: - continue - drafts_b = set(bcp_citations[bcp_b]) - shared = len(drafts_a & drafts_b) - if shared > 0: - co_citation.append({ - "bcp_a": bcp_a, - "bcp_b": bcp_b, - "count": shared, - }) - - # Heatmap data: full matrix for all BCPs (top 20 by citation count) - 
top_bcp_ids = [b["bcp_id"] for b in bcps[:20]] - heatmap_matrix = [] - for bcp_a in top_bcp_ids: - row = [] - drafts_a = set(bcp_citations.get(bcp_a, [])) - for bcp_b in top_bcp_ids: - drafts_b = set(bcp_citations.get(bcp_b, [])) - shared = len(drafts_a & drafts_b) - row.append(shared) - heatmap_matrix.append(row) - - # BCP citations by category - cat_bcp_count: dict[str, Counter] = defaultdict(Counter) - for draft_name, bcp_list in draft_bcps.items(): - cat = draft_cats.get(draft_name, "Other") - for bcp_id in bcp_list: - cat_bcp_count[cat][bcp_id] += 1 - - by_category = [] - for cat in sorted(cat_bcp_count.keys()): - top_bcps = cat_bcp_count[cat].most_common(5) - by_category.append({ - "category": cat, - "total_bcp_refs": sum(cat_bcp_count[cat].values()), - "unique_bcps": len(cat_bcp_count[cat]), - "top_bcps": [{"bcp_id": bid, "count": c} for bid, c in top_bcps], - }) - by_category.sort(key=lambda x: x["total_bcp_refs"], reverse=True) - - # Coverage - drafts_with_bcp = len(draft_bcps) - coverage_pct = (drafts_with_bcp / total_drafts * 100) if total_drafts > 0 else 0 - - return { - "bcps": bcps, - "co_citation": co_citation, - "heatmap_labels": top_bcp_ids, - "heatmap_matrix": heatmap_matrix, - "by_category": by_category, - "coverage": { - "total_drafts": total_drafts, - "drafts_with_bcp": drafts_with_bcp, - "coverage_pct": round(coverage_pct, 1), - "unique_bcps": len(bcp_citations), - "total_bcp_refs": len(bcp_rows), - }, - } - - -def global_search(db: Database, query: str) -> SearchResults: - """Search across drafts (FTS5), ideas, authors, and gaps. - - Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}. - """ - results: dict = {"drafts": [], "ideas": [], "authors": [], "gaps": []} - if not query or not query.strip(): - return results - - q = query.strip() - - # 1. 
Drafts via FTS5 - try: - fts_query = re.sub(r'[^\w\s]', '', q) - fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE) - fts_query = re.sub(r'\s+', ' ', fts_query).strip() - if not fts_query: - raise ValueError("empty query after sanitization") - rows = db.conn.execute( - """SELECT d.name, d.title, d.abstract, d.time, d."group" - FROM drafts d - JOIN drafts_fts f ON d.rowid = f.rowid - WHERE drafts_fts MATCH ? - ORDER BY rank - LIMIT 50""", - (fts_query,), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - except Exception: - # FTS5 match can fail on certain query syntax; fall back to LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT name, title, abstract, time, "group" FROM drafts - WHERE title LIKE ? OR name LIKE ? OR abstract LIKE ? - LIMIT 50""", - (like, like, like), - ).fetchall() - for r in rows: - results["drafts"].append({ - "name": r["name"], - "title": r["title"], - "abstract": (r["abstract"] or "")[:200], - "date": r["time"], - "group": r["group"] or "individual", - }) - - # 2. Ideas via LIKE - like = f"%{q}%" - rows = db.conn.execute( - """SELECT id, title, description, idea_type, draft_name FROM ideas - WHERE title LIKE ? OR description LIKE ? - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["ideas"].append({ - "id": r["id"], - "title": r["title"], - "description": (r["description"] or "")[:200], - "type": r["idea_type"], - "draft_name": r["draft_name"], - }) - - # 3. Authors via LIKE - results["authors"] = db.search_authors(q, limit=50) - - # 4. 
Gaps via LIKE - results["gaps"] = db.search_gaps(q, limit=50) - - return results - - -def get_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE (cached for 5 min).""" - return _cached("landscape_tsne", lambda: _compute_landscape_tsne(db)) - - -def _compute_landscape_tsne(db: Database) -> list[dict]: - """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" - - - embeddings = db.all_embeddings() - if len(embeddings) < 5: - return [] - - pairs = db.drafts_with_ratings(limit=1000) - rating_map = {d.name: r for d, r in pairs} - draft_map = {d.name: d for d, _ in pairs} - - # Filter to drafts that have both embeddings and ratings - names = [n for n in embeddings if n in rating_map] - if len(names) < 5: - return [] - - matrix = np.array([embeddings[n] for n in names]) - - try: - tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), - random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix) - except Exception: - return [] - - result = [] - for i, name in enumerate(names): - r = rating_map[name] - d = draft_map.get(name) - result.append({ - "name": name, - "title": d.title if d else name, - "x": round(float(coords[i, 0]), 3), - "y": round(float(coords[i, 1]), 3), - "category": r.categories[0] if r.categories else "Other", - "score": round(r.composite_score, 2), - }) - return result - - -def get_comparison_data(db: Database, names: list[str]) -> dict | None: - """Get comparison data for a list of drafts. 
- - Returns { - drafts: [{name, title, abstract, rating, ideas, refs, ...}], - shared_ideas: [{title, drafts: [name,...]}], - unique_ideas: {name: [{title, description}]}, - shared_refs: [{type, id, drafts: [name,...]}], - unique_refs: {name: [{type, id}]}, - similarities: [{a, b, similarity}], - comparison_text: str | None, - } - """ - - - drafts_data = [] - all_ideas: dict[str, list[dict]] = {} - all_refs: dict[str, list[tuple[str, str]]] = {} - - for name in names: - detail = get_draft_detail(db, name) - if not detail: - continue - drafts_data.append(detail) - all_ideas[name] = detail.get("ideas", []) - all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])] - - if len(drafts_data) < 2: - return None - - # Find shared vs unique ideas (by title similarity) - idea_title_drafts: dict[str, list[str]] = {} - for name, ideas in all_ideas.items(): - for idea in ideas: - title_lower = idea["title"].lower().strip() - if title_lower not in idea_title_drafts: - idea_title_drafts[title_lower] = [] - idea_title_drafts[title_lower].append(name) - - shared_ideas = [ - {"title": title, "drafts": draft_list} - for title, draft_list in idea_title_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_ideas: dict[str, list[dict]] = {} - for name, ideas in all_ideas.items(): - unique = [] - for idea in ideas: - title_lower = idea["title"].lower().strip() - if len(set(idea_title_drafts.get(title_lower, []))) <= 1: - unique.append({"title": idea["title"], "description": idea.get("description", "")}) - unique_ideas[name] = unique - - # Find shared vs unique references - ref_drafts: dict[tuple[str, str], list[str]] = {} - for name, refs in all_refs.items(): - for ref in refs: - if ref not in ref_drafts: - ref_drafts[ref] = [] - ref_drafts[ref].append(name) - - shared_refs = [ - {"type": ref[0], "id": ref[1], "drafts": draft_list} - for ref, draft_list in ref_drafts.items() - if len(set(draft_list)) > 1 - ] - unique_refs: dict[str, list[dict]] = {} - for name, refs in 
all_refs.items(): - unique = [] - for ref in refs: - if len(set(ref_drafts.get(ref, []))) <= 1: - unique.append({"type": ref[0], "id": ref[1]}) - unique_refs[name] = unique - - # Pairwise embedding similarities - embeddings = db.all_embeddings() - similarities = [] - valid_names = [d["name"] for d in drafts_data] - for i in range(len(valid_names)): - for j in range(i + 1, len(valid_names)): - a, b = valid_names[i], valid_names[j] - if a in embeddings and b in embeddings: - vec_a = embeddings[a] - vec_b = embeddings[b] - dot = np.dot(vec_a, vec_b) - norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) - sim = float(dot / norm) if norm > 0 else 0.0 - similarities.append({"a": a, "b": b, "similarity": round(sim, 4)}) - - return { - "drafts": drafts_data, - "shared_ideas": shared_ideas, - "unique_ideas": unique_ideas, - "shared_refs": shared_refs, - "unique_refs": unique_refs, - "similarities": similarities, - "comparison_text": None, - } - - -def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: - """Search-only (free) — returns sources + cached answer if available.""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.search_only(question, top_k=top_k) - - -def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: - """Run Claude synthesis (costs tokens, result is cached permanently).""" - config = Config.load() - searcher = HybridSearch(config, db) - return searcher.ask(question, top_k=top_k, cheap=cheap) - - -# --- Proposals --- - -def get_all_proposals(db: Database) -> list[dict]: - """Return all proposals with linked gap info.""" - proposals = db.all_proposals() - gaps = {g["id"]: g for g in db.all_gaps()} - for p in proposals: - p["gaps"] = [gaps[gid] for gid in p.get("gap_ids", []) if gid in gaps] - return proposals - - -def get_proposal_detail(db: Database, proposal_id: int) -> dict | None: - """Return a single proposal with full gap details.""" - p = 
db.get_proposal(proposal_id) - if not p: - return None - gaps = {g["id"]: g for g in db.all_gaps()} - p["gaps"] = [gaps[gid] for gid in p.get("gap_ids", []) if gid in gaps] - return p - - -def get_proposals_for_gap(db: Database, gap_id: int) -> list[dict]: - """Return proposals linked to a specific gap.""" - return db.get_proposals_for_gap(gap_id) diff --git a/src/webui/data/__init__.py b/src/webui/data/__init__.py new file mode 100644 index 0000000..8aac5b3 --- /dev/null +++ b/src/webui/data/__init__.py @@ -0,0 +1,97 @@ +"""Data access layer for the web dashboard. + +Thin wrapper around ietf_analyzer.db.Database that returns plain dicts +ready for JSON serialization or Jinja2 template rendering. + +All public functions are re-exported here for backward compatibility: + from webui.data import get_overview_stats +""" +from __future__ import annotations + +# Shared utilities +from webui.data._shared import get_db, _cached, _extract_month # noqa: F401 + +# Drafts +from webui.data.drafts import ( # noqa: F401 + OverviewStats, + DraftListItem, + DraftsPage, + get_overview_stats, + get_category_counts, + get_category_summary, + get_drafts_page, + get_draft_detail, + get_generated_drafts, + read_generated_draft, +) + +# Authors +from webui.data.authors import ( # noqa: F401 + AuthorInfo, + AuthorNetworkNode, + AuthorNetworkEdge, + AuthorCluster, + AuthorNetwork, + get_top_authors, + get_org_data, + get_coauthor_network, + get_cross_org_data, + get_author_network_full, +) + +# Ratings +from webui.data.ratings import ( # noqa: F401 + get_rating_distributions, + get_category_radar_data, + get_score_histogram, + get_false_positive_profile, +) + +# Gaps +from webui.data.gaps import ( # noqa: F401 + get_all_gaps, + get_gap_detail, +) + +# Analysis & Visualization +from webui.data.analysis import ( # noqa: F401 + TimelineData, + SimilarityGraphStats, + SimilarityGraph, + CitationGraphStats, + CitationGraph, + MonitorCost, + MonitorPipeline, + MonitorStatus, + get_ideas_by_type, 
+ get_timeline_data, + get_similarity_graph, + get_idea_clusters, + get_timeline_animation_data, + get_monitor_status, + get_citation_graph, + get_landscape_tsne, + get_comparison_data, + get_architecture, + get_idea_analysis, + get_trends_data, + get_complexity_data, + get_source_comparison, + get_citation_influence, + get_bcp_analysis, +) + +# Search +from webui.data.search import ( # noqa: F401 + SearchResults, + global_search, + get_ask_search, + get_ask_synthesize, +) + +# Proposals +from webui.data.proposals import ( # noqa: F401 + get_all_proposals, + get_proposal_detail, + get_proposals_for_gap, +) diff --git a/src/webui/data/_shared.py b/src/webui/data/_shared.py new file mode 100644 index 0000000..9ab8e57 --- /dev/null +++ b/src/webui/data/_shared.py @@ -0,0 +1,46 @@ +"""Shared utilities for webui data modules.""" +from __future__ import annotations + +import sys +import time +from pathlib import Path + +# Ensure project src is on path +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root / "src")) + +from ietf_analyzer.config import Config +from ietf_analyzer.db import Database +from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch + +# Simple TTL cache for expensive computations (t-SNE, clustering, similarity) +_cache: dict[str, tuple[float, object]] = {} +_CACHE_TTL = 300 # 5 minutes + + +def _extract_month(time_str: str | None) -> str: + """Normalize a date string to YYYY-MM format.""" + if not time_str: + return "unknown" + if len(time_str) >= 7 and time_str[4] == '-': + return time_str[:7] # Already YYYY-MM-DD + if len(time_str) >= 6 and time_str[:4].isdigit(): + return time_str[:4] + '-' + time_str[4:6] # YYYYMMDD → YYYY-MM + return time_str[:7] + +def _cached(key: str, fn, ttl: float = _CACHE_TTL): + """Return cached result or compute and cache it.""" + now = time.monotonic() + if key in _cache: + ts, val = _cache[key] + if now - 
ts < ttl: + return val + val = fn() + _cache[key] = (now, val) + return val + +def get_db() -> Database: + """Get a Database instance using default config.""" + config = Config.load() + return Database(config) diff --git a/src/webui/data/analysis.py b/src/webui/data/analysis.py new file mode 100644 index 0000000..6c962b2 --- /dev/null +++ b/src/webui/data/analysis.py @@ -0,0 +1,1968 @@ +"""Analysis, visualization, and complex computation data access functions.""" +from __future__ import annotations + +import json +import re +from collections import Counter, defaultdict +from typing import TypedDict + +import numpy as np +from sklearn.cluster import AgglomerativeClustering +from sklearn.manifold import TSNE +from sklearn.preprocessing import normalize as sk_normalize + +from ietf_analyzer.config import Config +from ietf_analyzer.db import Database +from webui.data._shared import _cached, _extract_month +from webui.data.drafts import get_draft_detail + +_ARCH_LAYERS = [ + {"id": "transport", "label": "Transport & Networking", "order": 0, + "keywords": {"transport", "network", "routing", "tunnel", "packet", "flow", "traffic", "qos", "sdwan", "mpls", "bgp", "ospf", "segment", "srv6", "quic", "http", "grpc", "mqtt", "yang", "snmp", "netconf", "restconf"}}, + {"id": "identity", "label": "Identity & Trust", "order": 1, + "keywords": {"identity", "auth", "authentication", "authorization", "credential", "certificate", "trust", "attestation", "oauth", "token", "signing", "verification", "verifiable", "did", "vc", "pki", "spiffe", "acl"}}, + {"id": "discovery", "label": "Discovery & Registration", "order": 2, + "keywords": {"discovery", "registration", "registry", "catalog", "advertisement", "announce", "capability", "service", "lookup", "resolution", "dns", "directory"}}, + {"id": "communication", "label": "Agent Communication", "order": 3, + "keywords": {"a2a", "agent", "communication", "message", "messaging", "protocol", "exchange", "negotiation", "handshake", "session", 
"dialogue", "interaction", "mcp", "interop"}}, + {"id": "coordination", "label": "Task & Coordination", "order": 4, + "keywords": {"task", "delegation", "orchestration", "workflow", "planning", "coordination", "consensus", "collaboration", "multi-agent", "swarm", "composition", "scheduling"}}, + {"id": "intelligence", "label": "AI & Inference", "order": 5, + "keywords": {"model", "inference", "learning", "training", "ml", "neural", "llm", "embedding", "reasoning", "decision", "prediction", "classification", "generative", "rag", "fine-tuning"}}, + {"id": "safety", "label": "Safety & Governance", "order": 6, + "keywords": {"safety", "ethical", "governance", "policy", "audit", "explainability", "transparency", "accountability", "bias", "fairness", "compliance", "regulation", "risk", "shutdown", "alignment", "adversarial", "privacy", "consent"}}, + {"id": "application", "label": "Application Domains", "order": 7, + "keywords": {"healthcare", "autonomous", "vehicle", "robotics", "iot", "digital twin", "supply chain", "finance", "manufacturing", "energy", "smart", "edge", "cloud", "sensing"}}, +] + +_LAYER_KEYWORDS = {l["id"]: l["keywords"] for l in _ARCH_LAYERS} + + +class TimelineData(TypedDict): + """Monthly category counts from :func:`get_timeline_data`.""" + months: list[str] + series: dict[str, list[int]] + categories: list[str] + +class SimilarityGraphStats(TypedDict): + """Stats sub-dict in similarity graph.""" + node_count: int + edge_count: int + avg_similarity: float + +class SimilarityGraph(TypedDict): + """Draft similarity network from :func:`get_similarity_graph`.""" + nodes: list[dict] + edges: list[dict] + stats: SimilarityGraphStats + +class CitationGraphStats(TypedDict): + """Stats sub-dict in citation graph.""" + node_count: int + edge_count: int + rfc_count: int + draft_count: int + +class CitationGraph(TypedDict): + """Citation network from :func:`get_citation_graph`.""" + nodes: list[dict] + edges: list[dict] + stats: CitationGraphStats + +class 
MonitorCost(TypedDict): + """Cost sub-dict in monitor status.""" + input_tokens: int + output_tokens: int + estimated_usd: float + +class MonitorPipeline(TypedDict): + """Pipeline sub-dict in monitor status.""" + total_drafts: int + rated: int + embedded: int + with_ideas: int + idea_total: int + gap_count: int + +class MonitorStatus(TypedDict): + """Monitor status from :func:`get_monitor_status`.""" + last_run: dict | None + runs: list[dict] + unprocessed: dict[str, int] + total_runs: int + pipeline: MonitorPipeline + cost: MonitorCost + +def get_ideas_by_type(db: Database) -> dict: + """Return ideas grouped by type with counts.""" + all_ideas = db.all_ideas() + type_counts = Counter(i.get("type", "other") or "other" for i in all_ideas) + return { + "total": len(all_ideas), + "by_type": dict(type_counts.most_common()), + "ideas": all_ideas, + } + +def get_timeline_data(db: Database) -> TimelineData: + """Return monthly counts by category for timeline chart.""" + pairs = db.drafts_with_ratings(limit=1000) + all_drafts = db.list_drafts(limit=1000, order_by="time ASC") + rating_map = {d.name: r for d, r in pairs} + + month_cat: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + for d in all_drafts: + month = _extract_month(d.time) + r = rating_map.get(d.name) + if r: + cat = r.categories[0] if r.categories else "Other" + month_cat[month][cat] += 1 + + months = sorted(month_cat.keys()) + cat_totals: Counter = Counter() + for mc in month_cat.values(): + for c, cnt in mc.items(): + cat_totals[c] += cnt + top_cats = [c for c, _ in cat_totals.most_common(10)] + + series = {} + for cat in top_cats: + series[cat] = [month_cat[m].get(cat, 0) for m in months] + + return {"months": months, "series": series, "categories": top_cats} + +def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: + """Return draft similarity network (cached).""" + return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold)) + 
+def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: + """Return draft similarity network for force-directed graph. + + Returns {nodes: [{name, title, category, score}], + edges: [{source, target, similarity}], + stats: {node_count, edge_count, avg_similarity}} + """ + + + embeddings = db.all_embeddings() + if len(embeddings) < 2: + return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} + + pairs = db.drafts_with_ratings(limit=1000) + rating_map = {d.name: r for d, r in pairs} + draft_map = {d.name: d for d, _ in pairs} + + # Filter to drafts with both embeddings and ratings + names = [n for n in embeddings if n in rating_map] + if len(names) < 2: + return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} + + matrix = np.array([embeddings[n] for n in names]) + + # L2-normalize and compute cosine similarity + norms = np.linalg.norm(matrix, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + normalized = matrix / norms + sim_matrix = normalized @ normalized.T + + # Find pairs above threshold (upper triangle only) + edges = [] + node_set = set() + for i in range(len(names)): + for j in range(i + 1, len(names)): + sim = float(sim_matrix[i, j]) + if sim >= threshold: + edges.append({"source": names[i], "target": names[j], "similarity": round(sim, 4)}) + node_set.add(names[i]) + node_set.add(names[j]) + + # Build nodes from connected drafts only + nodes = [] + for name in names: + if name not in node_set: + continue + r = rating_map[name] + d = draft_map.get(name) + nodes.append({ + "name": name, + "title": d.title if d else name, + "category": r.categories[0] if r.categories else "Other", + "score": round(r.composite_score, 2), + }) + + avg_sim = round(sum(e["similarity"] for e in edges) / max(len(edges), 1), 4) + + return { + "nodes": nodes, + "edges": edges, + "stats": {"node_count": len(nodes), "edge_count": len(edges), "avg_similarity": 
avg_sim}, + } + +def get_idea_clusters(db: Database) -> dict: + """Cluster ideas (cached for 5 min).""" + return _cached("idea_clusters", lambda: _compute_idea_clusters(db)) + +def _compute_idea_clusters(db: Database) -> dict: + """Cluster ideas by embedding similarity, return clusters + t-SNE scatter. + + Uses Ward linkage on L2-normalized embeddings (approximates cosine) with + a target of ~30 clusters for readable groupings. Enriches each cluster + with WG info and category breakdown. + """ + + + embeddings = db.all_idea_embeddings() + if not embeddings: + return {"clusters": [], "scatter": [], "stats": {"total": 0, "clustered": 0, "num_clusters": 0}, "empty": True} + + # Exclude ideas from false-positive drafts + fp_names = db.false_positive_names() + + # Fetch ideas with IDs for metadata lookup + rows = db.conn.execute("SELECT id, title, description, idea_type, draft_name FROM ideas").fetchall() + idea_map = {r["id"]: {"title": r["title"], "description": r["description"], + "type": r["idea_type"], "draft_name": r["draft_name"]} + for r in rows if r["draft_name"] not in fp_names} + + # Remove FP ideas from embeddings too + embeddings = {k: v for k, v in embeddings.items() if k in idea_map} + + # Draft -> WG and category lookup + draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall() + draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows} + draft_title_map = {r["name"]: r["title"] for r in draft_rows} + rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings WHERE COALESCE(false_positive, 0) = 0").fetchall() + draft_cats: dict[str, list[str]] = {} + for r in rating_rows: + try: + draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else [] + except (json.JSONDecodeError, TypeError): + draft_cats[r["draft_name"]] = [] + + # Build matrix from embeddings that have matching ideas + idea_ids = [iid for iid in embeddings if iid in idea_map] + if len(idea_ids) < 5: + return {"clusters": 
[], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} + + matrix = np.array([embeddings[iid] for iid in idea_ids]) + matrix_norm = sk_normalize(matrix) + + # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size + n_target = max(10, min(40, len(idea_ids) // 12)) + try: + clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward') + labels = clustering.fit_predict(matrix_norm) + except Exception: + return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} + + # Build cluster data + cluster_ideas_map: dict[int, list] = defaultdict(list) + for idx, iid in enumerate(idea_ids): + cluster_ideas_map[labels[idx]].append(iid) + + stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", + "on", "by", "is", "as", "at", "from", "that", "this", "it", + "based", "using", "protocol", "mechanism", "framework", "system", + "network", "agent", "agents"} + clusters = [] + for cid in sorted(cluster_ideas_map.keys()): + members = cluster_ideas_map[cid] + ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map] + if len(ideas_in_cluster) < 2: + continue + + # Theme: most common significant words in titles + words = Counter() + for idea in ideas_in_cluster: + for w in idea["title"].lower().split(): + w_clean = w.strip("()[].,;:-\"'") + if len(w_clean) > 2 and w_clean not in stop: + words[w_clean] += 1 + top_words = [w for w, _ in words.most_common(4)] + theme = " ".join(top_words).title() if top_words else f"Cluster {cid}" + + drafts = list({idea["draft_name"] for idea in ideas_in_cluster}) + + # Enrich: WG breakdown + wg_counts: dict[str, int] = Counter() + cat_counts: dict[str, int] = Counter() + for dname in drafts: + wg = draft_wg.get(dname, "none") + wg_counts[wg] += 1 + for cat in draft_cats.get(dname, []): + cat_counts[cat] += 1 + + wg_list = [{"wg": wg, "count": cnt} for wg, cnt in 
wg_counts.most_common(5)] + cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)] + cross_wg = len([w for w in wg_counts if w != "none"]) >= 2 + + clusters.append({ + "id": len(clusters), + "theme": theme, + "size": len(ideas_in_cluster), + "ideas": ideas_in_cluster[:20], + "drafts": drafts, + "wgs": wg_list, + "categories": cat_list, + "cross_wg": cross_wg, + "wg_count": len(wg_counts), + }) + + clusters.sort(key=lambda c: c["size"], reverse=True) + + # Build mapping: original cluster label -> sorted index + # Each cluster remembers which original label it came from via its member ids + old_label_to_new: dict[int, int] = {} + for new_idx, c in enumerate(clusters): + c["id"] = new_idx + # Find original label for any member of this cluster + for old_cid, members in cluster_ideas_map.items(): + if members and members[0] in [iid for iid in members if iid in idea_map]: + member_titles = {idea_map[m]["title"] for m in members if m in idea_map} + c_titles = {idea["title"] for idea in c["ideas"]} + if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]): + old_label_to_new[old_cid] = new_idx + break + + # Fallback: build from idea_id -> label mapping + iid_to_new: dict[int, int] = {} + for old_cid, members in cluster_ideas_map.items(): + new_idx = old_label_to_new.get(old_cid, old_cid) + for iid in members: + iid_to_new[iid] = new_idx + + # t-SNE for scatter + scatter = [] + try: + perp = min(30, len(idea_ids) - 1) + tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500) + coords = tsne.fit_transform(matrix_norm) + + for idx, iid in enumerate(idea_ids): + info = idea_map.get(iid, {}) + scatter.append({ + "x": round(float(coords[idx, 0]), 3), + "y": round(float(coords[idx, 1]), 3), + "cluster_id": iid_to_new.get(iid, int(labels[idx])), + "title": info.get("title", ""), + "draft_name": info.get("draft_name", ""), + "wg": draft_wg.get(info.get("draft_name", ""), ""), + }) + except Exception: + 
pass + + # --- Cross-cluster links --- + # Find pairs of clusters whose ideas are semantically related + # Use centroid similarity + best idea-pair links + links = [] + if len(clusters) >= 2: + # Build cluster centroids from normalized embeddings + cluster_centroids = {} + cluster_member_indices: dict[int, list[int]] = defaultdict(list) + for idx, iid in enumerate(idea_ids): + cid = iid_to_new.get(iid, int(labels[idx])) + cluster_member_indices[cid].append(idx) + + for cid, indices in cluster_member_indices.items(): + if indices: + centroid = matrix_norm[indices].mean(axis=0) + norm = np.linalg.norm(centroid) + if norm > 0: + cluster_centroids[cid] = centroid / norm + + # Compute pairwise centroid similarity for all cluster pairs + cids_sorted = sorted(cluster_centroids.keys()) + for ci_idx, ci in enumerate(cids_sorted): + for cj in cids_sorted[ci_idx + 1:]: + sim = float(np.dot(cluster_centroids[ci], cluster_centroids[cj])) + if sim < 0.45: + continue + + # Find the best idea pair across these two clusters + best_sim = 0.0 + best_pair = (None, None) + # Sample up to 20 ideas per cluster to keep it fast + ci_members = cluster_member_indices[ci][:20] + cj_members = cluster_member_indices[cj][:20] + for mi in ci_members: + for mj in cj_members: + pair_sim = float(np.dot(matrix_norm[mi], matrix_norm[mj])) + if pair_sim > best_sim: + best_sim = pair_sim + best_pair = (idea_ids[mi], idea_ids[mj]) + + if best_sim < 0.5: + continue + + # Get theme names + ci_theme = next((c["theme"] for c in clusters if c["id"] == ci), f"Cluster {ci}") + cj_theme = next((c["theme"] for c in clusters if c["id"] == cj), f"Cluster {cj}") + + idea_a = idea_map.get(best_pair[0], {}) + idea_b = idea_map.get(best_pair[1], {}) + + links.append({ + "source": ci, + "target": cj, + "source_theme": ci_theme, + "target_theme": cj_theme, + "similarity": round(sim, 3), + "best_pair_sim": round(best_sim, 3), + "idea_a": idea_a.get("title", ""), + "idea_a_draft": idea_a.get("draft_name", ""), + "idea_b": 
idea_b.get("title", ""), + "idea_b_draft": idea_b.get("draft_name", ""), + }) + + links.sort(key=lambda l: l["best_pair_sim"], reverse=True) + links = links[:50] # cap at top 50 links + + total = len(idea_ids) + clustered = sum(c["size"] for c in clusters) + return { + "clusters": clusters, + "scatter": scatter, + "links": links, + "stats": {"total": total, "clustered": clustered, "num_clusters": len(clusters)}, + "empty": False, + } + +def get_timeline_animation_data(db: Database) -> dict: + """Timeline animation (cached for 5 min).""" + return _cached("timeline_animation", lambda: _compute_timeline_animation_data(db)) + +def _compute_timeline_animation_data(db: Database) -> dict: + """Compute t-SNE on all drafts, return points with month info + category_monthly. + + t-SNE is computed once on ALL drafts so coordinates are stable across + animation frames. Each point carries a ``month`` field (YYYY-MM) so the + front-end can build cumulative animation frames. + """ + + + embeddings = db.all_embeddings() + if len(embeddings) < 5: + return {"points": [], "months": [], "category_monthly": {}} + + pairs = db.drafts_with_ratings(limit=1000) + rating_map = {d.name: r for d, r in pairs} + draft_map = {d.name: d for d, _ in pairs} + + # Filter to drafts that have both embeddings and ratings + names = [n for n in embeddings if n in rating_map] + if len(names) < 5: + return {"points": [], "months": [], "category_monthly": {}} + + matrix = np.array([embeddings[n] for n in names]) + + try: + tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), + random_state=42, max_iter=500) + coords = tsne.fit_transform(matrix) + except Exception: + return {"points": [], "months": [], "category_monthly": {}} + + # Build points with month + points = [] + month_set: set[str] = set() + category_monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + + for i, name in enumerate(names): + r = rating_map[name] + d = draft_map.get(name) + month = _extract_month(d.time 
if d else None) + cat = r.categories[0] if r.categories else "Other" + month_set.add(month) + category_monthly[month][cat] += 1 + points.append({ + "name": name, + "title": d.title if d else name, + "x": round(float(coords[i, 0]), 3), + "y": round(float(coords[i, 1]), 3), + "category": cat, + "score": round(r.composite_score, 2), + "month": month, + }) + + months = sorted(month_set) + # Convert defaultdict to plain dict for JSON + cat_monthly_plain = {m: dict(cats) for m, cats in category_monthly.items()} + + return { + "points": points, + "months": months, + "category_monthly": cat_monthly_plain, + } + +def get_monitor_status(db: Database) -> MonitorStatus: + """Return monitoring status data for dashboard.""" + runs = db.get_monitor_runs(limit=20) + last = runs[0] if runs else None + total_drafts = db.count_drafts() + rated_count = len(db.drafts_with_ratings(limit=10000)) + unrated = len(db.unrated_drafts(limit=9999)) + unembedded = len(db.drafts_without_embeddings(limit=9999)) + embedded_count = total_drafts - unembedded + no_ideas = len(db.drafts_without_ideas(limit=9999)) + ideas_count = total_drafts - no_ideas + idea_total = db.idea_count() + gap_count = len(db.all_gaps()) + input_tok, output_tok = db.total_tokens_used() + + # Estimate cost (Sonnet pricing: $3/M input, $15/M output) + est_cost = (input_tok * 3.0 / 1_000_000) + (output_tok * 15.0 / 1_000_000) + + return { + "last_run": last, + "runs": runs, + "unprocessed": {"unrated": unrated, "unembedded": unembedded, "no_ideas": no_ideas}, + "total_runs": len(runs), + "pipeline": { + "total_drafts": total_drafts, + "rated": rated_count, + "embedded": embedded_count, + "with_ideas": ideas_count, + "idea_total": idea_total, + "gap_count": gap_count, + }, + "cost": { + "input_tokens": input_tok, + "output_tokens": output_tok, + "estimated_usd": round(est_cost, 2), + }, + } + +def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: + """Return citation graph (cached for 5 min).""" + return 
_cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs)) + +def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: + """Return citation network data for force-directed graph. + + Returns {nodes: [{id, type, title, influence, ...}], + edges: [{source, target}], + stats: {node_count, edge_count, ...}} + """ + # Get all references + rows = db.conn.execute( + "SELECT draft_name, ref_type, ref_id FROM draft_refs" + ).fetchall() + + # Count in-degree for each referenced item + in_degree: dict[str, int] = Counter() + edges_raw = [] + for r in rows: + ref_key = f"{r['ref_type']}:{r['ref_id']}" + in_degree[ref_key] += 1 + edges_raw.append((r["draft_name"], ref_key)) + + # Also count drafts as source nodes + draft_out: dict[str, int] = Counter() + for draft_name, _ in edges_raw: + draft_out[draft_name] += 1 + + # Get draft titles for labeling + draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() + draft_titles = {r["name"]: r["title"] for r in draft_rows} + + # Get rating categories for draft coloring + rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() + draft_cats = {} + for r in rating_rows: + try: + cats = json.loads(r["categories"]) if r["categories"] else [] + draft_cats[r["draft_name"]] = cats[0] if cats else "Other" + except Exception: + draft_cats[r["draft_name"]] = "Other" + + # Filter: keep RFCs with min_refs+ references and all drafts that reference them + top_refs = {k: v for k, v in in_degree.items() if v >= min_refs} + + # Build node set + node_set = set() + filtered_edges = [] + for draft_name, ref_key in edges_raw: + if ref_key in top_refs: + node_set.add(draft_name) + node_set.add(ref_key) + filtered_edges.append({"source": draft_name, "target": ref_key}) + + # Limit to ~200 nodes max for readability + if len(node_set) > 250: + # Keep only refs with higher in-degree + sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True) + 
keep_refs = set(k for k, _ in sorted_refs[:80]) + node_set = set() + filtered_edges = [] + for draft_name, ref_key in edges_raw: + if ref_key in keep_refs: + node_set.add(draft_name) + node_set.add(ref_key) + filtered_edges.append({"source": draft_name, "target": ref_key}) + + # Build nodes + nodes = [] + for nid in node_set: + if ":" in nid and not nid.startswith("draft-"): + # It's a reference node (rfc:1234, bcp:14, etc.) + ref_type, ref_id = nid.split(":", 1) + influence = in_degree.get(nid, 0) + if ref_type == "rfc": + try: + title = f"RFC {int(ref_id)}" + except ValueError: + title = f"RFC {ref_id}" + else: + title = f"{ref_type.upper()} {ref_id}" + nodes.append({ + "id": nid, + "type": ref_type, + "title": title, + "influence": influence, + "ref_id": ref_id, + }) + else: + # It's a draft node + influence = in_degree.get(nid, 0) + draft_out.get(nid, 0) + nodes.append({ + "id": nid, + "type": "draft", + "title": draft_titles.get(nid, nid), + "influence": draft_out.get(nid, 0), + "category": draft_cats.get(nid, "Other"), + }) + + # Stats + rfc_count = sum(1 for n in nodes if n["type"] == "rfc") + draft_count = sum(1 for n in nodes if n["type"] == "draft") + + return { + "nodes": nodes, + "edges": filtered_edges, + "stats": { + "node_count": len(nodes), + "edge_count": len(filtered_edges), + "rfc_count": rfc_count, + "draft_count": draft_count, + }, + } + +def get_landscape_tsne(db: Database) -> list[dict]: + """Compute t-SNE (cached for 5 min).""" + return _cached("landscape_tsne", lambda: _compute_landscape_tsne(db)) + +def _compute_landscape_tsne(db: Database) -> list[dict]: + """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" + + + embeddings = db.all_embeddings() + if len(embeddings) < 5: + return [] + + pairs = db.drafts_with_ratings(limit=1000) + rating_map = {d.name: r for d, r in pairs} + draft_map = {d.name: d for d, _ in pairs} + + # Filter to drafts that have both embeddings and ratings + names = [n for n in 
embeddings if n in rating_map] + if len(names) < 5: + return [] + + matrix = np.array([embeddings[n] for n in names]) + + try: + tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), + random_state=42, max_iter=500) + coords = tsne.fit_transform(matrix) + except Exception: + return [] + + result = [] + for i, name in enumerate(names): + r = rating_map[name] + d = draft_map.get(name) + result.append({ + "name": name, + "title": d.title if d else name, + "x": round(float(coords[i, 0]), 3), + "y": round(float(coords[i, 1]), 3), + "category": r.categories[0] if r.categories else "Other", + "score": round(r.composite_score, 2), + }) + return result + +def get_comparison_data(db: Database, names: list[str]) -> dict | None: + """Get comparison data for a list of drafts. + + Returns { + drafts: [{name, title, abstract, rating, ideas, refs, ...}], + shared_ideas: [{title, drafts: [name,...]}], + unique_ideas: {name: [{title, description}]}, + shared_refs: [{type, id, drafts: [name,...]}], + unique_refs: {name: [{type, id}]}, + similarities: [{a, b, similarity}], + comparison_text: str | None, + } + """ + + + drafts_data = [] + all_ideas: dict[str, list[dict]] = {} + all_refs: dict[str, list[tuple[str, str]]] = {} + + for name in names: + detail = get_draft_detail(db, name) + if not detail: + continue + drafts_data.append(detail) + all_ideas[name] = detail.get("ideas", []) + all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])] + + if len(drafts_data) < 2: + return None + + # Find shared vs unique ideas (by title similarity) + idea_title_drafts: dict[str, list[str]] = {} + for name, ideas in all_ideas.items(): + for idea in ideas: + title_lower = idea["title"].lower().strip() + if title_lower not in idea_title_drafts: + idea_title_drafts[title_lower] = [] + idea_title_drafts[title_lower].append(name) + + shared_ideas = [ + {"title": title, "drafts": draft_list} + for title, draft_list in idea_title_drafts.items() + if len(set(draft_list)) > 1 + ] 
+ unique_ideas: dict[str, list[dict]] = {} + for name, ideas in all_ideas.items(): + unique = [] + for idea in ideas: + title_lower = idea["title"].lower().strip() + if len(set(idea_title_drafts.get(title_lower, []))) <= 1: + unique.append({"title": idea["title"], "description": idea.get("description", "")}) + unique_ideas[name] = unique + + # Find shared vs unique references + ref_drafts: dict[tuple[str, str], list[str]] = {} + for name, refs in all_refs.items(): + for ref in refs: + if ref not in ref_drafts: + ref_drafts[ref] = [] + ref_drafts[ref].append(name) + + shared_refs = [ + {"type": ref[0], "id": ref[1], "drafts": draft_list} + for ref, draft_list in ref_drafts.items() + if len(set(draft_list)) > 1 + ] + unique_refs: dict[str, list[dict]] = {} + for name, refs in all_refs.items(): + unique = [] + for ref in refs: + if len(set(ref_drafts.get(ref, []))) <= 1: + unique.append({"type": ref[0], "id": ref[1]}) + unique_refs[name] = unique + + # Pairwise embedding similarities + embeddings = db.all_embeddings() + similarities = [] + valid_names = [d["name"] for d in drafts_data] + for i in range(len(valid_names)): + for j in range(i + 1, len(valid_names)): + a, b = valid_names[i], valid_names[j] + if a in embeddings and b in embeddings: + vec_a = embeddings[a] + vec_b = embeddings[b] + dot = np.dot(vec_a, vec_b) + norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + sim = float(dot / norm) if norm > 0 else 0.0 + similarities.append({"a": a, "b": b, "similarity": round(sim, 4)}) + + return { + "drafts": drafts_data, + "shared_ideas": shared_ideas, + "unique_ideas": unique_ideas, + "shared_refs": shared_refs, + "unique_refs": unique_refs, + "similarities": similarities, + "comparison_text": None, + } + +def _classify_to_layer(text: str) -> str: + """Classify a piece of text to the best-matching architectural layer.""" + text_lower = text.lower() + words = set(re.findall(r"[a-z][a-z0-9-]+", text_lower)) + scores: dict[str, int] = {} + for layer_id, kws in 
_LAYER_KEYWORDS.items(): + scores[layer_id] = len(words & kws) + # Also check for multi-word keywords as substrings + for kw in kws: + if len(kw) > 4 and kw in text_lower: + scores[layer_id] += 1 + best = max(scores, key=lambda k: scores[k]) + return best if scores[best] > 0 else "communication" # default + +def get_architecture(db: Database) -> dict: + """Build system-of-systems architecture from idea clusters, gaps, and source coverage.""" + return _cached("architecture", lambda: _compute_architecture(db), ttl=600) + +def _compute_architecture(db: Database) -> dict: + """Compute the architecture view. + + Returns: + { + "components": [...], # architectural building blocks + "dependencies": [...], # edges between components + "gaps": [...], # gaps mapped to layers + "layers": [...], # layer definitions + "source_coverage": {...}, # per-layer source coverage + "stats": {...} + } + """ + # --- Gather raw data --- + cluster_data = get_idea_clusters(db) + clusters = cluster_data.get("clusters", []) + links = cluster_data.get("links", []) + all_gaps = db.all_gaps() + + # Source coverage: count drafts per source per layer + draft_rows = db.conn.execute( + "SELECT d.name, d.title, d.abstract, d.source, r.categories " + "FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name " + "WHERE COALESCE(r.false_positive, 0) = 0" + ).fetchall() + + # Build components from idea clusters + components = [] + cluster_to_component: dict[int, int] = {} # cluster_id -> component index + + for cl in clusters: + if cl["size"] < 3: + continue # skip tiny clusters + + # Determine layer from cluster theme + idea titles + text_blob = cl.get("theme", "") + for idea in cl.get("ideas", [])[:10]: + text_blob += " " + idea.get("title", "") + " " + idea.get("description", "") + layer = _classify_to_layer(text_blob) + + # Source coverage for this component's drafts + draft_names = set(cl.get("drafts", [])) + sources: Counter = Counter() + comp_drafts: list[dict] = [] + for dr in draft_rows: + if 
dr["name"] in draft_names: + sources[dr["source"] or "ietf"] += 1 + comp_drafts.append({"name": dr["name"], "title": (dr["title"] or dr["name"])[:80], "source": dr["source"] or "ietf"}) + + # Idea type breakdown + type_counts: Counter = Counter() + for idea in cl.get("ideas", []): + t = idea.get("type", "") + if t: + type_counts[t] += 1 + + # Maturity: rough proxy from idea count and source diversity + maturity = min(5, 1 + len(sources) + (1 if cl["size"] >= 10 else 0) + (1 if cl.get("cross_wg") else 0)) + + comp = { + "id": len(components), + "cluster_id": cl["id"], + "name": cl.get("theme", f"Component {cl['id']}"), + "layer": layer, + "size": cl["size"], + "draft_count": len(draft_names), + "drafts": comp_drafts[:20], + "sources": dict(sources.most_common()), + "type_breakdown": dict(type_counts.most_common(5)), + "maturity": maturity, + "wgs": cl.get("wgs", [])[:3], + "top_ideas": [{"title": i["title"], "type": i.get("type", ""), "draft_name": i.get("draft_name", "")} + for i in cl.get("ideas", [])[:5]], + "categories": cl.get("categories", []), + } + cluster_to_component[cl["id"]] = comp["id"] + components.append(comp) + + # Build dependencies from cross-cluster links + dependencies = [] + for link in links: + src_comp = cluster_to_component.get(link["source"]) + tgt_comp = cluster_to_component.get(link["target"]) + if src_comp is not None and tgt_comp is not None and src_comp != tgt_comp: + dependencies.append({ + "source": src_comp, + "target": tgt_comp, + "similarity": link.get("best_pair_sim", link.get("similarity", 0)), + "idea_a": link.get("idea_a", ""), + "idea_b": link.get("idea_b", ""), + }) + + # Map gaps to layers + gap_items = [] + for gap in all_gaps: + text = gap["topic"] + " " + gap.get("description", "") + " " + gap.get("category", "") + layer = _classify_to_layer(text) + gap_items.append({ + "id": gap["id"], + "topic": gap["topic"], + "description": gap["description"], + "evidence": gap.get("evidence", ""), + "severity": gap.get("severity", 
"medium"), + "category": gap.get("category", ""), + "layer": layer, + }) + + # Source coverage per layer + source_coverage: dict[str, dict[str, int]] = {l["id"]: Counter() for l in _ARCH_LAYERS} + for dr in draft_rows: + text = (dr["title"] or "") + " " + (dr["abstract"] or "")[:200] + layer = _classify_to_layer(text) + source_coverage[layer][dr["source"] or "ietf"] += 1 + # Convert Counters to dicts + source_coverage = {k: dict(v) for k, v in source_coverage.items()} + + # Layer summary stats + layer_info = [] + for l in _ARCH_LAYERS: + lid = l["id"] + comp_count = sum(1 for c in components if c["layer"] == lid) + idea_count = sum(c["size"] for c in components if c["layer"] == lid) + gap_count = sum(1 for g in gap_items if g["layer"] == lid) + layer_info.append({ + "id": l["id"], + "label": l["label"], + "order": l["order"], + "component_count": comp_count, + "idea_count": idea_count, + "gap_count": gap_count, + "coverage": source_coverage.get(lid, {}), + "total_drafts": sum(source_coverage.get(lid, {}).values()), + }) + + return { + "components": components, + "dependencies": dependencies, + "gaps": gap_items, + "layers": layer_info, + "stats": { + "total_components": len(components), + "total_dependencies": len(dependencies), + "total_gaps": len(gap_items), + "layers_with_gaps": len(set(g["layer"] for g in gap_items)), + }, + } + +def get_idea_analysis(db: Database) -> dict: + """Return comprehensive idea analysis data for the idea-analysis page. + + Includes novelty distribution, type breakdown with avg novelty, + top novel ideas, ideas-per-draft distribution, cross-tab of type x source, + shared ideas across drafts, and idea novelty vs draft rating correlation. 
+ """ + from collections import Counter, defaultdict + from difflib import SequenceMatcher + + # Fetch raw data + all_ideas = db.conn.execute( + """SELECT i.id, i.draft_name, i.title, i.description, i.idea_type, + i.novelty_score + FROM ideas i ORDER BY i.novelty_score DESC NULLS LAST""" + ).fetchall() + all_ideas = [dict(r) for r in all_ideas] + + # Draft ratings lookup + ratings_rows = db.conn.execute( + """SELECT d.name, d.title as draft_title, d.source, + r.novelty AS r_novelty, r.maturity, r.overlap, r.momentum, r.relevance + FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name""" + ).fetchall() + draft_info = {} + for r in ratings_rows: + row = dict(r) + # Compute composite score (average of 5 dimensions) + dims = [row.get("r_novelty"), row.get("maturity"), row.get("overlap"), + row.get("momentum"), row.get("relevance")] + valid = [d for d in dims if d is not None] + row["composite_score"] = sum(valid) / len(valid) if valid else None + draft_info[row["name"]] = row + + total = len(all_ideas) + scored = [i for i in all_ideas if i.get("novelty_score") is not None] + unscored = total - len(scored) + avg_novelty = sum(i["novelty_score"] for i in scored) / len(scored) if scored else 0 + + # Embedding coverage + embed_count = db.conn.execute("SELECT COUNT(*) FROM idea_embeddings").fetchone()[0] + + # --- Novelty score distribution (histogram) --- + novelty_dist = Counter(i["novelty_score"] for i in scored) + novelty_histogram = { + "labels": [1, 2, 3, 4, 5], + "values": [novelty_dist.get(s, 0) for s in [1, 2, 3, 4, 5]], + } + + # --- Ideas by type with counts and avg novelty --- + type_data = defaultdict(lambda: {"count": 0, "novelty_sum": 0, "novelty_n": 0}) + for idea in all_ideas: + t = idea.get("idea_type") or "other" + type_data[t]["count"] += 1 + if idea.get("novelty_score") is not None: + type_data[t]["novelty_sum"] += idea["novelty_score"] + type_data[t]["novelty_n"] += 1 + + by_type = [] + for t, d in sorted(type_data.items(), key=lambda x: 
x[1]["count"], reverse=True): + avg = d["novelty_sum"] / d["novelty_n"] if d["novelty_n"] > 0 else 0 + by_type.append({"type": t, "count": d["count"], "avg_novelty": round(avg, 2)}) + + type_names = [t["type"] for t in by_type] + + # --- Top 20 most novel ideas (score 4-5) --- + top_novel = [] + for idea in all_ideas: + if idea.get("novelty_score") and idea["novelty_score"] >= 4: + di = draft_info.get(idea["draft_name"], {}) + top_novel.append({ + "title": idea["title"], + "description": idea["description"], + "type": idea.get("idea_type", "other"), + "novelty_score": idea["novelty_score"], + "draft_name": idea["draft_name"], + "draft_title": di.get("draft_title", ""), + "draft_score": di.get("composite_score"), + }) + top_novel.sort(key=lambda x: (x["novelty_score"], x.get("draft_score") or 0), reverse=True) + top_novel = top_novel[:20] + + # --- Ideas per draft distribution --- + ideas_per_draft = Counter(i["draft_name"] for i in all_ideas) + ipd_dist = Counter(ideas_per_draft.values()) + ideas_per_draft_hist = { + "labels": sorted(ipd_dist.keys()), + "values": [ipd_dist[k] for k in sorted(ipd_dist.keys())], + } + # Also top drafts by idea count + top_idea_drafts = [] + for name, count in ideas_per_draft.most_common(10): + di = draft_info.get(name, {}) + top_idea_drafts.append({ + "name": name, + "draft_title": di.get("draft_title", ""), + "idea_count": count, + "score": di.get("composite_score"), + }) + + # --- Cross-tabulation: idea_type x source --- + type_source = defaultdict(lambda: defaultdict(int)) + for idea in all_ideas: + t = idea.get("idea_type") or "other" + di = draft_info.get(idea["draft_name"], {}) + source = di.get("source", "ietf") or "ietf" + type_source[t][source] += 1 + + sources = sorted(set( + di.get("source", "ietf") or "ietf" for di in draft_info.values() + )) + cross_tab = [] + for t in type_names: + row = {"type": t} + for s in sources: + row[s] = type_source[t].get(s, 0) + cross_tab.append(row) + + # --- Shared ideas across drafts --- + 
idea_groups: list[dict] = [] + for idea in all_ideas: + title_lower = idea["title"].lower().strip() + matched = False + for group in idea_groups: + ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() + if ratio >= 0.75: + group["ideas"].append(idea) + group["drafts"].add(idea["draft_name"]) + matched = True + break + if not matched: + idea_groups.append({ + "canonical": title_lower, + "title": idea["title"], + "ideas": [idea], + "drafts": {idea["draft_name"]}, + }) + + shared_ideas = [] + for g in sorted(idea_groups, key=lambda x: len(x["drafts"]), reverse=True): + if len(g["drafts"]) < 2: + break + shared_ideas.append({ + "title": g["title"], + "appearances": len(g["drafts"]), + "drafts": sorted(g["drafts"])[:8], + "types": list(set(i.get("idea_type", "other") for i in g["ideas"])), + }) + + # --- Scatter: draft avg idea novelty vs draft relevance --- + draft_idea_novelty = defaultdict(list) + for idea in scored: + draft_idea_novelty[idea["draft_name"]].append(idea["novelty_score"]) + + scatter_data = [] + for name, scores in draft_idea_novelty.items(): + di = draft_info.get(name, {}) + if di.get("relevance") is not None and di.get("composite_score") is not None: + scatter_data.append({ + "name": name, + "avg_idea_novelty": round(sum(scores) / len(scores), 2), + "relevance": di["relevance"], + "score": di["composite_score"], + "idea_count": len(scores), + "source": di.get("source", "ietf") or "ietf", + }) + + # --- Sunburst data: type -> novelty band --- + sunburst_labels = [] + sunburst_parents = [] + sunburst_values = [] + # Root + sunburst_labels.append("All Ideas") + sunburst_parents.append("") + sunburst_values.append(total) + + novelty_bands = {"High (4-5)": lambda s: s is not None and s >= 4, + "Medium (3)": lambda s: s is not None and s == 3, + "Low (1-2)": lambda s: s is not None and s <= 2, + "Unscored": lambda s: s is None} + + for t_info in by_type: + t = t_info["type"] + sunburst_labels.append(t) + sunburst_parents.append("All 
Ideas") + sunburst_values.append(t_info["count"]) + # Sub-bands + type_ideas = [i for i in all_ideas if (i.get("idea_type") or "other") == t] + for band, fn in novelty_bands.items(): + cnt = sum(1 for i in type_ideas if fn(i.get("novelty_score"))) + if cnt > 0: + sunburst_labels.append(f"{t} - {band}") + sunburst_parents.append(t) + sunburst_values.append(cnt) + + return { + "total": total, + "scored": len(scored), + "unscored": unscored, + "avg_novelty": round(avg_novelty, 2), + "embed_count": embed_count, + "embed_pct": round(embed_count / total * 100, 1) if total > 0 else 0, + "type_count": len(by_type), + "novelty_histogram": novelty_histogram, + "by_type": by_type, + "top_novel": top_novel, + "ideas_per_draft_hist": ideas_per_draft_hist, + "top_idea_drafts": top_idea_drafts, + "cross_tab": cross_tab, + "sources": sources, + "shared_ideas": shared_ideas, + "scatter_data": scatter_data, + "sunburst": { + "labels": sunburst_labels, + "parents": sunburst_parents, + "values": sunburst_values, + }, + } + +def get_trends_data(db: Database) -> dict: + """Return temporal evolution data for the /trends page. + + Returns dict with: + - monthly_submissions: [{month, source, count}, ...] + - monthly_ratings: [{month, novelty, maturity, overlap, momentum, relevance}, ...] + - monthly_categories: [{month, category, count}, ...] + - safety_ratio: [{month, safety, capability, ratio}, ...] + - cumulative_ideas: [{month, total}, ...] + - monthly_new_authors: [{month, count}, ...] + - stats: {fastest_growing, newest_active} + - monthly_table: [{month, total, sources: {}, avg_score}, ...] + """ + conn = db.conn + + # 1. Monthly submissions by source + rows = conn.execute(""" + SELECT substr(time, 1, 7) AS month, source, COUNT(*) AS cnt + FROM drafts + WHERE time IS NOT NULL AND time != '' + GROUP BY month, source + ORDER BY month + """).fetchall() + monthly_submissions = [{"month": r["month"], "source": r["source"], "count": r["cnt"]} for r in rows] + + # 2. 
Monthly average ratings (all 5 dimensions) + rows = conn.execute(""" + SELECT substr(d.time, 1, 7) AS month, + AVG(r.novelty) AS novelty, AVG(r.maturity) AS maturity, + AVG(r.overlap) AS overlap, AVG(r.momentum) AS momentum, + AVG(r.relevance) AS relevance, + COUNT(*) AS cnt + FROM drafts d + JOIN ratings r ON d.name = r.draft_name + WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 + GROUP BY month + ORDER BY month + """).fetchall() + monthly_ratings = [{ + "month": r["month"], + "novelty": round(r["novelty"], 2), + "maturity": round(r["maturity"], 2), + "overlap": round(r["overlap"], 2), + "momentum": round(r["momentum"], 2), + "relevance": round(r["relevance"], 2), + "count": r["cnt"], + } for r in rows] + + # 3. Monthly category distribution + rows = conn.execute(""" + SELECT substr(d.time, 1, 7) AS month, r.categories + FROM drafts d + JOIN ratings r ON d.name = r.draft_name + WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 + """).fetchall() + cat_monthly: dict[str, Counter] = defaultdict(Counter) + all_cats: Counter = Counter() + for r in rows: + month = r["month"] + try: + cats = json.loads(r["categories"]) if r["categories"] else [] + except (json.JSONDecodeError, TypeError): + cats = [] + for c in cats: + cat_monthly[month][c] += 1 + all_cats[c] += 1 + + # Top 8 categories + top_cats = [c for c, _ in all_cats.most_common(8)] + months_sorted = sorted(cat_monthly.keys()) + monthly_categories = [] + for month in months_sorted: + for cat in top_cats: + monthly_categories.append({ + "month": month, + "category": cat, + "count": cat_monthly[month].get(cat, 0), + }) + + # 4. 
Safety ratio over time + safety_ratio = [] + for month in months_sorted: + safety = sum(cat_monthly[month].get(c, 0) for c in SAFETY_CATEGORIES) + capability = sum(cat_monthly[month].get(c, 0) for c in CAPABILITY_CATEGORIES) + ratio = round(safety / capability, 2) if capability > 0 else 0 + safety_ratio.append({ + "month": month, + "safety": safety, + "capability": capability, + "ratio": ratio, + }) + + # 5. Cumulative idea count over time + rows = conn.execute(""" + SELECT substr(d.time, 1, 7) AS month, COUNT(i.id) AS cnt + FROM ideas i + JOIN drafts d ON i.draft_name = d.name + WHERE d.time IS NOT NULL AND d.time != '' + GROUP BY month + ORDER BY month + """).fetchall() + cumulative = 0 + cumulative_ideas = [] + for r in rows: + cumulative += r["cnt"] + cumulative_ideas.append({"month": r["month"], "total": cumulative}) + + # 6. Monthly new author count (first-time contributors) + rows = conn.execute(""" + SELECT da.person_id, MIN(substr(d.time, 1, 7)) AS first_month + FROM draft_authors da + JOIN drafts d ON da.draft_name = d.name + WHERE d.time IS NOT NULL AND d.time != '' + GROUP BY da.person_id + """).fetchall() + new_author_monthly: Counter = Counter() + for r in rows: + if r["first_month"]: + new_author_monthly[r["first_month"]] += 1 + monthly_new_authors = [ + {"month": m, "count": new_author_monthly.get(m, 0)} + for m in months_sorted + ] + + # 7. 
Stats: fastest growing category, newest active category + fastest_growing = "" + newest_active = "" + if len(months_sorted) >= 4: + mid = len(months_sorted) // 2 + early_months = months_sorted[:mid] + late_months = months_sorted[mid:] + best_growth = -999 + for cat in top_cats: + early = sum(cat_monthly[m].get(cat, 0) for m in early_months) + late = sum(cat_monthly[m].get(cat, 0) for m in late_months) + if early > 0: + growth = (late - early) / early + elif late > 0: + growth = float("inf") + else: + growth = 0 + if growth > best_growth: + best_growth = growth + fastest_growing = cat + + # Newest active: category with latest first appearance + cat_first_month: dict[str, str] = {} + for month in months_sorted: + for cat in all_cats: + if cat not in cat_first_month and cat_monthly[month].get(cat, 0) > 0: + cat_first_month[cat] = month + if cat_first_month: + newest_active = max(cat_first_month, key=lambda c: cat_first_month[c]) + + # 8. Monthly breakdown table + monthly_table = [] + for month in months_sorted: + # Get per-source counts + sources: dict[str, int] = {} + total = 0 + for s in monthly_submissions: + if s["month"] == month: + sources[s["source"]] = s["count"] + total += s["count"] + # Get avg score + avg_row = conn.execute(""" + SELECT AVG((r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0) AS avg_score + FROM drafts d JOIN ratings r ON d.name = r.draft_name + WHERE substr(d.time, 1, 7) = ? 
AND r.false_positive = 0 + """, (month,)).fetchone() + avg_score = round(avg_row["avg_score"], 2) if avg_row and avg_row["avg_score"] else 0 + monthly_table.append({ + "month": month, + "total": total, + "sources": sources, + "avg_score": avg_score, + }) + + return { + "monthly_submissions": monthly_submissions, + "monthly_ratings": monthly_ratings, + "monthly_categories": monthly_categories, + "safety_ratio": safety_ratio, + "cumulative_ideas": cumulative_ideas, + "monthly_new_authors": monthly_new_authors, + "top_categories": top_cats, + "months": months_sorted, + "stats": { + "fastest_growing": fastest_growing, + "newest_active": newest_active, + }, + "monthly_table": monthly_table, + } + +def get_complexity_data(db: Database) -> dict: + """Return draft complexity analysis data for the /complexity page. + + For each rated draft, compute structural complexity metrics and + correlate with rating dimensions. + + Returns dict with: + - drafts: [{name, title, pages, author_count, citation_count, idea_count, + category_count, novelty, maturity, overlap, momentum, relevance, + score, composite_complexity}, ...] + - correlations: {metric: {dimension: r_value}} + - top_complex: top 10 most complex drafts + - top_efficient: top 10 high-rating low-complexity drafts + - stats: {avg_pages, avg_authors, avg_citations, pages_coverage_pct} + - category_complexity: [{category, avg_pages, avg_authors, avg_citations, count}, ...] + - source_complexity: [{source, avg_pages, avg_authors, avg_citations, count}, ...] 
+ """ + conn = db.conn + + # Build per-draft complexity data + rows = conn.execute(""" + SELECT d.name, d.title, d.pages, d.source, + r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, + r.categories, + (r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0 AS score + FROM drafts d + JOIN ratings r ON d.name = r.draft_name + WHERE r.false_positive = 0 + """).fetchall() + + # Author counts + author_counts = db.draft_author_count_map() + + # Citation counts (outgoing refs) + citation_counts = {} + for row in conn.execute(""" + SELECT draft_name, COUNT(*) AS cnt FROM draft_refs GROUP BY draft_name + """).fetchall(): + citation_counts[row["draft_name"]] = row["cnt"] + + # Idea counts + idea_counts = {} + for row in conn.execute(""" + SELECT draft_name, COUNT(*) AS cnt FROM ideas GROUP BY draft_name + """).fetchall(): + idea_counts[row["draft_name"]] = row["cnt"] + + drafts_data = [] + total_with_pages = 0 + total_drafts = 0 + for r in rows: + total_drafts += 1 + pages = r["pages"] + if pages is not None: + total_with_pages += 1 + try: + cats = json.loads(r["categories"]) if r["categories"] else [] + except (json.JSONDecodeError, TypeError): + cats = [] + ac = author_counts.get(r["name"], 0) + cc = citation_counts.get(r["name"], 0) + ic = idea_counts.get(r["name"], 0) + cat_count = len(cats) + # Composite complexity: normalize each metric to 0-1 scale and average + # (raw values stored; composite calculated after we know max values) + drafts_data.append({ + "name": r["name"], + "title": r["title"], + "pages": pages, + "source": r["source"] or "ietf", + "author_count": ac, + "citation_count": cc, + "idea_count": ic, + "category_count": cat_count, + "categories": cats, + "novelty": r["novelty"], + "maturity": r["maturity"], + "overlap": r["overlap"], + "momentum": r["momentum"], + "relevance": r["relevance"], + "score": round(r["score"], 2), + }) + + # Compute composite complexity score (normalized 0-1 each, then averaged) + max_pages = 
max((d["pages"] for d in drafts_data if d["pages"] is not None), default=1) or 1 + max_authors = max((d["author_count"] for d in drafts_data), default=1) or 1 + max_citations = max((d["citation_count"] for d in drafts_data), default=1) or 1 + max_ideas = max((d["idea_count"] for d in drafts_data), default=1) or 1 + + for d in drafts_data: + p = (d["pages"] / max_pages) if d["pages"] is not None else 0.3 # default to median-ish + a = d["author_count"] / max_authors + c = d["citation_count"] / max_citations + i = d["idea_count"] / max_ideas + d["composite_complexity"] = round((p + a + c + i) / 4, 3) + + # Correlation matrix: complexity metrics vs rating dimensions + metrics = ["pages", "author_count", "citation_count", "idea_count", "category_count"] + dimensions = ["novelty", "maturity", "overlap", "momentum", "relevance"] + + def _pearson(xs: list[float], ys: list[float]) -> float: + """Compute Pearson correlation coefficient.""" + n = len(xs) + if n < 3: + return 0.0 + mean_x = sum(xs) / n + mean_y = sum(ys) / n + cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) + std_x = (sum((x - mean_x) ** 2 for x in xs)) ** 0.5 + std_y = (sum((y - mean_y) ** 2 for y in ys)) ** 0.5 + if std_x == 0 or std_y == 0: + return 0.0 + return round(cov / (std_x * std_y), 3) + + correlations: dict[str, dict[str, float]] = {} + for metric in metrics: + correlations[metric] = {} + for dim in dimensions: + if metric == "pages": + # Filter to drafts with pages data + pairs = [(d[metric], d[dim]) for d in drafts_data if d[metric] is not None] + else: + pairs = [(d[metric], d[dim]) for d in drafts_data] + if len(pairs) >= 3: + xs, ys = zip(*pairs) + correlations[metric][dim] = _pearson(list(xs), list(ys)) + else: + correlations[metric][dim] = 0.0 + + # Top 10 most complex + sorted_by_complexity = sorted(drafts_data, key=lambda d: d["composite_complexity"], reverse=True) + top_complex = sorted_by_complexity[:10] + + # Top 10 efficient: high score but low complexity + # Efficiency 
= score / (composite_complexity + 0.1) (avoid div by zero) + for d in drafts_data: + d["efficiency"] = round(d["score"] / (d["composite_complexity"] + 0.1), 2) + sorted_by_efficiency = sorted(drafts_data, key=lambda d: d["efficiency"], reverse=True) + top_efficient = sorted_by_efficiency[:10] + + # Stats + pages_vals = [d["pages"] for d in drafts_data if d["pages"] is not None] + avg_pages = round(sum(pages_vals) / len(pages_vals), 1) if pages_vals else 0 + avg_authors = round(sum(d["author_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 + avg_citations = round(sum(d["citation_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 + pages_coverage = round(total_with_pages / total_drafts * 100, 1) if total_drafts else 0 + + # Category complexity averages + cat_data: dict[str, list[dict]] = defaultdict(list) + for d in drafts_data: + for cat in d.get("categories", []): + cat_data[cat].append(d) + + category_complexity = [] + for cat, ds in sorted(cat_data.items(), key=lambda x: -len(x[1])): + p_vals = [d["pages"] for d in ds if d["pages"] is not None] + category_complexity.append({ + "category": cat, + "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, + "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), + "avg_citations": round(sum(d["citation_count"] for d in ds) / len(ds), 1), + "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), + "count": len(ds), + }) + + # Source complexity + source_data: dict[str, list[dict]] = defaultdict(list) + for d in drafts_data: + source_data[d["source"]].append(d) + + source_complexity = [] + for src, ds in sorted(source_data.items(), key=lambda x: -len(x[1])): + p_vals = [d["pages"] for d in ds if d["pages"] is not None] + source_complexity.append({ + "source": src, + "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, + "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), + "avg_citations": 
round(sum(d["citation_count"] for d in ds) / len(ds), 1), + "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), + "count": len(ds), + }) + + return { + "drafts": drafts_data, + "correlations": correlations, + "metrics": metrics, + "dimensions": dimensions, + "top_complex": top_complex, + "top_efficient": top_efficient, + "stats": { + "avg_pages": avg_pages, + "avg_authors": avg_authors, + "avg_citations": avg_citations, + "pages_coverage_pct": pages_coverage, + "total_drafts": total_drafts, + }, + "category_complexity": category_complexity, + "source_complexity": source_complexity, + } + +def get_source_comparison(db: Database) -> dict: + """Cross-source comparison: ratings, categories, counts by standards body.""" + pairs_all = db.drafts_with_ratings(limit=2000) + # Also include false positives for completeness of source counts + pairs_fp = db.drafts_with_ratings(limit=2000, include_false_positives=True) + + # Build per-source data + source_stats: dict[str, dict] = {} + source_categories: dict[str, Counter] = defaultdict(Counter) + source_ratings: dict[str, dict[str, list]] = defaultdict(lambda: { + "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [], "scores": [], + }) + # Collect author counts per source + all_authors_by_source: dict[str, set] = defaultdict(set) + + for draft, rating in pairs_all: + src = getattr(draft, "source", "ietf") or "ietf" + source_ratings[src]["novelty"].append(rating.novelty) + source_ratings[src]["maturity"].append(rating.maturity) + source_ratings[src]["overlap"].append(rating.overlap) + source_ratings[src]["momentum"].append(rating.momentum) + source_ratings[src]["relevance"].append(rating.relevance) + source_ratings[src]["scores"].append(round(rating.composite_score, 2)) + for cat in rating.categories: + source_categories[src][cat] += 1 + + # Get all drafts (including unrated) for draft counts + all_drafts = db.list_drafts(limit=5000) + source_draft_counts: Counter = Counter() + for d in 
all_drafts: + src = getattr(d, "source", "ietf") or "ietf" + source_draft_counts[src] += 1 + + # Author counts by source + try: + rows = db.conn.execute( + """SELECT d.source, COUNT(DISTINCT da.person_id) as author_count + FROM drafts d + JOIN draft_authors da ON d.name = da.draft_name + GROUP BY d.source""" + ).fetchall() + for r in rows: + src = r["source"] or "ietf" + all_authors_by_source[src] = r["author_count"] + except Exception: + pass + + # Idea counts by source + source_idea_counts: Counter = Counter() + try: + rows = db.conn.execute( + """SELECT d.source, COUNT(*) as idea_count + FROM ideas i + JOIN drafts d ON i.draft_name = d.name + GROUP BY d.source""" + ).fetchall() + for r in rows: + src = r["source"] or "ietf" + source_idea_counts[src] = r["idea_count"] + except Exception: + pass + + # Build summary table + all_sources = sorted(set(source_draft_counts.keys()) | set(source_ratings.keys())) + summary = [] + for src in all_sources: + rats = source_ratings.get(src, {"scores": []}) + cats = source_categories.get(src, Counter()) + top_cat = cats.most_common(1)[0][0] if cats else "N/A" + avg_score = round(sum(rats["scores"]) / len(rats["scores"]), 2) if rats["scores"] else 0.0 + summary.append({ + "source": src, + "drafts": source_draft_counts.get(src, 0), + "rated": len(rats["scores"]), + "authors": all_authors_by_source.get(src, 0), + "ideas": source_idea_counts.get(src, 0), + "avg_score": avg_score, + "top_category": top_cat, + }) + + # Radar data: average of each dimension per source + radar = {} + for src, rats in source_ratings.items(): + if not rats["scores"]: + continue + n = len(rats["scores"]) + radar[src] = { + "novelty": round(sum(rats["novelty"]) / n, 2), + "maturity": round(sum(rats["maturity"]) / n, 2), + "overlap": round(sum(rats["overlap"]) / n, 2), + "momentum": round(sum(rats["momentum"]) / n, 2), + "relevance": round(sum(rats["relevance"]) / n, 2), + "count": n, + } + + # Category distribution by source (for stacked bar / heatmap) + 
all_cats = sorted({cat for cats in source_categories.values() for cat in cats}) + heatmap = { + "sources": list(source_categories.keys()), + "categories": all_cats, + "values": [], + } + for src in heatmap["sources"]: + row = [source_categories[src].get(cat, 0) for cat in all_cats] + heatmap["values"].append(row) + + # Unique/shared categories analysis + source_cat_sets = {src: set(cats.keys()) for src, cats in source_categories.items()} + unique_cats = {} + for src, cats in source_cat_sets.items(): + others = set() + for s2, c2 in source_cat_sets.items(): + if s2 != src: + others |= c2 + unique_cats[src] = sorted(cats - others) + + shared_cats = set() + for src, cats in source_cat_sets.items(): + for s2, c2 in source_cat_sets.items(): + if s2 != src: + shared_cats |= (cats & c2) + shared_cats = sorted(shared_cats) + + return { + "summary": summary, + "radar": radar, + "heatmap": heatmap, + "unique_categories": unique_cats, + "shared_categories": shared_cats, + } + +def get_citation_influence(db: Database) -> dict: + """Return citation influence analysis data (cached for 5 min).""" + return _cached("citation_influence", lambda: _compute_citation_influence(db)) + +def _compute_citation_influence(db: Database) -> dict: + """Compute citation influence metrics from the draft_refs table. 
+ + Returns dict with: + - top_cited_rfcs: top 20 most-cited RFCs with citation counts and citing drafts + - top_citing_drafts: top 20 drafts that cite the most references + - citations_by_category: average citations per category + - stats: total citations, unique RFCs, avg refs per draft + - draft_network: draft-to-draft citation edges for visualization + """ + # Get all references + rows = db.conn.execute( + "SELECT draft_name, ref_type, ref_id FROM draft_refs" + ).fetchall() + + # Get draft titles and categories + draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() + draft_titles = {r["name"]: r["title"] for r in draft_rows} + + rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() + draft_cats: dict[str, str] = {} + for r in rating_rows: + try: + cats = json.loads(r["categories"]) if r["categories"] else [] + draft_cats[r["draft_name"]] = cats[0] if cats else "Other" + except Exception: + draft_cats[r["draft_name"]] = "Other" + + # Well-known RFC names + rfc_names = { + "2119": "Key words (MUST/SHALL/MAY)", "8174": "Key words update", + "8259": "JSON", "7519": "JWT", "6749": "OAuth 2.0", + "7540": "HTTP/2", "9110": "HTTP Semantics", "7525": "TLS Recommendations", + "8446": "TLS 1.3", "3986": "URIs", "7230": "HTTP/1.1 Syntax", + "7231": "HTTP/1.1 Semantics", "8288": "Web Linking", "6125": "TLS Server Identity", + "7515": "JWS", "7516": "JWE", "7517": "JWK", "7518": "JWA", + "9449": "DPoP", "6750": "OAuth Bearer", "8725": "JWT Best Practices", + "9396": "Rich Authorization Requests", "9101": "JAR", + "8414": "OAuth Server Metadata", "7591": "Dynamic Client Registration", + "8705": "mTLS for OAuth", "9068": "JWT Access Tokens", + "6819": "OAuth Threat Model", "9200": "ACE-OAuth", "9052": "COSE", + "8392": "CWT", "7252": "CoAP", + } + + # In-degree: how many times each RFC is cited + rfc_citations: dict[str, list[str]] = defaultdict(list) + draft_out_count: dict[str, int] = Counter() + draft_to_draft_edges = [] 
+ total_citations = 0 + + for r in rows: + draft_name = r["draft_name"] + ref_type = r["ref_type"] + ref_id = r["ref_id"] + total_citations += 1 + draft_out_count[draft_name] += 1 + + if ref_type == "rfc": + rfc_citations[ref_id].append(draft_name) + elif ref_type == "draft": + draft_to_draft_edges.append({ + "source": draft_name, + "target": ref_id, + "source_title": draft_titles.get(draft_name, draft_name), + "target_title": draft_titles.get(ref_id, ref_id), + }) + + # Top 20 most-cited RFCs + rfc_sorted = sorted(rfc_citations.items(), key=lambda x: len(x[1]), reverse=True) + top_cited_rfcs = [] + for ref_id, citing_drafts in rfc_sorted[:20]: + top_cited_rfcs.append({ + "rfc_id": ref_id, + "name": rfc_names.get(ref_id, ""), + "count": len(citing_drafts), + "drafts": citing_drafts[:10], # Limit to first 10 for display + "total_drafts": len(citing_drafts), + }) + + # Top 20 most-citing drafts (out-degree) + draft_sorted = sorted(draft_out_count.items(), key=lambda x: x[1], reverse=True) + top_citing_drafts = [] + for draft_name, count in draft_sorted[:20]: + top_citing_drafts.append({ + "name": draft_name, + "title": draft_titles.get(draft_name, draft_name), + "count": count, + "category": draft_cats.get(draft_name, "Other"), + }) + + # Citation density by category + cat_totals: dict[str, int] = Counter() + cat_counts: dict[str, int] = Counter() + for draft_name, count in draft_out_count.items(): + cat = draft_cats.get(draft_name, "Other") + cat_totals[cat] += count + cat_counts[cat] += 1 + + citations_by_category = [] + for cat in sorted(cat_totals.keys()): + avg = cat_totals[cat] / cat_counts[cat] if cat_counts[cat] > 0 else 0 + citations_by_category.append({ + "category": cat, + "total_citations": cat_totals[cat], + "draft_count": cat_counts[cat], + "avg_citations": round(avg, 1), + }) + citations_by_category.sort(key=lambda x: x["avg_citations"], reverse=True) + + # PageRank-style influence: drafts that cite highly-cited RFCs + # Simple approximation: sum of (1 
/ citation_count) for each RFC cited + rfc_influence = {rid: len(drafts) for rid, drafts in rfc_citations.items()} + draft_pagerank: dict[str, float] = Counter() + for r in rows: + if r["ref_type"] == "rfc" and r["ref_id"] in rfc_influence: + # Higher score for citing highly-cited RFCs + draft_pagerank[r["draft_name"]] += rfc_influence[r["ref_id"]] + + pagerank_sorted = sorted(draft_pagerank.items(), key=lambda x: x[1], reverse=True) + top_pagerank = [] + for draft_name, score in pagerank_sorted[:20]: + top_pagerank.append({ + "name": draft_name, + "title": draft_titles.get(draft_name, draft_name), + "score": round(score, 1), + "category": draft_cats.get(draft_name, "Other"), + "out_degree": draft_out_count.get(draft_name, 0), + }) + + # Stats + unique_rfcs = len(rfc_citations) + drafts_with_refs = len(draft_out_count) + avg_refs = total_citations / drafts_with_refs if drafts_with_refs > 0 else 0 + + return { + "top_cited_rfcs": top_cited_rfcs, + "top_citing_drafts": top_citing_drafts, + "top_pagerank": top_pagerank, + "citations_by_category": citations_by_category, + "draft_network": draft_to_draft_edges[:200], # Limit for perf + "stats": { + "total_citations": total_citations, + "unique_rfcs": unique_rfcs, + "drafts_with_refs": drafts_with_refs, + "avg_refs_per_draft": round(avg_refs, 1), + }, + } + +def get_bcp_analysis(db: Database) -> dict: + """Return BCP dependency analysis data (cached for 5 min).""" + return _cached("bcp_analysis", lambda: _compute_bcp_analysis(db)) + +def _compute_bcp_analysis(db: Database) -> dict: + """Compute BCP dependency analysis. 
+ + Returns dict with: + - bcps: all BCPs with citation counts and citing drafts + - co_citation: which BCPs tend to be co-cited + - by_category: BCP citation patterns by category + - coverage: what % of drafts cite at least one BCP + """ + # Get all BCP references + bcp_rows = db.conn.execute( + "SELECT draft_name, ref_id FROM draft_refs WHERE ref_type = 'bcp'" + ).fetchall() + + # Get draft titles and categories + draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() + draft_titles = {r["name"]: r["title"] for r in draft_rows} + total_drafts = len(draft_titles) + + rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() + draft_cats: dict[str, str] = {} + for r in rating_rows: + try: + cats = json.loads(r["categories"]) if r["categories"] else [] + draft_cats[r["draft_name"]] = cats[0] if cats else "Other" + except Exception: + draft_cats[r["draft_name"]] = "Other" + + # BCP citation counts + bcp_citations: dict[str, list[str]] = defaultdict(list) + draft_bcps: dict[str, list[str]] = defaultdict(list) + + for r in bcp_rows: + bcp_citations[r["ref_id"]].append(r["draft_name"]) + draft_bcps[r["draft_name"]].append(r["ref_id"]) + + # All BCPs with counts + bcps = [] + for bcp_id, citing_drafts in sorted(bcp_citations.items(), + key=lambda x: len(x[1]), reverse=True): + bcps.append({ + "bcp_id": bcp_id, + "count": len(citing_drafts), + "drafts": citing_drafts[:10], + "total_drafts": len(citing_drafts), + }) + + # Co-citation matrix: which BCPs appear together in the same draft + bcp_ids = sorted(bcp_citations.keys()) + co_citation = [] + for i, bcp_a in enumerate(bcp_ids): + drafts_a = set(bcp_citations[bcp_a]) + for j, bcp_b in enumerate(bcp_ids): + if j <= i: + continue + drafts_b = set(bcp_citations[bcp_b]) + shared = len(drafts_a & drafts_b) + if shared > 0: + co_citation.append({ + "bcp_a": bcp_a, + "bcp_b": bcp_b, + "count": shared, + }) + + # Heatmap data: full matrix for all BCPs (top 20 by citation count) + 
top_bcp_ids = [b["bcp_id"] for b in bcps[:20]] + heatmap_matrix = [] + for bcp_a in top_bcp_ids: + row = [] + drafts_a = set(bcp_citations.get(bcp_a, [])) + for bcp_b in top_bcp_ids: + drafts_b = set(bcp_citations.get(bcp_b, [])) + shared = len(drafts_a & drafts_b) + row.append(shared) + heatmap_matrix.append(row) + + # BCP citations by category + cat_bcp_count: dict[str, Counter] = defaultdict(Counter) + for draft_name, bcp_list in draft_bcps.items(): + cat = draft_cats.get(draft_name, "Other") + for bcp_id in bcp_list: + cat_bcp_count[cat][bcp_id] += 1 + + by_category = [] + for cat in sorted(cat_bcp_count.keys()): + top_bcps = cat_bcp_count[cat].most_common(5) + by_category.append({ + "category": cat, + "total_bcp_refs": sum(cat_bcp_count[cat].values()), + "unique_bcps": len(cat_bcp_count[cat]), + "top_bcps": [{"bcp_id": bid, "count": c} for bid, c in top_bcps], + }) + by_category.sort(key=lambda x: x["total_bcp_refs"], reverse=True) + + # Coverage + drafts_with_bcp = len(draft_bcps) + coverage_pct = (drafts_with_bcp / total_drafts * 100) if total_drafts > 0 else 0 + + return { + "bcps": bcps, + "co_citation": co_citation, + "heatmap_labels": top_bcp_ids, + "heatmap_matrix": heatmap_matrix, + "by_category": by_category, + "coverage": { + "total_drafts": total_drafts, + "drafts_with_bcp": drafts_with_bcp, + "coverage_pct": round(coverage_pct, 1), + "unique_bcps": len(bcp_citations), + "total_bcp_refs": len(bcp_rows), + }, + } diff --git a/src/webui/data/authors.py b/src/webui/data/authors.py new file mode 100644 index 0000000..b6268b9 --- /dev/null +++ b/src/webui/data/authors.py @@ -0,0 +1,276 @@ +"""Author-related data access functions.""" +from __future__ import annotations + +import re +from collections import Counter, defaultdict +from typing import TypedDict + +from ietf_analyzer.db import Database +from webui.data._shared import _cached + + +class AuthorInfo(TypedDict): + """Author entry from :func:`get_top_authors`.""" + name: str + affiliation: str + 
draft_count: int + drafts: list[str] + +class AuthorNetworkNode(TypedDict): + """Node in the author network graph.""" + id: str + name: str + org: str + draft_count: int + avg_score: float + drafts: list[str] + +class AuthorNetworkEdge(TypedDict): + """Edge in the author network graph.""" + source: str + target: str + weight: int + +class AuthorCluster(TypedDict): + """Cluster in the author network.""" + id: int + members: list[str] + org_mix: dict[str, int] + size: int + drafts: list[dict[str, str]] + draft_count: int + +class AuthorNetwork(TypedDict): + """Full author network from :func:`get_author_network_full`.""" + nodes: list[AuthorNetworkNode] + edges: list[AuthorNetworkEdge] + clusters: list[AuthorCluster] + +def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]: + """Return top authors by draft count.""" + rows = db.top_authors(limit=limit) + return [ + {"name": name, "affiliation": aff, "draft_count": cnt, "drafts": drafts} + for name, aff, cnt, drafts in rows + ] + +def get_org_data(db: Database, limit: int = 20) -> list[dict]: + """Return organization contribution data.""" + rows = db.top_orgs(limit=limit) + return [ + {"org": org, "author_count": authors, "draft_count": drafts} + for org, authors, drafts in rows + ] + +def get_coauthor_network(db: Database, min_shared: int = 1) -> dict: + """Return co-authorship network data for force-directed graph. 
+ + Returns {nodes: [{id, name, org, draft_count}], edges: [{source, target, weight}]} + """ + pairs = db.coauthor_pairs() + top = db.top_authors(limit=100) + + # Build node set from authors who have co-authorships + author_info = {name: {"org": aff, "draft_count": cnt} for name, aff, cnt, _ in top} + node_set = set() + edges = [] + for a, b, shared in pairs: + if shared >= min_shared: + node_set.add(a) + node_set.add(b) + edges.append({"source": a, "target": b, "weight": shared}) + + nodes = [] + for name in node_set: + info = author_info.get(name, {"org": "", "draft_count": 1}) + nodes.append({ + "id": name, + "name": name, + "org": info["org"], + "draft_count": info["draft_count"], + }) + + return {"nodes": nodes, "edges": edges} + +def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]: + """Return cross-org collaboration pairs.""" + rows = db.cross_org_collaborations(limit=limit) + return [ + {"org_a": a, "org_b": b, "shared_drafts": cnt} + for a, b, cnt in rows + ] + +def get_author_network_full(db: Database) -> AuthorNetwork: + """Return author network (cached for 5 min).""" + return _cached("author_network", lambda: _compute_author_network_full(db)) + +def _compute_author_network_full(db: Database) -> AuthorNetwork: + """Return enriched co-authorship network with avg scores and cluster info. 
+ + Returns { + nodes: [{id, name, org, draft_count, avg_score, drafts: [name,...]}], + edges: [{source, target, weight}], + clusters: [{id, members: [name,...], org_mix: {org: count}, size}], + } + """ + pairs = db.coauthor_pairs() + top = db.top_authors(limit=500) + + # Build rating lookup for avg scores + rated = db.drafts_with_ratings(limit=2000) + draft_score = {d.name: r.composite_score for d, r in rated} + + # Author info map + author_info = {} + for name, aff, cnt, drafts in top: + scores = [draft_score[dn] for dn in drafts if dn in draft_score] + avg = round(sum(scores) / len(scores), 2) if scores else 0 + author_info[name] = { + "org": aff, "draft_count": cnt, "drafts": drafts, "avg_score": avg + } + + # Build node set: authors with meaningful collaboration (2+ shared drafts) + node_set = set() + edges = [] + for a, b, shared in pairs: + if shared >= 2: + node_set.add(a) + node_set.add(b) + edges.append({"source": a, "target": b, "weight": shared}) + + # Also include authors with 3+ drafts even if no co-authorships + for name, info in author_info.items(): + if info["draft_count"] >= 3: + node_set.add(name) + + nodes = [] + for name in node_set: + info = author_info.get(name, {"org": "", "draft_count": 1, "drafts": [], "avg_score": 0}) + nodes.append({ + "id": name, + "name": name, + "org": info["org"], + "draft_count": info["draft_count"], + "avg_score": info["avg_score"], + "drafts": info["drafts"][:8], # cap for JSON size + }) + + # Cluster detection via connected components (BFS) + adjacency: dict[str, set[str]] = defaultdict(set) + for e in edges: + adjacency[e["source"]].add(e["target"]) + adjacency[e["target"]].add(e["source"]) + + visited: set[str] = set() + clusters = [] + + # Batch-load all drafts referenced by authors (avoid N+1 in cluster loop) + _all_dn = set() + for _ai in author_info.values(): + _all_dn.update(_ai.get("drafts", [])) + _all_drafts_map = db.get_drafts_by_names(list(_all_dn)) + + for node in sorted(node_set): + if node in 
visited: + continue + component: list[str] = [] + queue = [node] + while queue: + current = queue.pop(0) + if current in visited: + continue + visited.add(current) + component.append(current) + for neighbor in adjacency.get(current, []): + if neighbor not in visited: + queue.append(neighbor) + + if len(component) >= 2: + org_mix: dict[str, int] = Counter() + member_orgs: dict[str, str] = {} + cluster_drafts: dict[str, str] = {} # name -> title + for m in component: + org = author_info.get(m, {}).get("org", "") + if org: + org_mix[org] += 1 + member_orgs[m] = org + for dn in author_info.get(m, {}).get("drafts", []): + if dn not in cluster_drafts: + d = _all_drafts_map.get(dn) + cluster_drafts[dn] = d.title[:80] if d else dn + clusters.append({ + "id": len(clusters), + "members": component, + "member_orgs": member_orgs, + "org_mix": dict(org_mix.most_common()), + "size": len(component), + "drafts": [{"name": n, "title": t} for n, t in list(cluster_drafts.items())], + "draft_count": len(cluster_drafts), + }) + + clusters.sort(key=lambda c: c["size"], reverse=True) + + # Generate meaningful names for clusters + for cl in clusters: + cl["name"] = _author_cluster_name(cl) + + return {"nodes": nodes, "edges": edges, "clusters": clusters} + +def _normalize_org(name: str) -> str: + """Shorten verbose org names for display.""" + # Remove common suffixes + for suffix in (", Inc.", " Inc.", ", Ltd.", " Ltd.", " Co.", " Technologies", + " Corporation", " Corp.", " Limited", " GmbH", " AG", + " Europe Ltd", " Research", " Systems"): + name = name.replace(suffix, "") + return name.strip().rstrip(",").rstrip("&").rstrip() + +def _author_cluster_name(cluster: dict) -> str: + """Derive a meaningful name for an author cluster from orgs and draft titles.""" + # Org part: top 1-2 orgs, normalized + raw_orgs = list(cluster.get("org_mix", {}).keys()) + orgs = [] + seen_short: set[str] = set() + for o in raw_orgs: + short = _normalize_org(o) + if short.lower() not in seen_short: + 
seen_short.add(short.lower()) + orgs.append(short) + if len(orgs) >= 2: + org_label = f"{orgs[0]} + {orgs[1]}" + elif orgs: + org_label = orgs[0] + else: + # Fall back to first member's last name + members = cluster.get("members", []) + org_label = members[0].split()[-1] if members else "Unknown" + + # Topic part: extract common keywords from draft titles + stopwords = { + "a", "an", "the", "of", "for", "in", "to", "and", "on", "with", + "using", "based", "draft", "internet", "ietf", "protocol", "framework", + "requirements", "architecture", "considerations", "use", "cases", "via", + "towards", "over", "from", "into", "between", "specification", "extension", + "extensions", "mechanisms", "mechanism", "version", "new", "general", + } + word_counts: Counter = Counter() + for d in cluster.get("drafts", []): + title = d.get("title", "") + words = re.findall(r"[A-Za-z]{3,}", title) + for w in words: + wl = w.lower() + if wl not in stopwords: + word_counts[wl] += 1 + + # Pick top keyword(s) that appear in multiple drafts + top_words = [w for w, c in word_counts.most_common(3) if c >= 2] + if not top_words: + top_words = [w for w, _ in word_counts.most_common(1)] + + if top_words: + topic = " ".join(w.capitalize() for w in top_words[:2]) + name = f"{org_label} — {topic}" + else: + name = org_label + # Truncate if too long for display + return name if len(name) <= 50 else name[:47] + "…" diff --git a/src/webui/data/drafts.py b/src/webui/data/drafts.py new file mode 100644 index 0000000..a5d9e45 --- /dev/null +++ b/src/webui/data/drafts.py @@ -0,0 +1,381 @@ +"""Draft-related data access functions.""" +from __future__ import annotations + +import json +import re +from collections import Counter, defaultdict +from pathlib import Path +from typing import TypedDict + +from ietf_analyzer.db import Database +from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch +from webui.data._shared import _project_root + + +class OverviewStats(TypedDict): + 
"""High-level dashboard statistics from :func:`get_overview_stats`.""" + total_drafts: int + rated_count: int + author_count: int + idea_count: int + gap_count: int + input_tokens: int + output_tokens: int + false_positive_count: int + +class DraftListItem(TypedDict): + """Single draft in the paginated listing from :func:`get_drafts_page`.""" + name: str + title: str + date: str | None + url: str + pages: int + group: str + source: str + score: float + novelty: float + maturity: float + overlap: float + momentum: float + relevance: float + categories: list[str] + summary: str + readiness: float + +class DraftsPage(TypedDict): + """Paginated draft listing from :func:`get_drafts_page`.""" + drafts: list[DraftListItem] + total: int + page: int + per_page: int + pages: int + +def get_overview_stats(db: Database) -> OverviewStats: + """Return high-level stats for the dashboard home page. + + Excludes drafts flagged as false positives from rated counts. + """ + total_drafts = db.count_drafts(include_false_positives=False) + rated_pairs = db.drafts_with_ratings(limit=1000) # already excludes FPs + rated_count = len(rated_pairs) + author_count = db.author_count() + idea_count = db.idea_count() + gaps = db.all_gaps() + input_tok, output_tok = db.total_tokens_used() + + # Count false positives separately for transparency + total_all = db.count_drafts(include_false_positives=True) + false_positive_count = total_all - total_drafts + + return { + "total_drafts": total_drafts, + "rated_count": rated_count, + "author_count": author_count, + "idea_count": idea_count, + "gap_count": len(gaps), + "input_tokens": input_tok, + "output_tokens": output_tok, + "false_positive_count": false_positive_count, + } + +def get_category_counts(db: Database) -> dict[str, int]: + """Return {category: draft_count} for all categories.""" + return db.category_counts() + +def get_category_summary(db: Database, category: str) -> dict | None: + """Build a data-driven summary for a category. 
Returns None if category not found.""" + pairs = db.drafts_with_ratings(limit=2000) + all_authors = db.top_authors(limit=500) + + # Filter to drafts in this category + cat_pairs = [(d, r) for d, r in pairs if category in r.categories] + if not cat_pairs: + return None + + # Author lookup: draft_name -> [author names] + author_drafts_map: dict[str, list[str]] = defaultdict(list) + for name, aff, cnt, drafts in all_authors: + for dn in drafts: + author_drafts_map[dn].append(name) + + # Dimension averages + n = len(cat_pairs) + avg = lambda vals: round(sum(vals) / len(vals), 1) if vals else 0 + novelty_vals = [r.novelty for _, r in cat_pairs] + maturity_vals = [r.maturity for _, r in cat_pairs] + overlap_vals = [r.overlap for _, r in cat_pairs] + momentum_vals = [r.momentum for _, r in cat_pairs] + relevance_vals = [r.relevance for _, r in cat_pairs] + scores = [r.composite_score for _, r in cat_pairs] + + # Top drafts + sorted_pairs = sorted(cat_pairs, key=lambda p: p[1].composite_score, reverse=True) + top_3 = [(d.name, d.title, round(r.composite_score, 1)) for d, r in sorted_pairs[:3]] + + # Top authors in this category + author_counter: Counter = Counter() + org_counter: Counter = Counter() + author_aff: dict[str, str] = {} + for name, aff, cnt, drafts in all_authors: + author_aff[name] = aff or "" + for d, r in cat_pairs: + for a in author_drafts_map.get(d.name, []): + author_counter[a] += 1 + if author_aff.get(a): + org_counter[author_aff[a]] += 1 + top_authors = author_counter.most_common(5) + top_orgs = org_counter.most_common(5) + + # Strongest and weakest dimensions + dim_avgs = { + "Novelty": avg(novelty_vals), + "Maturity": avg(maturity_vals), + "Overlap": avg(overlap_vals), + "Momentum": avg(momentum_vals), + "Relevance": avg(relevance_vals), + } + strongest = max(dim_avgs, key=dim_avgs.get) + weakest = min(dim_avgs, key=dim_avgs.get) + + # Activity trend: how many are recent (last 6 months)? 
+ recent = sum(1 for d, _ in cat_pairs if d.time and d.time >= "2025-09") + total_all = len(pairs) + + # Build text summary + lines = [] + lines.append(f"**{n} drafts** ({n * 100 // total_all}% of all rated drafts) " + f"with an average composite score of **{avg(scores):.1f}/5.0**.") + + # Dimension profile + lines.append(f"Strongest dimension: **{strongest}** ({dim_avgs[strongest]}), " + f"weakest: **{weakest}** ({dim_avgs[weakest]}).") + + # Maturity vs novelty insight + if dim_avgs["Maturity"] < 2.5 and dim_avgs["Novelty"] >= 3.0: + lines.append("This category has **high novelty but low maturity** — many early-stage proposals with fresh ideas that haven't been fully developed yet.") + elif dim_avgs["Maturity"] >= 3.0 and dim_avgs["Novelty"] < 2.5: + lines.append("This category is **mature but less novel** — established approaches being refined rather than introducing fundamentally new concepts.") + elif dim_avgs["Maturity"] >= 3.0 and dim_avgs["Novelty"] >= 3.0: + lines.append("This category shows **both high novelty and maturity** — well-developed proposals with genuinely new contributions.") + + # Overlap insight + if dim_avgs["Overlap"] >= 3.5: + lines.append(f"High overlap ({dim_avgs['Overlap']}) suggests **significant duplication** — multiple drafts cover similar ground, which may indicate convergence or fragmentation.") + elif dim_avgs["Overlap"] <= 2.0: + lines.append(f"Low overlap ({dim_avgs['Overlap']}) indicates **diverse approaches** — drafts in this category tackle distinct problems with little redundancy.") + + # Activity + if recent > 0: + lines.append(f"**{recent} draft{'s' if recent != 1 else ''}** submitted in the last 6 months, " + f"suggesting {'active' if recent >= 3 else 'moderate'} development.") + + return { + "text": " ".join(lines), + "count": n, + "avg_score": avg(scores), + "dimensions": dim_avgs, + "top_drafts": top_3, + "top_authors": top_authors, + "top_orgs": top_orgs, + "strongest": strongest, + "weakest": weakest, + } + +def 
get_drafts_page( + db: Database, + page: int = 1, + per_page: int = 50, + search: str = "", + category: str = "", + min_score: float = 0.0, + sort: str = "score", + sort_dir: str = "desc", + source: str = "", +) -> DraftsPage: + """Return a paginated, filtered list of drafts with ratings. + + Returns dict with keys: drafts, total, page, per_page, pages. + """ + pairs = db.drafts_with_ratings(limit=1000) + + # Build author lookup for search (draft_name -> "author1 author2 ...") + author_text_by_draft: dict[str, str] = {} + if search: + rows = db.conn.execute( + """SELECT da.draft_name, GROUP_CONCAT(a.name, ' ') as names + FROM draft_authors da JOIN authors a ON da.person_id = a.person_id + GROUP BY da.draft_name""" + ).fetchall() + for r in rows: + author_text_by_draft[r[0]] = r[1] or "" + + # Filter + filtered = [] + for draft, rating in pairs: + if min_score > 0 and rating.composite_score < min_score: + continue + if category and category not in rating.categories: + continue + if source and draft.source != source: + continue + if search: + author_names = author_text_by_draft.get(draft.name, "") + haystack = f"{draft.name} {draft.title} {rating.summary} {author_names}".lower() + if not all(w in haystack for w in search.lower().split()): + continue + filtered.append((draft, rating)) + + # Sort + sort_keys = { + "score": lambda p: p[1].composite_score, + "name": lambda p: p[0].name, + "date": lambda p: p[0].time or "", + "novelty": lambda p: p[1].novelty, + "maturity": lambda p: p[1].maturity, + "relevance": lambda p: p[1].relevance, + "overlap": lambda p: p[1].overlap, + "momentum": lambda p: p[1].momentum, + "readiness": lambda p: (1.0 if p[0].name.startswith("draft-ietf-") else 0.0) * 0.25 + + min(int(p[0].rev or "0") / 5.0, 1.0) * 0.15 + + ((p[1].momentum - 1) / 4.0) * 0.15, + } + key_fn = sort_keys.get(sort, sort_keys["score"]) + reverse = sort_dir == "desc" + filtered.sort(key=key_fn, reverse=reverse) + + total = len(filtered) + pages = max(1, (total + per_page 
- 1) // per_page) + page = max(1, min(page, pages)) + start = (page - 1) * per_page + page_items = filtered[start : start + per_page] + + # Pre-compute readiness in batch (~6 queries total instead of ~200) + + readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items]) + + drafts = [] + for draft, rating in page_items: + r_score = readiness_cache.get(draft.name, {}).get("score", 0) + drafts.append({ + "name": draft.name, + "title": draft.title, + "date": draft.date, + "url": draft.source_url if draft.source != "ietf" else draft.datatracker_url, + "pages": draft.pages or 0, + "group": draft.group or "individual", + "source": draft.source or "ietf", + "score": round(rating.composite_score, 2), + "novelty": rating.novelty, + "maturity": rating.maturity, + "overlap": rating.overlap, + "momentum": rating.momentum, + "relevance": rating.relevance, + "categories": rating.categories, + "summary": rating.summary, + "readiness": r_score, + }) + + return { + "drafts": drafts, + "total": total, + "page": page, + "per_page": per_page, + "pages": pages, + } + +def get_draft_detail(db: Database, name: str) -> dict | None: + """Return full detail for a single draft.""" + draft = db.get_draft(name) + if not draft: + return None + + rating = db.get_rating(name) + authors = db.get_authors_for_draft(name) + ideas = db.get_ideas_for_draft(name) + refs = db.get_refs_for_draft(name) + + result = { + "name": draft.name, + "title": draft.title, + "rev": draft.rev, + "abstract": draft.abstract, + "date": draft.date, + "time": draft.time, + "url": draft.datatracker_url, + "text_url": draft.text_url, + "pages": draft.pages, + "words": draft.words, + "group": draft.group or "individual", + "categories": draft.categories, + "tags": draft.tags, + "authors": [ + {"name": a.name, "affiliation": a.affiliation, "person_id": a.person_id} + for a in authors + ], + "ideas": ideas, + "refs": [{"type": t, "id": rid} for t, rid in refs], + } + + if rating: + result["rating"] = { + 
"score": round(rating.composite_score, 2), + "novelty": rating.novelty, + "maturity": rating.maturity, + "overlap": rating.overlap, + "momentum": rating.momentum, + "relevance": rating.relevance, + "summary": rating.summary, + "novelty_note": rating.novelty_note, + "maturity_note": rating.maturity_note, + "overlap_note": rating.overlap_note, + "momentum_note": rating.momentum_note, + "relevance_note": rating.relevance_note, + "categories": rating.categories, + } + + # Readiness score + + result["readiness"] = compute_readiness(db, name) + + # Annotation + annotation = db.get_annotation(name) + result["annotation"] = annotation + + return result + +def get_generated_drafts() -> list[dict]: + """Return list of pre-generated draft files in data/reports/generated-drafts/.""" + drafts_dir = _project_root / "data" / "reports" / "generated-drafts" + if not drafts_dir.exists(): + return [] + results = [] + for f in sorted(drafts_dir.glob("draft-*.txt")): + # Extract title from first non-empty content line after header + title = f.stem + text = f.read_text(errors="replace") + for line in text.splitlines(): + stripped = line.strip() + if stripped and not stripped.startswith("Internet-Draft") and \ + not stripped.startswith("Intended status") and \ + not stripped.startswith("Expires:") and stripped != "": + title = stripped + break + results.append({ + "filename": f.name, + "stem": f.stem, + "title": title, + "size": f.stat().st_size, + "path": str(f), + }) + return results + +def read_generated_draft(filename: str) -> str | None: + """Read a generated draft file by filename. 
def read_generated_draft(filename: str) -> str | None:
    """Read a pre-generated draft file by filename.  Returns text or None.

    None is returned when the file is missing or when *filename* escapes
    the generated-drafts directory.
    """
    drafts_dir = _project_root / "data" / "reports" / "generated-drafts"
    path = resolve_generated_path(drafts_dir, filename)
    if path is None or not path.is_file():
        return None
    return path.read_text(errors="replace")


def resolve_generated_path(base_dir: Path, filename: str) -> Path | None:
    """Resolve *filename* inside *base_dir*; return None if it escapes it.

    Internal helper.  Uses Path.is_relative_to for the containment check:
    the previous str.startswith() comparison was separator-unaware and
    also accepted *sibling* directories whose names merely share the
    prefix, e.g. ".../generated-drafts-archive/x" passed a
    ".../generated-drafts" prefix test.
    """
    resolved = (base_dir / filename).resolve()
    if not resolved.is_relative_to(base_dir.resolve()):
        return None
    return resolved


# ==== new file: src/webui/data/gaps.py ====
# """Gap analysis data access functions."""
# (module imports in the real file: from __future__ import annotations;
#  from ietf_analyzer.db import Database)


def get_all_gaps(db: Database) -> list[dict]:
    """Return all gap analysis results, sorted by severity (critical first)."""
    severity_rank = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    gaps = db.all_gaps()
    # Unknown severity strings sort after all known ones (rank 99).
    gaps.sort(key=lambda gap: severity_rank.get(gap.get("severity", "low"), 99))
    return gaps


def get_gap_detail(db: Database, gap_id: int) -> dict | None:
    """Return a single gap by ID, or None if not found."""
    return next((gap for gap in db.all_gaps() if gap["id"] == gap_id), None)


# ==== new file: src/webui/data/proposals.py ====
# """Proposal data access functions."""
# (module imports in the real file: from __future__ import annotations;
#  from ietf_analyzer.db import Database)


def get_all_proposals(db: Database) -> list[dict]:
    """Return all proposals with their linked gap records attached as p["gaps"]."""
    proposals = db.all_proposals()
    gaps_by_id = {gap["id"]: gap for gap in db.all_gaps()}
    for proposal in proposals:
        proposal["gaps"] = [gaps_by_id[gid]
                            for gid in proposal.get("gap_ids", [])
                            if gid in gaps_by_id]
    return proposals
db.all_gaps()} + p["gaps"] = [gaps[gid] for gid in p.get("gap_ids", []) if gid in gaps] + return p + +def get_proposals_for_gap(db: Database, gap_id: int) -> list[dict]: + """Return proposals linked to a specific gap.""" + return db.get_proposals_for_gap(gap_id) diff --git a/src/webui/data/ratings.py b/src/webui/data/ratings.py new file mode 100644 index 0000000..172954e --- /dev/null +++ b/src/webui/data/ratings.py @@ -0,0 +1,155 @@ +"""Rating-related data access functions.""" +from __future__ import annotations + +import json +from collections import Counter, defaultdict + +from ietf_analyzer.db import Database + + +def get_rating_distributions(db: Database) -> dict: + """Return arrays for each rating dimension, suitable for Plotly.""" + pairs = db.drafts_with_ratings(limit=1000) + dims = { + "novelty": [], + "maturity": [], + "overlap": [], + "momentum": [], + "relevance": [], + "scores": [], + "categories": [], + "names": [], + "sources": [], + } + for draft, rating in pairs: + dims["novelty"].append(rating.novelty) + dims["maturity"].append(rating.maturity) + dims["overlap"].append(rating.overlap) + dims["momentum"].append(rating.momentum) + dims["relevance"].append(rating.relevance) + dims["scores"].append(round(rating.composite_score, 2)) + dims["categories"].append(rating.categories[0] if rating.categories else "Other") + dims["names"].append(draft.name) + dims["sources"].append(getattr(draft, "source", "ietf") or "ietf") + return dims + +def get_category_radar_data(db: Database) -> dict: + """Return average rating profiles per category for radar chart.""" + pairs = db.drafts_with_ratings(limit=1000) + cat_ratings: dict[str, list] = defaultdict(list) + for _, r in pairs: + for c in r.categories: + cat_ratings[c].append(r) + + top_cats = sorted(cat_ratings.keys(), key=lambda c: len(cat_ratings[c]), reverse=True)[:8] + result = {} + for cat in top_cats: + ratings = cat_ratings[cat] + n = len(ratings) + result[cat] = { + "count": n, + "novelty": 
def get_score_histogram(db: Database) -> list[float]:
    """Return list of composite scores (rounded to 2 dp) for the histogram."""
    pairs = db.drafts_with_ratings(limit=1000)
    return [round(r.composite_score, 2) for _, r in pairs]


# Words too generic to be informative when profiling false-positive texts.
_FP_STOP_WORDS = frozenset({
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "it", "that", "this", "are", "was",
    "be", "as", "can", "may", "will", "not", "has", "have", "been", "which",
    "their", "its", "also", "such", "these", "would", "should", "could",
    "more", "other", "than", "into", "about", "between", "over", "after",
    "all", "one", "two", "new", "they", "we", "our", "each", "some", "any",
    "there", "what", "when", "how", "where", "who", "does", "do", "did",
    "no", "if", "so", "up", "out", "only", "used", "using", "use", "based",
    "through", "both", "well", "within", "must", "while", "had", "were",
})


def count_fp_terms(rows) -> list[tuple[str, int]]:
    """Return the 30 most common informative words across the rows' texts.

    *rows* are mappings with "abstract" and "title" keys (sqlite3.Row or
    dict); either value may be None.  Words are lowercased, must be at
    least 3 letters, and stop words are dropped.  Internal helper for
    get_false_positive_profile.
    """
    # BUG FIX: the original inlined this logic and called re.findall even
    # though this module never imported `re`, so profiling raised a
    # NameError at runtime as soon as any false positive existed.
    import re
    counts: Counter = Counter()
    for row in rows:
        text = (row["abstract"] or "").lower() + " " + (row["title"] or "").lower()
        for word in re.findall(r"[a-z]{3,}", text):
            if word not in _FP_STOP_WORDS:
                counts[word] += 1
    return counts.most_common(30)


def get_false_positive_profile(db: Database) -> dict:
    """Profile drafts flagged as false positives.

    Compares the flagged drafts against the non-flagged rated drafts:
    per-dimension score arrays, category/source breakdowns, coverage
    percentages, and the most frequent title/abstract terms.
    """
    fp_rows = db.false_positive_drafts_raw()
    nonfp_rows = db.non_false_positive_ratings_raw()
    total_rated = db.rated_count()
    total_drafts = db.count_drafts(include_false_positives=True)

    fp_list = []
    fp_categories: Counter = Counter()
    fp_sources: Counter = Counter()
    fp_dims: dict[str, list] = {
        "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [],
    }

    for row in fp_rows:
        cats = json.loads(row["r_categories"]) if row["r_categories"] else []
        src = row["source"] or "ietf"
        fp_list.append({
            "name": row["name"],
            "title": row["title"],
            "source": src,
            "categories": cats,
            "relevance": row["relevance"],
            "novelty": row["novelty"],
            "maturity": row["maturity"],
            "overlap": row["overlap"],
            "momentum": row["momentum"],
            "summary": row["summary"] or "",
        })
        for cat in cats:
            fp_categories[cat] += 1
        fp_sources[src] += 1
        for dim in fp_dims:
            fp_dims[dim].append(row[dim])

    # Non-FP dimensions and categories for side-by-side comparison.
    nonfp_dims: dict[str, list] = {
        "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [],
    }
    nonfp_categories: Counter = Counter()
    for row in nonfp_rows:
        for dim in nonfp_dims:
            nonfp_dims[dim].append(row[dim])
        cats = json.loads(row["r_categories"]) if row["r_categories"] else []
        for cat in cats:
            nonfp_categories[cat] += 1

    top_terms = count_fp_terms(fp_rows)

    return {
        "count": len(fp_list),
        "total_rated": total_rated,
        "total_drafts": total_drafts,
        "pct_of_total": round(100 * len(fp_list) / total_drafts, 1) if total_drafts else 0,
        "pct_of_rated": round(100 * len(fp_list) / total_rated, 1) if total_rated else 0,
        "fp_list": fp_list,
        "fp_categories": dict(fp_categories.most_common()),
        "fp_sources": dict(fp_sources.most_common()),
        "fp_dims": fp_dims,
        "nonfp_dims": nonfp_dims,
        "top_terms": top_terms,
        "nonfp_categories": dict(nonfp_categories.most_common(20)),
    }
+ LIMIT 50""", + (like, like, like), + ).fetchall() + for r in rows: + results["drafts"].append({ + "name": r["name"], + "title": r["title"], + "abstract": (r["abstract"] or "")[:200], + "date": r["time"], + "group": r["group"] or "individual", + }) + + # 2. Ideas via LIKE + like = f"%{q}%" + rows = db.conn.execute( + """SELECT id, title, description, idea_type, draft_name FROM ideas + WHERE title LIKE ? OR description LIKE ? + ORDER BY id LIMIT 50""", + (like, like), + ).fetchall() + for r in rows: + results["ideas"].append({ + "id": r["id"], + "title": r["title"], + "description": (r["description"] or "")[:200], + "type": r["idea_type"], + "draft_name": r["draft_name"], + }) + + # 3. Authors via LIKE + results["authors"] = db.search_authors(q, limit=50) + + # 4. Gaps via LIKE + results["gaps"] = db.search_gaps(q, limit=50) + + return results + +def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: + """Search-only (free) — returns sources + cached answer if available.""" + config = Config.load() + searcher = HybridSearch(config, db) + return searcher.search_only(question, top_k=top_k) + +def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: + """Run Claude synthesis (costs tokens, result is cached permanently).""" + config = Config.load() + searcher = HybridSearch(config, db) + return searcher.ask(question, top_k=top_k, cheap=cheap) diff --git a/src/webui/templates/errors/404.html b/src/webui/templates/errors/404.html new file mode 100644 index 0000000..8ca19ee --- /dev/null +++ b/src/webui/templates/errors/404.html @@ -0,0 +1,23 @@ +{% extends "base.html" %} + +{% block title %}404 — Not Found{% endblock %} + +{% block content %} +
  <div class="error-content">
    <div class="error-code">
      404
    </div>
    <div class="error-title">
      Page Not Found
    </div>
    <div class="error-message">
      The page you're looking for doesn't exist or has been moved.
    </div>
    <a href="/" class="error-link">&larr; Back to overview</a>
  </div>
</div>
{% endblock %}
diff --git a/src/webui/templates/errors/500.html b/src/webui/templates/errors/500.html
new file mode 100644
index 0000000..59bb879
--- /dev/null
+++ b/src/webui/templates/errors/500.html
@@ -0,0 +1,20 @@
{% extends "base.html" %}

{% block title %}500 — Server Error{% endblock %}

{% block content %}
<div class="error-page">
  <div class="error-content">
    <div class="error-code">
      500
    </div>
    <div class="error-title">
      Internal Server Error
    </div>
    <div class="error-message">
      Something went wrong on our end. Please try again later.
    </div>
    <a href="/" class="error-link">&larr; Back to overview</a>
  </div>
</div>
{% endblock %}