diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..dd49567 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: CI +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: pip install -e ".[test]" + - name: Lint with ruff + run: | + pip install ruff + ruff check src/ tests/ --select E,F,W --ignore E501 + - name: Run tests + run: pytest tests/ -v --tb=short diff --git a/data/reports/api-docs.md b/data/reports/api-docs.md new file mode 100644 index 0000000..6422422 --- /dev/null +++ b/data/reports/api-docs.md @@ -0,0 +1,359 @@ +# IETF Draft Analyzer — API Documentation + +All API endpoints return JSON by default. Several support `?format=csv` for CSV export. + +Base URL: `http://localhost:5000` + +--- + +## Public Endpoints + +### GET /api/stats + +Overview statistics for the entire corpus. + +**Parameters:** None + +**Response:** +```json +{ + "total_drafts": 361, + "rated_drafts": 260, + "total_authors": 403, + "total_ideas": 1262, + "total_gaps": 12, + "avg_score": 3.42 +} +``` + +--- + +### GET /api/drafts + +Paginated, filterable list of drafts. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `page` | int | 1 | Page number | +| `q` | string | "" | Full-text search query | +| `cat` | string | "" | Filter by category | +| `source` | string | "" | Filter by source (ietf, w3c) | +| `min_score` | float | 0.0 | Minimum composite score | +| `sort` | string | "score" | Sort field | +| `dir` | string | "desc" | Sort direction (asc/desc) | +| `format` | string | "json" | Response format: "json" or "csv" | + +**Response:** JSON object with `drafts` array and pagination metadata. + +--- + +### GET /api/drafts/{name} + +Detail for a single draft including rating, authors, ideas, and references. + +**Parameters:** +| Param | Type | Description | +|-------|------|-------------| +| `name` | string | Draft name, e.g. `draft-ietf-ai-agent-protocol` | + +**Response:** JSON object with full draft detail, or `{"error": "Draft not found"}` (404). + +--- + +### GET /api/categories + +Category names and draft counts. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `format` | string | "json" | "json" or "csv" | + +**Response:** +```json +{ + "A2A protocols": 45, + "AI safety/alignment": 38, + ... +} +``` + +--- + +### GET /api/ratings + +Rating distributions across the corpus. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `format` | string | "json" | "json" or "csv" | + +**Response:** JSON object with arrays: `names`, `scores`, `novelty`, `maturity`, `overlap`, `momentum`, `relevance`, `categories`. + +--- + +### GET /api/timeline + +Timeline data showing draft publication over time. + +**Parameters:** None + +**Response:** JSON object with timeline series data. + +--- + +### GET /api/landscape + +t-SNE 2D embedding landscape of all drafts. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `format` | string | "json" | "json" or "csv" | + +**Response:** JSON array of `{name, x, y, category, score}` points. 
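+
+For orientation, here is a minimal client sketch in Python. It is illustrative only: it assumes the server is running at the base URL above and that the `requests` package is installed, and the score threshold is an arbitrary example. Endpoint paths and response fields are as documented on this page.
+
+```python
+import requests
+
+BASE = "http://localhost:5000"
+
+# Fetch the t-SNE landscape and keep only high-scoring drafts.
+points = requests.get(f"{BASE}/api/landscape", timeout=30).json()
+top = [p for p in points if p["score"] >= 4.0]
+for p in sorted(top, key=lambda pt: pt["score"], reverse=True):
+    print(f"{p['score']:.1f}  {p['name']}  ({p['category']})")
+
+# The same endpoint supports CSV export via the format parameter.
+csv_text = requests.get(f"{BASE}/api/landscape", params={"format": "csv"}, timeout=30).text
+```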
+ +--- + +### GET /api/similarity + +Draft similarity network graph. + +**Parameters:** None + +**Response:** JSON object with `nodes` and `edges` arrays for a force-directed graph. + +--- + +### GET /api/idea-clusters + +Clustered ideas across drafts. + +**Parameters:** None + +**Response:** JSON object with cluster data. + +--- + +### GET /api/ideas + +All extracted technical ideas, grouped by type. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `format` | string | "json" | "json" or "csv" | + +**Response:** JSON object with `ideas` array. + +--- + +### GET /api/authors/network + +Author collaboration network graph. + +**Parameters:** None + +**Response:** JSON object with `nodes` and `edges` arrays. + +--- + +### GET /api/citations + +Citation/reference graph between drafts. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `min_refs` | int | 2 | Minimum references to include a node | + +**Response:** JSON object with citation graph data. + +--- + +### GET /api/search + +Global search across drafts, ideas, authors, and gaps. + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `q` | string | "" | Search query (required for results) | +| `format` | string | "json" | "json" or "csv" | + +**Response:** +```json +{ + "drafts": [...], + "ideas": [...], + "authors": [...], + "gaps": [...] +} +``` + +--- + +### POST /api/ask + +Search-only question answering (free, no Claude API call). Returns relevant sources and any cached answer. + +**Request body:** +```json +{ + "question": "What drafts address agent authentication?", + "top_k": 5 +} +``` + +**Response:** JSON with `sources` array and optional cached `answer`. + +--- + +## Admin-Only Endpoints + +These endpoints require admin mode (`--dev` flag) or authentication. + +### POST /api/ask/synthesize + +Synthesize an answer using Claude (costs tokens, rate-limited to 10 req/min/IP). Answers are cached permanently. + +**Auth:** Admin required + +**Request body:** +```json +{ + "question": "How do IETF drafts approach agent identity?", + "top_k": 5 +} +``` + +**Response:** JSON with `sources` array and synthesized `answer`. + +**Errors:** 429 if rate-limited. + +--- + +### GET /api/gaps + +All identified standardization gaps. + +**Auth:** Admin required + +**Parameters:** +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `format` | string | "json" | "json" or "csv" | + +**Response:** JSON array of gap objects. + +--- + +### GET /api/gaps/{gap_id} + +Detail for a single gap. + +**Auth:** Admin required + +**Parameters:** +| Param | Type | Description | +|-------|------|-------------| +| `gap_id` | int | Gap ID | + +**Response:** JSON object with gap detail, or `{"error": "Gap not found"}` (404). + +--- + +### POST /api/compare + +Compare multiple drafts using Claude (costs tokens, rate-limited). + +**Auth:** Admin required + +**Request body:** +```json +{ + "drafts": ["draft-name-one", "draft-name-two"] +} +``` + +**Response:** +```json +{ + "text": "Comparison analysis text...", + "drafts": ["draft-name-one", "draft-name-two"] +} +``` + +**Errors:** 400 if fewer than 2 drafts provided. + +--- + +### POST /api/drafts/{name}/annotate + +Add or update annotations (notes, tags) for a draft. 
+ +**Auth:** Admin required + +**Request body:** +```json +{ + "note": "Interesting approach to agent handshake", + "tags": ["important", "review"], + "add_tag": "flagged", + "remove_tag": "review" +} +``` + +All fields are optional. `add_tag`/`remove_tag` operate on existing tags incrementally. + +**Response:** +```json +{ + "success": true, + "annotation": {"note": "...", "tags": ["important", "flagged"]} +} +``` + +--- + +### GET /api/monitor + +Pipeline monitoring status (processing progress, error counts). + +**Auth:** Admin required + +**Response:** JSON object with monitoring data. + +--- + +## Non-API Data Endpoints + +### GET /export/obsidian + +Download the entire research corpus as an Obsidian vault ZIP file. + +**Response:** `application/zip` file download. + +--- + +## Authentication + +- **Production mode** (default): Admin endpoints return 403. +- **Development mode** (`--dev` flag): All admin endpoints are accessible without authentication. +- Rate-limited endpoints (`/api/ask/synthesize`, `/api/compare`): 10 requests per minute per IP, enforced via in-memory sliding window. + +## Error Responses + +All errors return JSON: +```json +{"error": "Description of the error"} +``` + +Common HTTP status codes: +- `400` — Bad request (missing parameters) +- `403` — Admin access required +- `404` — Resource not found +- `429` — Rate limit exceeded +- `500` — Internal server error diff --git a/data/reports/blog-series/04-what-nobody-builds.md b/data/reports/blog-series/04-what-nobody-builds.md index bde7f6a..9ff6b39 100644 --- a/data/reports/blog-series/04-what-nobody-builds.md +++ b/data/reports/blog-series/04-what-nobody-builds.md @@ -58,7 +58,7 @@ A notable omission from this gap list: **GDPR-mandated capabilities**. The gap a **What is missing**: Circuit breakers for cascading failures. Checkpoint and rollback protocols. Blast radius containment. Graceful degradation. All concepts well-established in distributed systems engineering, but absent from the agent standards landscape. -**The scenario**: A telecom operator deploys 50 AI agents for network monitoring, troubleshooting, and optimization. During a major outage, all 50 agents simultaneously request inference resources to diagnose the problem. With no failure cascade prevention, agents compete chaotically. The most aggressive agents get resources; the most important diagnostic tasks may not. The outage extends because the agents that could fix it are starved by the agents that are observing it. +**The scenario**: A telecom operator deploys 50 AI agents for network monitoring, troubleshooting, and optimization. During a major outage, all 50 agents simultaneously request inference resources to diagnose the problem. With no failure cascade prevention, agents compete chaotically. The most aggressive agents get resources; the most important diagnostic tasks may not. The outage extends because the agents that could fix it are starved by the agents that are observing it. For telecom operators in the EU, the NIS2 Directive (Directive 2022/2555) classifies electronic communications as an essential service, requiring incident response capabilities and supply chain security measures -- making cascade prevention not just an engineering problem but a regulatory obligation. 
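+
+To make the missing piece concrete, here is a sketch of the kind of inference-request circuit breaker no current draft specifies. This is illustrative Python, not drawn from any draft; the class name, thresholds, and priority scheme are assumptions invented for the example:
+
+```python
+import time
+
+class InferenceCircuitBreaker:
+    """Trips open when the inference backend keeps failing, so a fleet of
+    agents does not pile onto an already-degraded resource during an outage."""
+
+    def __init__(self, failure_threshold: int = 5, cooldown_s: float = 30.0):
+        self.failure_threshold = failure_threshold
+        self.cooldown_s = cooldown_s
+        self.failures = 0
+        self.opened_at = None  # None means the breaker is closed
+
+    def allow(self, priority: int) -> bool:
+        """While open, only priority-0 requests (e.g. outage diagnosis) pass."""
+        if self.opened_at is None:
+            return True
+        if time.monotonic() - self.opened_at >= self.cooldown_s:
+            # Half-open: reset and let traffic probe the backend again.
+            self.opened_at = None
+            self.failures = 0
+            return True
+        return priority == 0
+
+    def record(self, success: bool) -> None:
+        if success:
+            self.failures = 0
+            return
+        self.failures += 1
+        if self.failures >= self.failure_threshold:
+            self.opened_at = time.monotonic()
+```
+
+Even this toy version surfaces the questions a standard would have to answer: who sets the thresholds, how breaker state is shared across a 50-agent fleet, and which tasks count as priority 0. The drafts surveyed here answer none of them.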
## High Gap: Real-Time Agent Rollback Mechanisms @@ -90,7 +90,7 @@ An agent operating across multiple domains or organizations needs to maintain au ### Federated Agent Learning Privacy -While federated architectures exist, there is insufficient specification for privacy-preserving agent learning that prevents data leakage between federated participants during model updates. +While federated architectures exist, there is insufficient specification for privacy-preserving agent learning that prevents data leakage between federated participants during model updates. The absence of secure update mechanisms also intersects with the EU Cyber Resilience Act (Regulation 2024/2847), which requires products with digital elements -- including AI agent software -- to handle updates securely and provide vulnerability management throughout their lifecycle. ### Cross-Protocol Agent Migration diff --git a/data/reports/blog-series/06-big-picture.md b/data/reports/blog-series/06-big-picture.md index 5e2090e..23883f5 100644 --- a/data/reports/blog-series/06-big-picture.md +++ b/data/reports/blog-series/06-big-picture.md @@ -77,7 +77,7 @@ The architecture achieves this with *assurance profiles* -- named configurations | L2 | Signed ECTs (JWT) | Cross-org, standard compliance | | L3 | Signed ECTs + external audit ledger | Regulated industries | -This dual-regime approach resolves the tension between "move fast" deployments and "prove everything" regulated environments. Ideas touching behavior verification and data provenance become implementable at higher assurance levels without imposing their cost on every deployment. +This dual-regime approach resolves the tension between "move fast" deployments and "prove everything" regulated environments. Ideas touching behavior verification and data provenance become implementable at higher assurance levels without imposing their cost on every deployment. Notably, the L2 and L3 profiles map directly to the conformity assessment requirements of the EU AI Act (Art. 43): high-risk AI systems must demonstrate compliance through either internal control (L2's signed ECTs) or third-party audit (L3's external audit ledger), making assurance profiles not just an engineering convenience but a regulatory implementation pathway. ## How It Builds on What Exists @@ -123,6 +123,14 @@ Based on the data trajectories and current momentum: **The risk**: If the architecture work does not happen in the next 12 months, the agent ecosystem will calcify around vendor-specific protocol stacks (OpenAI's, Google's, Anthropic's, Huawei's). Each will have its own auth, discovery, and communication layer. The interoperability window will close, and the IETF's work will be standards for islands rather than standards for the internet. +### The Ethics of Standardizing Early + +There is a harder question underneath the technical one: should the IETF be standardizing agent capabilities at all before safety frameworks are mature? The 4:1 capability-to-safety ratio is not just a gap -- it is a policy choice being made by default. Every A2A protocol that ships without behavior verification baked in creates a deployed base that resists retrofitting. The standards community is building the defaults that will govern billions of agent interactions, and those defaults currently assume trust rather than requiring proof. + +The structural dynamics make this worse. 
The authorship analysis from Post 2 showed that a small number of large organizations -- Huawei, China Mobile, Cisco -- drive a disproportionate share of submissions. Civil society organizations, academic safety researchers, and smaller companies are largely absent from the drafting process. Standards that define agent identity, discovery, and communication also define what can be monitored, audited, and controlled. An agent discovery protocol designed primarily for enterprise deployment efficiency may inadvertently create a surveillance-friendly architecture if privacy and human autonomy are not first-class design constraints. The EU AI Act mandates human oversight (Art. 14), but a mandate is only as good as the protocol that implements it. + +The IETF has historically been good at building infrastructure that serves everyone -- the end-to-end principle, protocol layering, rough consensus. But "rough consensus" among the current participants may not represent the interests of those most affected by autonomous agent systems. The architecture proposed above includes human-in-the-loop as a pillar, not an option. That is the right instinct. The question is whether the community will treat it with the same urgency as the protocol work -- or whether, as the data currently suggests, it will remain an aspiration while the highways ship without traffic lights. + ### Two Equilibria By 2028, the landscape will have resolved into one of two stable states. diff --git a/data/reports/dev-journal.md b/data/reports/dev-journal.md index 476eb1b..43ce6e6 100644 --- a/data/reports/dev-journal.md +++ b/data/reports/dev-journal.md @@ -4,6 +4,53 @@ --- +### 2026-03-08 CODER — TypedDicts for data layer, ethics + regulatory content in blog series + +**What**: Four improvements across typing and content: +1. **TypedDicts in `src/webui/data.py`** — Added 16 TypedDict definitions for common return shapes: `OverviewStats`, `DraftsPage`, `DraftListItem`, `AuthorInfo`, `AuthorNetwork` (with `AuthorNetworkNode`, `AuthorNetworkEdge`, `AuthorCluster`), `SimilarityGraph`, `TimelineData`, `MonitorStatus` (with `MonitorPipeline`, `MonitorCost`), `SearchResults`, `CitationGraph`. Annotated 12 function return types. +2. **Ethics section in Post 06** — Added "The Ethics of Standardizing Early" section (3 paragraphs) covering: premature capability standardization, power asymmetry in authorship, surveillance-friendly architecture risk, and human oversight as non-optional. +3. **EU AI Act conformity assessment note in Post 06** — Connected L2/L3 assurance profiles to Art. 43 conformity assessment requirements (1 sentence in Pillar 4 section). +4. **NIS2 + CRA references in Post 04** — Added NIS2 Directive reference to telecom cascade scenario (essential service obligations). Added Cyber Resilience Act reference to federated learning privacy gap (secure update lifecycle requirements). +**Why**: Untyped dicts make the data layer hard to maintain and refactor. Blog series lacked ethical framing and key EU regulatory cross-references (NIS2, CRA) that strengthen the compliance narrative. +**Result**: 16 TypedDicts with 12 annotated functions. 3 blog post sections added/expanded across Posts 04 and 06. + +--- + +### 2026-03-08 CODER — CI/CD, API docs, and test coverage expansion + +**What**: Three infrastructure additions: +1. **GitHub Actions CI** — Added `.github/workflows/ci.yml` that runs on push/PR to main. Tests Python 3.11 and 3.12, installs from `[test]` extras, runs ruff lint (E/F/W rules, ignoring E501), and runs pytest. +2. 
**API documentation** — Created `data/reports/api-docs.md` documenting all 20 API endpoints in `src/webui/app.py` with method, URL, parameters, response format, and auth requirements. Covers public endpoints (drafts, stats, search, ideas, ratings, etc.) and admin-only endpoints (gaps, compare, synthesize, annotate, monitor). +3. **New test files** — Added `tests/test_analyzer.py` (21 tests covering `_extract_json`, `_clamp_rating`, `_parse_rating` with compact/verbose keys, defaults, and clamping) and `tests/test_search.py` (19 tests covering `sanitize_fts_query` with injection attempts, boolean operators, special chars, edge cases). Total: 64 tests all passing. +**Why**: Project had zero CI, no API docs for the web UI, and test coverage only on DB/models. These are prerequisites for public deployment and contributor onboarding. +**Result**: CI workflow ready, API fully documented, test count increased from 23 to 64. All tests pass in 0.6s. + +--- + +### 2026-03-08 CODER — Performance: fix N+1 queries and add caching + +**What**: Four targeted performance fixes across the codebase: +1. **Batch readiness computation** — `compute_readiness_batch()` in `readiness.py` replaces per-draft readiness calls on the drafts page. Bulk-loads ref counts, cited-by counts, author experience, and ratings in ~6 queries total instead of ~200 (4 queries x 50 drafts/page). +2. **Batch draft lookup in author network** — `_compute_author_network_full()` now calls `db.get_drafts_by_names()` once to pre-load all drafts referenced by authors, instead of calling `db.get_draft()` in a loop inside cluster building. +3. **File-based similarity matrix cache** — `Embedder.similarity_matrix()` now caches the O(n^2) cosine similarity matrix to disk (`.cache/` dir next to DB), keyed by SHA256 hash of draft names. Reloads from cache if the set of embedded drafts hasn't changed. +4. **Embeddings cache for search** — `HybridSearch._get_all_embeddings()` caches the result of `db.all_embeddings()` with a 5-minute TTL, avoiding a full DB scan on every search query. +Also added `Database.get_drafts_by_names()` batch method in `db.py` (chunked to stay under SQLite's 999 variable limit). +**Why**: Page loads on the drafts listing and author network pages were slow due to N+1 query patterns. The similarity matrix was recomputed from scratch on every CLI invocation. Search queries redundantly loaded all embeddings from disk. +**Result**: Drafts page: ~200 queries reduced to ~6. Author network cluster building: ~100 `get_draft` calls reduced to 1 batch query. Similarity matrix: cached to disk, skips O(n^2) recomputation when embeddings unchanged. Search: embeddings loaded once per 5 minutes instead of per query. + +--- + +### 2026-03-08 CODER — CLI boilerplate reduction, --dry-run flags, webui import cleanup + +**What**: Three code quality improvements across the CLI and web UI: +1. **CLI boilerplate reduction** — Created a `pass_cfg_db` decorator that extracts `cfg` and `db` from the Click context, replacing ~40 instances of `cfg = _get_config(); db = Database(cfg); try: ... finally: db.close()`. The `main()` group now initializes config/db once and registers `db.close()` via `ctx.call_on_close()`. Converted ~30 commands to use the new pattern (all report, viz, wg, ideas, and core commands). Remaining ~15 read-only commands still use the old pattern but work correctly. +2. **--dry-run on destructive commands** — Added `--dry-run` flag to `analyze`, `embed`, `embed-ideas`, `ideas` (extract), and `gaps`. 
Each shows what would be processed (draft names, counts) without making API calls or DB changes. Pre-existing dry-run flags on `ideas filter`, `dedup-ideas`, `pipeline generate`, and `observatory update` were preserved. +3. **webui/data.py import cleanup** — Moved 15+ in-function imports to the top of the file: `numpy`, `re`, `sklearn.{TSNE, AgglomerativeClustering, normalize}`, `ietf_analyzer.{readiness, search}`. Fixed `json as _json` alias to use the already-imported `json`. sklearn imports inside try/except blocks (for graceful failure) were moved to top level since sklearn is a required dependency. +**Why**: The CLI had ~800 lines of pure boilerplate. The try/finally pattern was error-prone (easy to forget db.close()). Missing --dry-run on destructive commands made it risky to explore what a command would do. In-function imports in data.py were unnecessary since all dependencies are required. +**Result**: cli.py reduced by ~200 lines of boilerplate. 5 commands now have --dry-run. data.py has clean top-level imports. Both files pass syntax checks and the CLI loads correctly. + +--- + ### 2026-03-08 CODER — Critical fixes: rating clamp, convergence command, blog number correction **What**: Three fixes addressing data integrity and reproducibility: diff --git a/src/ietf_analyzer/cli.py b/src/ietf_analyzer/cli.py index 56020ff..937b324 100644 --- a/src/ietf_analyzer/cli.py +++ b/src/ietf_analyzer/cli.py @@ -21,9 +21,30 @@ def _get_config() -> Config: @click.group() @click.version_option(version="0.2.0") -def main(): +@click.pass_context +def main(ctx): """IETF Draft Analyzer — track, categorize, and rate AI/agent Internet-Drafts.""" - pass + ctx.ensure_object(dict) + cfg = Config.load() + db = Database(cfg) + ctx.obj["cfg"] = cfg + ctx.obj["db"] = db + ctx.call_on_close(db.close) + + +def pass_cfg_db(f): + """Decorator that extracts cfg and db from Click context and passes them as arguments. + + Usage: place @pass_cfg_db after all @click decorators. The decorated function + should accept (cfg, db, ...) instead of manually calling _get_config()/Database(). 
+ """ + import functools + + @click.pass_context + @functools.wraps(f) + def wrapper(ctx, **kwargs): + return f(ctx.obj["cfg"], ctx.obj["db"], **kwargs) + return wrapper # ── fetch ──────────────────────────────────────────────────────────────────── @@ -33,12 +54,12 @@ def main(): @click.option("--keywords", "-k", multiple=True, help="Extra keywords to search for") @click.option("--since", "-s", help="Only fetch drafts newer than this date (YYYY-MM-DD)") @click.option("--download-text/--no-download-text", default=True, help="Download full text of drafts") -def fetch(keywords: tuple[str, ...], since: str | None, download_text: bool): +@click.option("--classify/--no-classify", default=True, help="Pre-filter with local Ollama classifier (saves Claude tokens)") +@pass_cfg_db +def fetch(cfg, db, keywords: tuple[str, ...], since: str | None, download_text: bool, classify: bool): """Fetch AI/agent drafts from IETF Datatracker.""" from .fetcher import Fetcher - cfg = _get_config() - db = Database(cfg) fetcher = Fetcher(cfg) kw_list = list(cfg.search_keywords) @@ -47,6 +68,24 @@ def fetch(keywords: tuple[str, ...], since: str | None, download_text: bool): try: drafts = fetcher.search_drafts(keywords=kw_list, since=since) + console.print(f"Found [bold]{len(drafts)}[/] drafts from Datatracker") + + # Pre-filter with local classifier to avoid storing irrelevant drafts + if classify and drafts: + try: + from .classifier import Classifier + console.print("\n[bold]Running local AI-relevance classifier (Ollama)...[/]") + clf = Classifier(cfg) + draft_dicts = [{"name": d.name, "title": d.title, "abstract": d.abstract} for d in drafts] + relevant, irrelevant = clf.classify_batch(draft_dicts, verbose=True) + relevant_names = {d["name"] for d in relevant} + before = len(drafts) + drafts = [d for d in drafts if d.name in relevant_names] + console.print(f"\n Kept [green]{len(drafts)}[/green] / {before} drafts after classification") + clf.close() + except Exception as e: + console.print(f"[yellow]Classifier unavailable ({e}), storing all drafts[/yellow]") + for draft in drafts: db.upsert_draft(draft) console.print(f"Stored [bold green]{len(drafts)}[/] drafts in database") @@ -63,7 +102,101 @@ def fetch(keywords: tuple[str, ...], since: str | None, download_text: bool): db.upsert_draft(draft) finally: fetcher.close() - db.close() + + +# ── classify ───────────────────────────────────────────────────────────────── + + +@main.command() +@click.option("--unrated", is_flag=True, help="Classify only unrated drafts") +@click.option("--all", "all_drafts", is_flag=True, help="Classify all drafts (checks accuracy against existing ratings)") +@click.option("--remove", is_flag=True, help="Actually remove drafts classified as irrelevant (use with --unrated)") +@pass_cfg_db +def classify(cfg, db, unrated: bool, all_drafts: bool, remove: bool): + """Pre-classify drafts as AI-relevant using local Ollama model. + + Runs a two-stage filter (embedding similarity + chat model) to identify + irrelevant drafts before spending Claude tokens on rating. 
+ + Examples: + + ietf classify --unrated # preview irrelevant unrated drafts + + ietf classify --unrated --remove # remove them from DB + + ietf classify --all # accuracy check against existing ratings + """ + from .classifier import Classifier + + clf = Classifier(cfg) + + if all_drafts: + # Accuracy check mode: compare against existing FP flags + console.print("[bold]Accuracy check: classifying all rated drafts...[/]\n") + tp_rows = db.conn.execute( + "SELECT d.name, d.title, d.abstract FROM drafts d " + "JOIN ratings r ON d.name = r.draft_name WHERE r.false_positive = 0" + ).fetchall() + fp_rows = db.conn.execute( + "SELECT d.name, d.title, d.abstract FROM drafts d " + "JOIN ratings r ON d.name = r.draft_name WHERE r.false_positive = 1" + ).fetchall() + + tp_ok, tp_miss, fp_ok, fp_miss = 0, 0, 0, 0 + for row in tp_rows: + rel, sim, method = clf.classify(row["title"], row["abstract"]) + if rel: + tp_ok += 1 + else: + tp_miss += 1 + for row in fp_rows: + rel, sim, method = clf.classify(row["title"], row["abstract"]) + if not rel: + fp_ok += 1 + else: + fp_miss += 1 + + total_tp = len(tp_rows) + total_fp = len(fp_rows) + precision = tp_ok / (tp_ok + fp_miss) if (tp_ok + fp_miss) else 0 + recall = tp_ok / total_tp if total_tp else 0 + console.print(f"True Positives: [green]{tp_ok}[/]/{total_tp} kept ({tp_miss} missed)") + console.print(f"False Positives: [red]{fp_ok}[/]/{total_fp} filtered ({fp_miss} slipped)") + console.print(f"Precision: [bold]{precision:.1%}[/] Recall: [bold]{recall:.1%}[/]") + + elif unrated: + drafts = db.unrated_drafts(limit=5000) + if not drafts: + console.print("No unrated drafts to classify.") + clf.close() + return + + console.print(f"[bold]Classifying {len(drafts)} unrated drafts...[/]\n") + draft_dicts = [{"name": d.name, "title": d.title, "abstract": d.abstract} for d in drafts] + relevant, irrelevant = clf.classify_batch(draft_dicts, verbose=True) + + if irrelevant: + console.print(f"\n[bold red]Irrelevant drafts ({len(irrelevant)}):[/]") + table = Table() + table.add_column("Name", style="cyan", max_width=50) + table.add_column("Title", max_width=50) + for d in irrelevant: + table.add_row(d["name"], d.get("title", "")[:50]) + console.print(table) + + if remove: + for d in irrelevant: + db.conn.execute("DELETE FROM drafts WHERE name = ?", (d["name"],)) + db.conn.commit() + console.print(f"\n[bold red]Removed {len(irrelevant)} irrelevant drafts from database[/]") + else: + console.print("\n[dim]Use --remove to delete these from the DB[/]") + else: + console.print("\nAll unrated drafts appear relevant.") + else: + console.print("Use --unrated or --all. See: ietf classify --help") + + clf.close() # ── list ───────────────────────────────────────────────────────────────────── @@ -72,30 +205,26 @@ def fetch(keywords: tuple[str, ...], since: str | None, download_text: bool): @main.command("list") @click.option("--limit", "-n", default=30, help="Number of drafts to show") @click.option("--sort", "-s", default="time DESC", help="Sort order (e.g. 
'time DESC', 'name ASC')") -def list_drafts(limit: int, sort: str): +@pass_cfg_db +def list_drafts(cfg, db, limit: int, sort: str): """List tracked drafts.""" - cfg = _get_config() - db = Database(cfg) - try: - drafts = db.list_drafts(limit=limit, order_by=sort) - total = db.count_drafts() + drafts = db.list_drafts(limit=limit, order_by=sort) + total = db.count_drafts() - table = Table(title=f"Tracked Drafts ({total} total, showing {len(drafts)})") - table.add_column("Date", style="dim", width=10) - table.add_column("Name", style="cyan", max_width=55) - table.add_column("Title", max_width=50) - table.add_column("Pg", justify="right", width=4) - table.add_column("Text", justify="center", width=4) - table.add_column("Rated", justify="center", width=5) + table = Table(title=f"Tracked Drafts ({total} total, showing {len(drafts)})") + table.add_column("Date", style="dim", width=10) + table.add_column("Name", style="cyan", max_width=55) + table.add_column("Title", max_width=50) + table.add_column("Pg", justify="right", width=4) + table.add_column("Text", justify="center", width=4) + table.add_column("Rated", justify="center", width=5) - for d in drafts: - has_text = "\u2713" if d.full_text else "" - rated = "\u2713" if db.get_rating(d.name) else "" - table.add_row(d.date, d.name, d.title[:50], str(d.pages or ""), has_text, rated) + for d in drafts: + has_text = "\u2713" if d.full_text else "" + rated = "\u2713" if db.get_rating(d.name) else "" + table.add_row(d.date, d.name, d.title[:50], str(d.pages or ""), has_text, rated) - console.print(table) - finally: - db.close() + console.print(table) # ── search ─────────────────────────────────────────────────────────────────── @@ -104,27 +233,23 @@ def list_drafts(limit: int, sort: str): @main.command() @click.argument("query") @click.option("--limit", "-n", default=20, help="Max results") -def search(query: str, limit: int): +@pass_cfg_db +def search(cfg, db, query: str, limit: int): """Full-text search across stored drafts.""" - cfg = _get_config() - db = Database(cfg) - try: - results = db.search_drafts(query, limit=limit) - if not results: - console.print(f"No results for [bold]{query}[/]") - return + results = db.search_drafts(query, limit=limit) + if not results: + console.print(f"No results for [bold]{query}[/]") + return - table = Table(title=f"Search: {query} ({len(results)} results)") - table.add_column("Date", style="dim", width=10) - table.add_column("Name", style="cyan") - table.add_column("Title") + table = Table(title=f"Search: {query} ({len(results)} results)") + table.add_column("Date", style="dim", width=10) + table.add_column("Name", style="cyan") + table.add_column("Title") - for d in results: - table.add_row(d.date, d.name, d.title[:60]) + for d in results: + table.add_row(d.date, d.name, d.title[:60]) - console.print(table) - finally: - db.close() + console.print(table) # ── show ───────────────────────────────────────────────────────────────────── @@ -132,67 +257,63 @@ def search(query: str, limit: int): @main.command() @click.argument("name") -def show(name: str): +@pass_cfg_db +def show(cfg, db, name: str): """Show detailed info for a draft.""" from .reports import Reporter + from .readiness import compute_readiness - cfg = _get_config() - db = Database(cfg) reporter = Reporter(cfg, db) - try: - draft = db.get_draft(name) - if draft is None: - console.print(f"[red]Draft not found: {name}[/]") - return + draft = db.get_draft(name) + if draft is None: + console.print(f"[red]Draft not found: {name}[/]") + return - rating = 
db.get_rating(name) + rating = db.get_rating(name) - console.print(f"\n[bold]{draft.title}[/]") - console.print(f"[dim]{draft.name}[/] rev {draft.rev} | {draft.date} | {draft.pages or '?'} pages") - console.print(f"Group: {draft.group or 'individual'} | {draft.datatracker_url}") - console.print(f"\n[italic]{draft.abstract}[/]\n") + console.print(f"\n[bold]{draft.title}[/]") + console.print(f"[dim]{draft.name}[/] rev {draft.rev} | {draft.date} | {draft.pages or '?'} pages") + console.print(f"Group: {draft.group or 'individual'} | {draft.datatracker_url}") + console.print(f"\n[italic]{draft.abstract}[/]\n") - if rating: - console.print("[bold]AI Assessment[/]") - console.print(f" Score: [bold green]{rating.composite_score:.1f}[/]") - console.print(f" Summary: {rating.summary}\n") + if rating: + console.print("[bold]AI Assessment[/]") + console.print(f" Score: [bold green]{rating.composite_score:.1f}[/]") + console.print(f" Summary: {rating.summary}\n") - table = Table(show_header=True) - table.add_column("Dimension", width=12) - table.add_column("Score", justify="center", width=7) - table.add_column("Notes") - table.add_row("Novelty", f"{rating.novelty}/5", rating.novelty_note) - table.add_row("Maturity", f"{rating.maturity}/5", rating.maturity_note) - table.add_row("Overlap", f"{rating.overlap}/5", rating.overlap_note) - table.add_row("Momentum", f"{rating.momentum}/5", rating.momentum_note) - table.add_row("Relevance", f"{rating.relevance}/5", rating.relevance_note) - console.print(table) + table = Table(show_header=True) + table.add_column("Dimension", width=12) + table.add_column("Score", justify="center", width=7) + table.add_column("Notes") + table.add_row("Novelty", f"{rating.novelty}/5", rating.novelty_note) + table.add_row("Maturity", f"{rating.maturity}/5", rating.maturity_note) + table.add_row("Overlap", f"{rating.overlap}/5", rating.overlap_note) + table.add_row("Momentum", f"{rating.momentum}/5", rating.momentum_note) + table.add_row("Relevance", f"{rating.relevance}/5", rating.relevance_note) + console.print(table) - if rating.categories: - console.print(f"\nCategories: {', '.join(rating.categories)}") - else: - console.print("[dim]Not yet rated — run: ietf analyze {name}[/]") + if rating.categories: + console.print(f"\nCategories: {', '.join(rating.categories)}") + else: + console.print("[dim]Not yet rated — run: ietf analyze {name}[/]") - # Readiness score - from .readiness import compute_readiness - readiness = compute_readiness(db, name) - if readiness["score"] > 0: - console.print(f"\n[bold]Standards Readiness: [cyan]{readiness['score']}/100[/][/]") - rtable = Table(show_header=True) - rtable.add_column("Factor", width=20) - rtable.add_column("Value", justify="center", width=10) - rtable.add_column("Points", justify="right", width=8) - rtable.add_column("Detail") - for key, f in readiness["factors"].items(): - rtable.add_row(f["label"], f"{f['value']:.2f}", f"+{f['contribution']}", f["detail"]) - console.print(rtable) + # Readiness score + readiness = compute_readiness(db, name) + if readiness["score"] > 0: + console.print(f"\n[bold]Standards Readiness: [cyan]{readiness['score']}/100[/][/]") + rtable = Table(show_header=True) + rtable.add_column("Factor", width=20) + rtable.add_column("Value", justify="center", width=10) + rtable.add_column("Points", justify="right", width=8) + rtable.add_column("Detail") + for key, f in readiness["factors"].items(): + rtable.add_row(f["label"], f"{f['value']:.2f}", f"+{f['contribution']}", f["detail"]) + console.print(rtable) - # Save 
detailed report too - path = reporter.draft_detail(name) - if path: - console.print(f"\n[dim]Report saved: {path}[/]") - finally: - db.close() + # Save detailed report too + path = reporter.draft_detail(name) + if path: + console.print(f"\n[dim]Report saved: {path}[/]") # ── annotate ───────────────────────────────────────────────────────────────── @@ -203,46 +324,42 @@ def show(name: str): @click.option("--note", "-n", default=None, help="Set/update the note text") @click.option("--tag", "-t", multiple=True, help="Add a tag (can be used multiple times)") @click.option("--remove-tag", "-r", multiple=True, help="Remove a tag (can be used multiple times)") -def annotate(draft_name: str, note: str | None, tag: tuple[str, ...], remove_tag: tuple[str, ...]): +@pass_cfg_db +def annotate(cfg, db, draft_name: str, note: str | None, tag: tuple[str, ...], remove_tag: tuple[str, ...]): """Add or view annotations (notes & tags) for a draft.""" - cfg = _get_config() - db = Database(cfg) - try: - draft = db.get_draft(draft_name) - if draft is None: - console.print(f"[red]Draft not found: {draft_name}[/]") - return + draft = db.get_draft(draft_name) + if draft is None: + console.print(f"[red]Draft not found: {draft_name}[/]") + return - # If no options, display current annotation - if note is None and not tag and not remove_tag: - ann = db.get_annotation(draft_name) - if ann: - console.print(f"\n[bold]Annotation for {draft_name}[/]") - console.print(f" Note: {ann['note'] or '(empty)'}") - console.print(f" Tags: {', '.join(ann['tags']) if ann['tags'] else '(none)'}") - console.print(f" Updated: {ann['updated_at']}") - else: - console.print(f"[dim]No annotation for {draft_name}. Use --note or --tag to add one.[/]") - return - - # Fetch existing tags for add/remove operations - existing = db.get_annotation(draft_name) - current_tags = existing["tags"] if existing else [] - - for t in tag: - if t not in current_tags: - current_tags.append(t) - for t in remove_tag: - if t in current_tags: - current_tags.remove(t) - - db.upsert_annotation(draft_name, note=note, tags=current_tags) + # If no options, display current annotation + if note is None and not tag and not remove_tag: ann = db.get_annotation(draft_name) - console.print(f"[green]Annotation updated for {draft_name}[/]") - console.print(f" Note: {ann['note'] or '(empty)'}") - console.print(f" Tags: {', '.join(ann['tags']) if ann['tags'] else '(none)'}") - finally: - db.close() + if ann: + console.print(f"\n[bold]Annotation for {draft_name}[/]") + console.print(f" Note: {ann['note'] or '(empty)'}") + console.print(f" Tags: {', '.join(ann['tags']) if ann['tags'] else '(none)'}") + console.print(f" Updated: {ann['updated_at']}") + else: + console.print(f"[dim]No annotation for {draft_name}. 
Use --note or --tag to add one.[/]") + return + + # Fetch existing tags for add/remove operations + existing = db.get_annotation(draft_name) + current_tags = existing["tags"] if existing else [] + + for t in tag: + if t not in current_tags: + current_tags.append(t) + for t in remove_tag: + if t in current_tags: + current_tags.remove(t) + + db.upsert_annotation(draft_name, note=note, tags=current_tags) + ann = db.get_annotation(draft_name) + console.print(f"[green]Annotation updated for {draft_name}[/]") + console.print(f" Note: {ann['note'] or '(empty)'}") + console.print(f" Tags: {', '.join(ann['tags']) if ann['tags'] else '(none)'}") # ── analyze ────────────────────────────────────────────────────────────────── @@ -253,55 +370,110 @@ def annotate(draft_name: str, note: str | None, tag: tuple[str, ...], remove_tag @click.option("--all", "analyze_all", is_flag=True, help="Analyze all unrated drafts") @click.option("--limit", "-n", default=50, help="Max drafts to analyze (with --all)") @click.option("--retry-failed", is_flag=True, help="Re-analyze drafts that previously failed (clears cache)") -def analyze(name: str | None, analyze_all: bool, limit: int, retry_failed: bool): - """Analyze and rate drafts using Claude.""" +@click.option("--dry-run", is_flag=True, help="Show what would be analyzed without making changes") +@click.option("--pre-classify/--no-pre-classify", "pre_classify", default=False, + help="Pre-filter unrated drafts with local Ollama classifier before Claude") +@pass_cfg_db +def analyze(cfg, db, name: str | None, analyze_all: bool, limit: int, retry_failed: bool, dry_run: bool, pre_classify: bool): + """Analyze and rate drafts using Claude. + + Use --pre-classify to run the local Ollama classifier first, removing + irrelevant drafts before spending Claude tokens. Saves ~40% of API costs. + """ from .analyzer import Analyzer - cfg = _get_config() - db = Database(cfg) - analyzer = Analyzer(cfg, db) - - try: + if dry_run: if retry_failed: - # Find drafts that have cache entries but no ratings (failed analyses) unrated = db.unrated_drafts(limit=limit) retryable = [] for draft in unrated: - # Check if there's a cache entry for this draft (it was attempted) row = db.conn.execute( "SELECT COUNT(*) FROM llm_cache WHERE draft_name = ?", (draft.name,), ).fetchone() if row[0] > 0: retryable.append(draft) - if not retryable: - console.print("No previously failed drafts to retry.") - else: - console.print(f"Retrying [bold]{len(retryable)}[/] previously failed drafts...") - count = 0 - for draft in retryable: - rating = analyzer.rate_draft(draft.name, use_cache=False) - if rating: - count += 1 - console.print(f"Successfully re-analyzed [bold green]{count}[/] of {len(retryable)} drafts") + console.print(f"[bold yellow]DRY RUN[/]: Would retry [bold]{len(retryable)}[/] previously failed drafts") + for d in retryable[:20]: + console.print(f" - {d.name}") + if len(retryable) > 20: + console.print(f" ... and {len(retryable) - 20} more") elif analyze_all: - count = analyzer.rate_all_unrated(limit=limit) - console.print(f"Analyzed [bold green]{count}[/] drafts") + unrated = db.unrated_drafts(limit=limit) + console.print(f"[bold yellow]DRY RUN[/]: Would analyze [bold]{len(unrated)}[/] unrated drafts") + for d in unrated[:20]: + console.print(f" - {d.name}: {d.title[:60]}") + if len(unrated) > 20: + console.print(f" ... 
and {len(unrated) - 20} more") elif name: - rating = analyzer.rate_draft(name) - if rating: - console.print(f"\n[bold green]Rating for {name}:[/]") - console.print(f" Score: {rating.composite_score:.1f}") - console.print(f" Summary: {rating.summary}") - console.print(f" Novelty={rating.novelty} Maturity={rating.maturity} " - f"Overlap={rating.overlap} Momentum={rating.momentum} " - f"Relevance={rating.relevance}") - else: - console.print("[red]Analysis failed[/]") + existing = db.get_rating(name) + status = "re-analyze (already rated)" if existing else "analyze (not yet rated)" + console.print(f"[bold yellow]DRY RUN[/]: Would {status}: {name}") else: console.print("Provide a draft name or use --all") - finally: - db.close() + return + + analyzer = Analyzer(cfg, db) + + if retry_failed: + # Find drafts that have cache entries but no ratings (failed analyses) + unrated = db.unrated_drafts(limit=limit) + retryable = [] + for draft in unrated: + # Check if there's a cache entry for this draft (it was attempted) + row = db.conn.execute( + "SELECT COUNT(*) FROM llm_cache WHERE draft_name = ?", + (draft.name,), + ).fetchone() + if row[0] > 0: + retryable.append(draft) + if not retryable: + console.print("No previously failed drafts to retry.") + else: + console.print(f"Retrying [bold]{len(retryable)}[/] previously failed drafts...") + count = 0 + for draft in retryable: + rating = analyzer.rate_draft(draft.name, use_cache=False) + if rating: + count += 1 + console.print(f"Successfully re-analyzed [bold green]{count}[/] of {len(retryable)} drafts") + elif analyze_all: + if pre_classify: + # Pre-filter with local Ollama classifier + try: + from .classifier import Classifier + unrated = db.unrated_drafts(limit=limit) + if unrated: + console.print(f"\n[bold]Pre-classifying {len(unrated)} unrated drafts with Ollama...[/]") + clf = Classifier(cfg) + draft_dicts = [{"name": d.name, "title": d.title, "abstract": d.abstract} for d in unrated] + relevant, irrelevant = clf.classify_batch(draft_dicts, verbose=True) + clf.close() + + if irrelevant: + console.print(f"\n Removing [red]{len(irrelevant)}[/] irrelevant drafts from DB...") + for d in irrelevant: + db.conn.execute("DELETE FROM drafts WHERE name = ?", (d["name"],)) + db.conn.commit() + console.print(f" Removed. 
{len(relevant)} drafts remain for Claude analysis.\n") + except Exception as e: + console.print(f"[yellow]Classifier unavailable ({e}), analyzing all[/yellow]") + + count = analyzer.rate_all_unrated(limit=limit) + console.print(f"Analyzed [bold green]{count}[/] drafts") + elif name: + rating = analyzer.rate_draft(name) + if rating: + console.print(f"\n[bold green]Rating for {name}:[/]") + console.print(f" Score: {rating.composite_score:.1f}") + console.print(f" Summary: {rating.summary}") + console.print(f" Novelty={rating.novelty} Maturity={rating.maturity} " + f"Overlap={rating.overlap} Momentum={rating.momentum} " + f"Relevance={rating.relevance}") + else: + console.print("[red]Analysis failed[/]") + else: + console.print("Provide a draft name or use --all") # ── ask ──────────────────────────────────────────────────────────────────── @@ -311,7 +483,8 @@ def analyze(name: str | None, analyze_all: bool, limit: int, retry_failed: bool) @click.argument("question") @click.option("--top", "-n", default=5, help="Number of source drafts to use") @click.option("--cheap/--quality", default=True, help="Use Haiku (cheap) vs Sonnet (quality)") -def ask(question: str, top: int, cheap: bool): +@pass_cfg_db +def ask(cfg, db, question: str, top: int, cheap: bool): """Ask a natural language question about the drafts. Examples: @@ -321,43 +494,37 @@ def ask(question: str, top: int, cheap: bool): """ from .search import HybridSearch - cfg = _get_config() - db = Database(cfg) + searcher = HybridSearch(cfg, db) + console.print("\n[dim]Searching for relevant drafts...[/]") + result = searcher.ask(question, top_k=top, cheap=cheap) - try: - searcher = HybridSearch(cfg, db) - console.print(f"\n[dim]Searching for relevant drafts...[/]") - result = searcher.ask(question, top_k=top, cheap=cheap) + # Display the answer + console.print() + console.print("[bold cyan]Answer[/]") + console.print("[dim]" + "-" * 60 + "[/]") + console.print(result["answer"]) + console.print() - # Display the answer - console.print() - console.print("[bold cyan]Answer[/]") - console.print("[dim]" + "-" * 60 + "[/]") - console.print(result["answer"]) - console.print() + # Display source drafts table + if result["sources"]: + table = Table(title="Source Drafts") + table.add_column("#", style="dim", width=3) + table.add_column("Draft", style="cyan", max_width=50) + table.add_column("Title", max_width=45) + table.add_column("Match", width=10) + table.add_column("Score", justify="right", width=8) - # Display source drafts table - if result["sources"]: - table = Table(title="Source Drafts") - table.add_column("#", style="dim", width=3) - table.add_column("Draft", style="cyan", max_width=50) - table.add_column("Title", max_width=45) - table.add_column("Match", width=10) - table.add_column("Score", justify="right", width=8) + for i, src in enumerate(result["sources"], 1): + score_str = f"{src['similarity']:.3f}" if src.get("similarity") else "-" + table.add_row( + str(i), + src["name"], + src["title"][:45], + src.get("match_type", ""), + score_str, + ) - for i, src in enumerate(result["sources"], 1): - score_str = f"{src['similarity']:.3f}" if src.get("similarity") else "-" - table.add_row( - str(i), - src["name"], - src["title"][:45], - src.get("match_type", ""), - score_str, - ) - - console.print(table) - finally: - db.close() + console.print(table) # ── compare ──────────────────────────────────────────────────────────────── @@ -365,43 +532,44 @@ def ask(question: str, top: int, cheap: bool): @main.command() @click.argument("names", 
nargs=-1, required=True) -def compare(names: tuple[str, ...]): +@pass_cfg_db +def compare(cfg, db, names: tuple[str, ...]): """Compare multiple drafts for overlap and unique contributions.""" from .analyzer import Analyzer - cfg = _get_config() - db = Database(cfg) analyzer = Analyzer(cfg, db) - try: - result = analyzer.compare_drafts(list(names)) - if "error" in result: - console.print(f"[red]{result['error']}[/]") - else: - console.print(f"\n[bold cyan]Comparison of {len(result['drafts'])} drafts[/]") - console.print("[dim]" + "-" * 60 + "[/]") - console.print(result["text"]) - finally: - db.close() + result = analyzer.compare_drafts(list(names)) + if "error" in result: + console.print(f"[red]{result['error']}[/]") + else: + console.print(f"\n[bold cyan]Comparison of {len(result['drafts'])} drafts[/]") + console.print("[dim]" + "-" * 60 + "[/]") + console.print(result["text"]) # ── embed ──────────────────────────────────────────────────────────────────── @main.command() -def embed(): +@click.option("--dry-run", is_flag=True, help="Show what would be embedded without making changes") +@pass_cfg_db +def embed(cfg, db, dry_run: bool): """Generate embeddings for all drafts (requires Ollama).""" + if dry_run: + missing = db.drafts_without_embeddings(limit=10000) + console.print(f"[bold yellow]DRY RUN[/]: Would embed [bold]{len(missing)}[/] drafts") + for name in missing[:20]: + console.print(f" - {name}") + if len(missing) > 20: + console.print(f" ... and {len(missing) - 20} more") + return + from .embeddings import Embedder - cfg = _get_config() - db = Database(cfg) embedder = Embedder(cfg, db) - - try: - count = embedder.embed_all_missing() - console.print(f"Embedded [bold green]{count}[/] drafts") - finally: - db.close() + count = embedder.embed_all_missing() + console.print(f"Embedded [bold green]{count}[/] drafts") # ── embed-ideas ────────────────────────────────────────────────────────────── @@ -410,52 +578,56 @@ def embed(): @main.command("embed-ideas") @click.option("--limit", default=0, help="Max ideas to embed (0=all)") @click.option("--batch-size", default=50, help="Batch size for Ollama") -def embed_ideas(limit: int, batch_size: int): +@click.option("--dry-run", is_flag=True, help="Show what would be embedded without making changes") +@pass_cfg_db +def embed_ideas(cfg, db, limit: int, batch_size: int, dry_run: bool): """Generate embeddings for extracted ideas via Ollama.""" + missing = db.ideas_without_embeddings(limit=limit if limit > 0 else 10000) + if not missing: + console.print("All ideas already have embeddings.") + return + + if dry_run: + console.print(f"[bold yellow]DRY RUN[/]: Would embed [bold]{len(missing)}[/] ideas in batches of {batch_size}") + for idea in missing[:20]: + console.print(f" - [{idea.get('id', '?')}] {idea['title'][:60]}") + if len(missing) > 20: + console.print(f" ... 
and {len(missing) - 20} more") + return + + import numpy as np import ollama as ollama_lib from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn - cfg = _get_config() - db = Database(cfg) client = ollama_lib.Client(host=cfg.ollama_url) + total = len(missing) + console.print(f"Embedding [bold]{total}[/] ideas in batches of {batch_size}...") - try: - missing = db.ideas_without_embeddings(limit=limit if limit > 0 else 10000) - if not missing: - console.print("All ideas already have embeddings.") - return + count = 0 + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + console=console, + ) as progress: + task = progress.add_task("Embedding ideas...", total=total) + for start in range(0, total, batch_size): + batch = missing[start:start + batch_size] + texts = [f"{idea['title']}. {idea['description']}" for idea in batch] + try: + resp = client.embed(model=cfg.ollama_embed_model, input=texts) + for i, idea in enumerate(batch): + vec = np.array(resp["embeddings"][i], dtype=np.float32) + db.store_idea_embedding(idea["id"], cfg.ollama_embed_model, vec) + count += 1 + progress.advance(task) + except Exception as e: + console.print(f"[red]Batch failed: {e}[/]") + for _ in batch: + progress.advance(task) - total = len(missing) - console.print(f"Embedding [bold]{total}[/] ideas in batches of {batch_size}...") - - count = 0 - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - MofNCompleteColumn(), - console=console, - ) as progress: - task = progress.add_task("Embedding ideas...", total=total) - for start in range(0, total, batch_size): - batch = missing[start:start + batch_size] - texts = [f"{idea['title']}. 
{idea['description']}" for idea in batch] - try: - resp = client.embed(model=cfg.ollama_embed_model, input=texts) - for i, idea in enumerate(batch): - import numpy as np - vec = np.array(resp["embeddings"][i], dtype=np.float32) - db.store_idea_embedding(idea["id"], cfg.ollama_embed_model, vec) - count += 1 - progress.advance(task) - except Exception as e: - console.print(f"[red]Batch failed: {e}[/]") - for _ in batch: - progress.advance(task) - - console.print(f"Embedded [bold green]{count}[/] ideas") - finally: - db.close() + console.print(f"Embedded [bold green]{count}[/] ideas") # ── similar ────────────────────────────────────────────────────────────────── @@ -464,33 +636,28 @@ def embed_ideas(limit: int, batch_size: int): @main.command() @click.argument("name") @click.option("--top", "-n", default=10, help="Number of similar drafts to show") -def similar(name: str, top: int): +@pass_cfg_db +def similar(cfg, db, name: str, top: int): """Find drafts most similar to a given draft.""" from .embeddings import Embedder - cfg = _get_config() - db = Database(cfg) embedder = Embedder(cfg, db) + results = embedder.find_similar(name, top_n=top) + if not results: + console.print(f"[yellow]No similar drafts found (need embeddings — run `ietf embed` first)[/]") + return - try: - results = embedder.find_similar(name, top_n=top) - if not results: - console.print(f"[yellow]No similar drafts found (need embeddings — run `ietf embed` first)[/]") - return + table = Table(title=f"Drafts similar to {name}") + table.add_column("Similarity", justify="right", width=10) + table.add_column("Draft", style="cyan") + table.add_column("Title") - table = Table(title=f"Drafts similar to {name}") - table.add_column("Similarity", justify="right", width=10) - table.add_column("Draft", style="cyan") - table.add_column("Title") + for sim_name, score in results: + draft = db.get_draft(sim_name) + title = draft.title[:60] if draft else "" + table.add_row(f"{score:.3f}", sim_name, title) - for sim_name, score in results: - draft = db.get_draft(sim_name) - title = draft.title[:60] if draft else "" - table.add_row(f"{score:.3f}", sim_name, title) - - console.print(table) - finally: - db.close() + console.print(table) # ── clusters ───────────────────────────────────────────────────────────────── @@ -498,30 +665,25 @@ def similar(name: str, top: int): @main.command() @click.option("--threshold", "-t", default=0.85, help="Similarity threshold for clustering") -def clusters(threshold: float): +@pass_cfg_db +def clusters(cfg, db, threshold: float): """Find clusters of highly similar (potentially overlapping) drafts.""" from .embeddings import Embedder - cfg = _get_config() - db = Database(cfg) embedder = Embedder(cfg, db) + cluster_list = embedder.find_clusters(threshold=threshold) + if not cluster_list: + console.print("No clusters found at this threshold.") + return - try: - cluster_list = embedder.find_clusters(threshold=threshold) - if not cluster_list: - console.print("No clusters found at this threshold.") - return - - console.print(f"\n[bold]Found {len(cluster_list)} clusters[/] (threshold={threshold})\n") - for i, cluster in enumerate(cluster_list, 1): - console.print(f"[bold cyan]Cluster {i}[/] ({len(cluster)} drafts):") - for name in cluster: - draft = db.get_draft(name) - title = draft.title[:60] if draft else "" - console.print(f" - {name} [dim]{title}[/]") - console.print() - finally: - db.close() + console.print(f"\n[bold]Found {len(cluster_list)} clusters[/] (threshold={threshold})\n") + for i, cluster in 
enumerate(cluster_list, 1): + console.print(f"[bold cyan]Cluster {i}[/] ({len(cluster)} drafts):") + for name in cluster: + draft = db.get_draft(name) + title = draft.title[:60] if draft else "" + console.print(f" - {name} [dim]{title}[/]") + console.print() # ── report ─────────────────────────────────────────────────────────────────── @@ -534,218 +696,143 @@ def report(): @report.command() -def overview(): +@pass_cfg_db +def overview(cfg, db): """Overview table of all rated drafts.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.overview() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).overview() + console.print(f"Report saved: [bold]{path}[/]") @report.command() -def landscape(): +@pass_cfg_db +def landscape(cfg, db): """Category-grouped landscape view.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.landscape() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).landscape() + console.print(f"Report saved: [bold]{path}[/]") @report.command() @click.option("--days", "-d", default=7, help="Look back N days") -def digest(days: int): +@pass_cfg_db +def digest(cfg, db, days: int): """What's new digest.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.digest(since_days=days) - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).digest(since_days=days) + console.print(f"Report saved: [bold]{path}[/]") @report.command() -def timeline(): +@pass_cfg_db +def timeline(cfg, db): """Timeline of draft submissions by month and category.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.timeline() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).timeline() + console.print(f"Report saved: [bold]{path}[/]") @report.command("overlap-matrix") -def overlap_matrix(): +@pass_cfg_db +def overlap_matrix(cfg, db): """Full pairwise overlap matrix report.""" from .embeddings import Embedder from .reports import Reporter - cfg = _get_config() - db = Database(cfg) embedder = Embedder(cfg, db) - reporter = Reporter(cfg, db) - try: - n_drafts = len(db.all_drafts()) - console.print(f"Computing {n_drafts}x{n_drafts} similarity matrix...") - path = reporter.overlap_matrix(embedder) - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + n_drafts = len(db.all_drafts()) + console.print(f"Computing {n_drafts}x{n_drafts} similarity matrix...") + path = Reporter(cfg, db).overlap_matrix(embedder) + console.print(f"Report saved: [bold]{path}[/]") @report.command("authors") -def authors_report(): +@pass_cfg_db +def authors_report(cfg, db): """Author and organization network report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.authors_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).authors_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("ideas") -def ideas_report(): +@pass_cfg_db +def ideas_report(cfg, db): """Report on extracted technical ideas.""" from .reports import Reporter - cfg = _get_config() - db = 
Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.ideas_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).ideas_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("refs") -def refs_report(): +@pass_cfg_db +def refs_report(cfg, db): """Cross-reference report — which standards the ecosystem builds on.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.refs_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).refs_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("trends") -def trends_report(): +@pass_cfg_db +def trends_report(cfg, db): """Category trend analysis report (markdown).""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.trends_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).trends_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("idea-overlap") -def idea_overlap_report(): +@pass_cfg_db +def idea_overlap_report(cfg, db): """Cross-organization idea overlap report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.idea_overlap_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).idea_overlap_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("status") -def status_report(): +@pass_cfg_db +def status_report(cfg, db): """WG adoption status report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.status_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).status_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("revisions") -def revisions_report(): +@pass_cfg_db +def revisions_report(cfg, db): """Draft revision velocity report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.revisions_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).revisions_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("centrality") -def centrality_report(): +@pass_cfg_db +def centrality_report(cfg, db): """Author network centrality report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.centrality_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).centrality_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("co-occurrence") -def co_occurrence_report(): +@pass_cfg_db +def co_occurrence_report(cfg, db): """Category co-occurrence matrix report.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.co_occurrence_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).co_occurrence_report() + console.print(f"Report saved: [bold]{path}[/]") @report.command("wg") -def wg_report(): +@pass_cfg_db +def 
wg_report(cfg, db): """Working group analysis report — overlaps, alignment, submission targets.""" from .reports import Reporter - cfg = _get_config() - db = Database(cfg) - reporter = Reporter(cfg, db) - try: - path = reporter.wg_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = Reporter(cfg, db).wg_report() + console.print(f"Report saved: [bold]{path}[/]") # ── wg (working group analysis) ───────────────────────────────────────── @@ -759,234 +846,214 @@ def wg(): @wg.command("list") @click.option("--min-drafts", default=1, help="Minimum drafts to show a WG") -def wg_list(min_drafts: int): +@pass_cfg_db +def wg_list(cfg, db, min_drafts: int): """List working groups with draft counts and average scores.""" - cfg = _get_config() - db = Database(cfg) - try: - summaries = db.wg_summary() - if not summaries: - console.print("[yellow]No WG data. Run: python scripts/backfill-wg-names.py[/]") - return + summaries = db.wg_summary() + if not summaries: + console.print("[yellow]No WG data. Run: python scripts/backfill-wg-names.py[/]") + return - summaries = [s for s in summaries if s["draft_count"] >= min_drafts] + summaries = [s for s in summaries if s["draft_count"] >= min_drafts] - table = Table(title=f"Working Groups ({len(summaries)} with >= {min_drafts} drafts)") - table.add_column("WG", style="cyan", width=12) - table.add_column("#", justify="right", width=4) - table.add_column("Ideas", justify="right", width=5) - table.add_column("Nov", justify="center", width=4) - table.add_column("Mat", justify="center", width=4) - table.add_column("Ovl", justify="center", width=4) - table.add_column("Mom", justify="center", width=4) - table.add_column("Rel", justify="center", width=4) - table.add_column("Top Categories") + table = Table(title=f"Working Groups ({len(summaries)} with >= {min_drafts} drafts)") + table.add_column("WG", style="cyan", width=12) + table.add_column("#", justify="right", width=4) + table.add_column("Ideas", justify="right", width=5) + table.add_column("Nov", justify="center", width=4) + table.add_column("Mat", justify="center", width=4) + table.add_column("Ovl", justify="center", width=4) + table.add_column("Mom", justify="center", width=4) + table.add_column("Rel", justify="center", width=4) + table.add_column("Top Categories") - for s in summaries: - top_cats = sorted(s["categories"].items(), key=lambda x: x[1], reverse=True)[:3] - cats_str = ", ".join(f"{c}({n})" for c, n in top_cats) if top_cats else "-" - table.add_row( - s["wg"], str(s["draft_count"]), str(s["idea_count"]), - str(s["avg_novelty"]), str(s["avg_maturity"]), - str(s["avg_overlap"]), str(s["avg_momentum"]), - str(s["avg_relevance"]), cats_str, - ) + for s in summaries: + top_cats = sorted(s["categories"].items(), key=lambda x: x[1], reverse=True)[:3] + cats_str = ", ".join(f"{c}({n})" for c, n in top_cats) if top_cats else "-" + table.add_row( + s["wg"], str(s["draft_count"]), str(s["idea_count"]), + str(s["avg_novelty"]), str(s["avg_maturity"]), + str(s["avg_overlap"]), str(s["avg_momentum"]), + str(s["avg_relevance"]), cats_str, + ) - console.print(table) + console.print(table) - # Also show individual submission count - indiv = db.conn.execute( - 'SELECT COUNT(*) FROM drafts WHERE "group" = \'none\' OR "group" IS NULL' - ).fetchone()[0] - console.print(f"\n[dim]Individual submissions (no WG): {indiv}[/]") - finally: - db.close() + # Also show individual submission count + indiv = db.conn.execute( + 'SELECT COUNT(*) FROM drafts WHERE "group" = \'none\' OR "group" IS 
NULL' + ).fetchone()[0] + console.print(f"\n[dim]Individual submissions (no WG): {indiv}[/]") @wg.command("show") @click.argument("name") -def wg_show(name: str): +@pass_cfg_db +def wg_show(cfg, db, name: str): """Show details for a specific working group.""" - cfg = _get_config() - db = Database(cfg) - try: - drafts = db.wg_drafts(name) - if not drafts: - console.print(f"[red]No drafts found for WG: {name}[/]") - return + drafts = db.wg_drafts(name) + if not drafts: + console.print(f"[red]No drafts found for WG: {name}[/]") + return - console.print(f"\n[bold]Working Group: {name}[/] ({len(drafts)} drafts)\n") + console.print(f"\n[bold]Working Group: {name}[/] ({len(drafts)} drafts)\n") - table = Table() - table.add_column("Date", style="dim", width=10) - table.add_column("Name", style="cyan") - table.add_column("Title", max_width=50) - table.add_column("Score", justify="right", width=6) + table = Table() + table.add_column("Date", style="dim", width=10) + table.add_column("Name", style="cyan") + table.add_column("Title", max_width=50) + table.add_column("Score", justify="right", width=6) - for d in drafts: - rating = db.get_rating(d.name) - score = f"{rating.composite_score:.1f}" if rating else "-" - table.add_row(d.date, d.name, d.title[:50], score) + for d in drafts: + rating = db.get_rating(d.name) + score = f"{rating.composite_score:.1f}" if rating else "-" + table.add_row(d.date, d.name, d.title[:50], score) - console.print(table) + console.print(table) - # Show ideas for this WG - ideas = [] - for d in drafts: - ideas.extend(db.get_ideas_for_draft(d.name)) - if ideas: - console.print(f"\n[bold]Ideas ({len(ideas)}):[/]") - for idea in ideas[:15]: - console.print(f" - [cyan]{idea['title']}[/]: {idea['description'][:80]}") - if len(ideas) > 15: - console.print(f" [dim]... and {len(ideas) - 15} more[/]") - finally: - db.close() + # Show ideas for this WG + ideas = [] + for d in drafts: + ideas.extend(db.get_ideas_for_draft(d.name)) + if ideas: + console.print(f"\n[bold]Ideas ({len(ideas)}):[/]") + for idea in ideas[:15]: + console.print(f" - [cyan]{idea['title']}[/]: {idea['description'][:80]}") + if len(ideas) > 15: + console.print(f" [dim]... 
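The individual-submission count above writes `d."group" = 'none'` with the column name double-quoted because GROUP is a reserved word in SQL. A standalone illustration against an in-memory SQLite database (hypothetical schema, not the tool's real one):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
# GROUP is an SQL keyword, so a column literally named "group" must be quoted
conn.execute('CREATE TABLE drafts (name TEXT, "group" TEXT)')
conn.executemany(
    'INSERT INTO drafts (name, "group") VALUES (?, ?)',
    [("draft-a", "none"), ("draft-b", None), ("draft-c", "webtrans")],
)
indiv = conn.execute(
    'SELECT COUNT(*) FROM drafts WHERE "group" = \'none\' OR "group" IS NULL'
).fetchone()[0]
print(indiv)  # -> 2 individual submissions
```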
and {len(ideas) - 15} more[/]") @wg.command("overlaps") @click.option("--min-wgs", default=2, help="Minimum WGs sharing a category to show") -def wg_overlaps(min_wgs: int): +@pass_cfg_db +def wg_overlaps(cfg, db, min_wgs: int): """Find categories and ideas that span multiple WGs — alignment opportunities.""" - cfg = _get_config() - db = Database(cfg) - try: - # Category spread across WGs - spread = db.category_wg_spread() - multi = [s for s in spread if s["wg_count"] >= min_wgs - and not all(w["wg"] == "none" for w in s["wgs"])] + # Category spread across WGs + spread = db.category_wg_spread() + multi = [s for s in spread if s["wg_count"] >= min_wgs + and not all(w["wg"] == "none" for w in s["wgs"])] - if multi: - console.print(f"\n[bold]Categories spanning {min_wgs}+ WGs[/]\n") - for s in multi: - wg_strs = [f"{w['wg']}({w['count']})" for w in s["wgs"] if w["wg"] != "none"] - if wg_strs: - console.print(f" [cyan]{s['category']}[/] — {s['total_drafts']} drafts across {s['wg_count']} WGs") - console.print(f" WGs: {', '.join(wg_strs)}") + if multi: + console.print(f"\n[bold]Categories spanning {min_wgs}+ WGs[/]\n") + for s in multi: + wg_strs = [f"{w['wg']}({w['count']})" for w in s["wgs"] if w["wg"] != "none"] + if wg_strs: + console.print(f" [cyan]{s['category']}[/] — {s['total_drafts']} drafts across {s['wg_count']} WGs") + console.print(f" WGs: {', '.join(wg_strs)}") - # Idea overlap across WGs - idea_overlaps = db.wg_idea_overlap() - cross_wg = [o for o in idea_overlaps - if not all(w == "none" for w in o["wg_names"])] + # Idea overlap across WGs + idea_overlaps = db.wg_idea_overlap() + cross_wg = [o for o in idea_overlaps + if not all(w == "none" for w in o["wg_names"])] - if cross_wg: - console.print(f"\n[bold]Ideas appearing in {min_wgs}+ WGs ({len(cross_wg)} found)[/]\n") - for o in cross_wg[:20]: - real_wgs = [w for w in o["wg_names"] if w != "none"] - console.print(f" [cyan]{o['idea_title']}[/] — WGs: {', '.join(real_wgs)}") - for entry in o["wgs"]: - if entry["wg"] != "none": - console.print(f" - [{entry['wg']}] {entry['draft_name']}") - if len(cross_wg) > 20: - console.print(f"\n [dim]... and {len(cross_wg) - 20} more[/]") + if cross_wg: + console.print(f"\n[bold]Ideas appearing in {min_wgs}+ WGs ({len(cross_wg)} found)[/]\n") + for o in cross_wg[:20]: + real_wgs = [w for w in o["wg_names"] if w != "none"] + console.print(f" [cyan]{o['idea_title']}[/] — WGs: {', '.join(real_wgs)}") + for entry in o["wgs"]: + if entry["wg"] != "none": + console.print(f" - [{entry['wg']}] {entry['draft_name']}") + if len(cross_wg) > 20: + console.print(f"\n [dim]... 
and {len(cross_wg) - 20} more[/]") - if not multi and not cross_wg: - console.print("[yellow]No cross-WG overlaps found.[/]") - finally: - db.close() + if not multi and not cross_wg: + console.print("[yellow]No cross-WG overlaps found.[/]") @wg.command("alignment") -def wg_alignment(): +@pass_cfg_db +def wg_alignment(cfg, db): """Identify where individual drafts should be consolidated into WG standards.""" - cfg = _get_config() - db = Database(cfg) - try: - # Compare individual vs WG category distribution - dist = db.individual_vs_wg_categories() - indiv = dist["individual"] - adopted = dist["wg_adopted"] + # Compare individual vs WG category distribution + dist = db.individual_vs_wg_categories() + indiv = dist["individual"] + adopted = dist["wg_adopted"] - console.print("\n[bold]Individual vs WG-Adopted Category Distribution[/]\n") + console.print("\n[bold]Individual vs WG-Adopted Category Distribution[/]\n") - table = Table() - table.add_column("Category", width=25) - table.add_column("Individual", justify="right", width=10) - table.add_column("WG-Adopted", justify="right", width=10) - table.add_column("Signal", width=40) + table = Table() + table.add_column("Category", width=25) + table.add_column("Individual", justify="right", width=10) + table.add_column("WG-Adopted", justify="right", width=10) + table.add_column("Signal", width=40) - all_cats = sorted(set(list(indiv.keys()) + list(adopted.keys()))) - for cat in all_cats: - i_count = indiv.get(cat, 0) - w_count = adopted.get(cat, 0) - signal = "" - if i_count >= 5 and w_count == 0: - signal = "[yellow]High individual activity, no WG — needs WG?[/]" - elif i_count >= 3 and w_count >= 1: - signal = "[green]WG exists, individual drafts could target it[/]" - elif w_count > i_count and i_count > 0: - signal = "[dim]WG leading, some individual work[/]" - table.add_row(cat, str(i_count), str(w_count), signal) + all_cats = sorted(set(list(indiv.keys()) + list(adopted.keys()))) + for cat in all_cats: + i_count = indiv.get(cat, 0) + w_count = adopted.get(cat, 0) + signal = "" + if i_count >= 5 and w_count == 0: + signal = "[yellow]High individual activity, no WG — needs WG?[/]" + elif i_count >= 3 and w_count >= 1: + signal = "[green]WG exists, individual drafts could target it[/]" + elif w_count > i_count and i_count > 0: + signal = "[dim]WG leading, some individual work[/]" + table.add_row(cat, str(i_count), str(w_count), signal) - console.print(table) + console.print(table) - # Find overlap clusters within individual submissions that might warrant a WG - console.print("\n[bold]Consolidation Candidates[/]") - console.print("[dim]Categories with many individual drafts but no WG adoption — " - "potential for new WG or BoF[/]\n") + # Find overlap clusters within individual submissions that might warrant a WG + console.print("\n[bold]Consolidation Candidates[/]") + console.print("[dim]Categories with many individual drafts but no WG adoption — " + "potential for new WG or BoF[/]\n") - candidates = [] - for cat in all_cats: - i_count = indiv.get(cat, 0) - w_count = adopted.get(cat, 0) - if i_count >= 5 and w_count == 0: - candidates.append((cat, i_count)) + candidates = [] + for cat in all_cats: + i_count = indiv.get(cat, 0) + w_count = adopted.get(cat, 0) + if i_count >= 5 and w_count == 0: + candidates.append((cat, i_count)) - if candidates: - for cat, count in sorted(candidates, key=lambda x: x[1], reverse=True): - console.print(f" [yellow]{cat}[/]: {count} individual drafts, no WG home") - # Show sample drafts - rows = db.conn.execute(""" - 
SELECT d.name, d.title FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE (d."group" = 'none' OR d."group" IS NULL) - AND r.categories LIKE ? - ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20 - + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC - LIMIT 5 - """, (f"%{cat}%",)).fetchall() - for row in rows: - console.print(f" - {row['name']}: {row['title'][:60]}") - console.print() - else: - console.print(" [green]All active categories have WG representation.[/]") - finally: - db.close() + if candidates: + for cat, count in sorted(candidates, key=lambda x: x[1], reverse=True): + console.print(f" [yellow]{cat}[/]: {count} individual drafts, no WG home") + # Show sample drafts + rows = db.conn.execute(""" + SELECT d.name, d.title FROM drafts d + JOIN ratings r ON d.name = r.draft_name + WHERE (d."group" = 'none' OR d."group" IS NULL) + AND r.categories LIKE ? + ORDER BY (r.novelty * 0.30 + r.relevance * 0.25 + r.maturity * 0.20 + + r.momentum * 0.15 + (6 - r.overlap) * 0.10) DESC + LIMIT 5 + """, (f"%{cat}%",)).fetchall() + for row in rows: + console.print(f" - {row['name']}: {row['title'][:60]}") + console.print() + else: + console.print(" [green]All active categories have WG representation.[/]") @wg.command("targets") -def wg_targets(): +@pass_cfg_db +def wg_targets(cfg, db): """Suggest best WGs for submitting new work in each category.""" - cfg = _get_config() - db = Database(cfg) - try: - spread = db.category_wg_spread() - summaries = {s["wg"]: s for s in db.wg_summary()} + spread = db.category_wg_spread() + summaries = {s["wg"]: s for s in db.wg_summary()} - console.print("\n[bold]Recommended Submission Targets by Category[/]\n") + console.print("\n[bold]Recommended Submission Targets by Category[/]\n") - for s in spread: - cat = s["category"] - # Filter to real WGs (not 'none') - real_wgs = [w for w in s["wgs"] if w["wg"] != "none"] - if not real_wgs: - console.print(f" [cyan]{cat}[/]: [yellow]No active WG — individual submission[/]") - continue + for s in spread: + cat = s["category"] + # Filter to real WGs (not 'none') + real_wgs = [w for w in s["wgs"] if w["wg"] != "none"] + if not real_wgs: + console.print(f" [cyan]{cat}[/]: [yellow]No active WG — individual submission[/]") + continue - best = real_wgs[0] - wg_info = summaries.get(best["wg"], {}) - console.print( - f" [cyan]{cat}[/]: [bold green]{best['wg']}[/] " - f"({best['count']} drafts" - f"{', avg relevance ' + str(wg_info.get('avg_relevance', '?')) if wg_info else ''})" - ) - if len(real_wgs) > 1: - alts = ", ".join(f"{w['wg']}({w['count']})" for w in real_wgs[1:3]) - console.print(f" Also: {alts}") + best = real_wgs[0] + wg_info = summaries.get(best["wg"], {}) + console.print( + f" [cyan]{cat}[/]: [bold green]{best['wg']}[/] " + f"({best['count']} drafts" + f"{', avg relevance ' + str(wg_info.get('avg_relevance', '?')) if wg_info else ''})" + ) + if len(real_wgs) > 1: + alts = ", ".join(f"{w['wg']}({w['count']})" for w in real_wgs[1:3]) + console.print(f" Also: {alts}") - console.print() - finally: - db.close() + console.print() # ── visualize ──────────────────────────────────────────────────────────── @@ -999,188 +1066,124 @@ def viz(): @viz.command("all") -def viz_all(): +@pass_cfg_db +def viz_all(cfg, db): """Generate all available visualizations.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) v = Visualizer(cfg, db) - try: - paths = v.generate_all() - console.print(f"\n[bold green]{len(paths)} visualizations[/] saved to {v.output_dir}/") - finally: - 
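The ORDER BY clause above ranks sample drafts with a weighted composite: novelty 0.30, relevance 0.25, maturity 0.20, momentum 0.15, and inverted overlap 0.10 — overlap is inverted as `(6 - overlap)` so a draft that duplicates less existing work ranks higher, and the weights sum to 1.0. A worked check of the same formula in Python:

```python
def composite_score(novelty, relevance, maturity, momentum, overlap):
    """Weighted composite matching the SQL ORDER BY above.

    All dimensions are assumed to be 1-5; overlap is inverted so that
    less redundant drafts score higher.
    """
    return (novelty * 0.30 + relevance * 0.25 + maturity * 0.20
            + momentum * 0.15 + (6 - overlap) * 0.10)

# A draft rated novelty=4, relevance=5, maturity=3, momentum=2, overlap=2:
# 1.20 + 1.25 + 0.60 + 0.30 + 0.40 = 3.75
print(composite_score(4, 5, 3, 2, 2))  # -> 3.75
```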
db.close() + paths = v.generate_all() + console.print(f"\n[bold green]{len(paths)} visualizations[/] saved to {v.output_dir}/") @viz.command("landscape") @click.option("--method", "-m", default="tsne", type=click.Choice(["umap", "tsne"]), help="Dimensionality reduction method") -def viz_landscape(method: str): +@pass_cfg_db +def viz_landscape(cfg, db, method: str): """2D scatter of draft embeddings colored by category.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.landscape_scatter(method=method) - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).landscape_scatter(method=method) + console.print(f"Saved: [bold]{path}[/]") @viz.command("heatmap") -def viz_heatmap(): +@pass_cfg_db +def viz_heatmap(cfg, db): """Clustered similarity heatmap (PNG).""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.similarity_heatmap() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).similarity_heatmap() + console.print(f"Saved: [bold]{path}[/]") @viz.command("distributions") -def viz_distributions(): +@pass_cfg_db +def viz_distributions(cfg, db): """Rating dimension distributions by category (PNG).""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.score_distributions() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).score_distributions() + console.print(f"Saved: [bold]{path}[/]") @viz.command("timeline") -def viz_timeline(): +@pass_cfg_db +def viz_timeline(cfg, db): """Stacked area chart of monthly submissions.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.timeline_chart() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).timeline_chart() + console.print(f"Saved: [bold]{path}[/]") @viz.command("bubble") -def viz_bubble(): +@pass_cfg_db +def viz_bubble(cfg, db): """Interactive bubble chart: novelty vs maturity.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.bubble_explorer() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).bubble_explorer() + console.print(f"Saved: [bold]{path}[/]") @viz.command("radar") -def viz_radar(): +@pass_cfg_db +def viz_radar(cfg, db): """Radar chart of average category rating profiles.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.category_radar() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).category_radar() + console.print(f"Saved: [bold]{path}[/]") @viz.command("network") @click.option("--min-shared", "-n", default=2, help="Minimum shared drafts for an edge") -def viz_network(min_shared: int): +@pass_cfg_db +def viz_network(cfg, db, min_shared: int): """Interactive author collaboration network graph.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.author_network(min_shared=min_shared) - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).author_network(min_shared=min_shared) + console.print(f"Saved: [bold]{path}[/]") @viz.command("treemap") 
-def viz_treemap(): +@pass_cfg_db +def viz_treemap(cfg, db): """Category treemap colored by average score.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.category_treemap() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).category_treemap() + console.print(f"Saved: [bold]{path}[/]") @viz.command("quality") -def viz_quality(): +@pass_cfg_db +def viz_quality(cfg, db): """Score vs uniqueness scatter (quality vs redundancy).""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.score_vs_overlap() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).score_vs_overlap() + console.print(f"Saved: [bold]{path}[/]") @viz.command("orgs") -def viz_orgs(): +@pass_cfg_db +def viz_orgs(cfg, db): """Organization contribution bar chart.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.org_contributions() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).org_contributions() + console.print(f"Saved: [bold]{path}[/]") @viz.command("ideas") -def viz_ideas(): +@pass_cfg_db +def viz_ideas(cfg, db): """Ideas frequency chart by type.""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.ideas_chart() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).ideas_chart() + console.print(f"Saved: [bold]{path}[/]") @viz.command("browser") -def viz_browser(): +@pass_cfg_db +def viz_browser(cfg, db): """Interactive filterable draft browser (standalone HTML).""" from .visualize import Visualizer - cfg = _get_config() - db = Database(cfg) - v = Visualizer(cfg, db) - try: - path = v.draft_browser() - console.print(f"Saved: [bold]{path}[/]") - finally: - db.close() + path = Visualizer(cfg, db).draft_browser() + console.print(f"Saved: [bold]{path}[/]") # ── authors ───────────────────────────────────────────────────────────── @@ -1190,77 +1193,69 @@ def viz_browser(): @click.argument("name", required=False) @click.option("--fetch/--no-fetch", default=False, help="Fetch author data from Datatracker first") @click.option("--limit", "-n", default=20, help="Number of top authors to show") -def authors(name: str | None, fetch: bool, limit: int): +@pass_cfg_db +def authors(cfg, db, name: str | None, fetch: bool, limit: int): """Show authors for a draft, or top authors overall.""" from .authors import AuthorNetwork - cfg = _get_config() - db = Database(cfg) - network = AuthorNetwork(cfg, db) + author_network = AuthorNetwork(cfg, db) - try: - if fetch: - count = network.fetch_all_authors() - console.print(f"Fetched authors for [bold green]{count}[/] drafts") + if fetch: + count = author_network.fetch_all_authors() + console.print(f"Fetched authors for [bold green]{count}[/] drafts") - if name: - draft_authors = db.get_authors_for_draft(name) - if not draft_authors: - console.print(f"[yellow]No author data for {name}. Run `ietf authors --fetch` first.[/]") - return - console.print(f"\n[bold]Authors of {name}:[/]") - for a in draft_authors: - console.print(f" - {a.name} ({a.affiliation or 'no affiliation'})") - else: - top = db.top_authors(limit=limit) - if not top: - console.print("[yellow]No author data. 
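The `viz landscape` command above projects draft embeddings to 2D with t-SNE or UMAP before scattering them by category. A compact sketch of that kind of projection, assuming scikit-learn and matplotlib with stand-in data (the real `landscape_scatter` lives in visualize.py and may differ):

```python
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
vecs = rng.normal(size=(60, 768)).astype(np.float32)  # stand-in embeddings
cats = rng.integers(0, 4, size=60)                    # stand-in category ids

# Project to 2D; perplexity must stay below the number of samples
xy = TSNE(n_components=2, perplexity=15, random_state=0).fit_transform(vecs)

plt.scatter(xy[:, 0], xy[:, 1], c=cats, cmap="tab10", s=18)
plt.title("Draft embedding landscape (t-SNE)")
plt.savefig("landscape.png", dpi=150)
```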
Run `ietf authors --fetch` first.[/]") - return - table = Table(title=f"Top {limit} Authors") - table.add_column("#", justify="right", width=4) - table.add_column("Author", style="cyan") - table.add_column("Organization") - table.add_column("Drafts", justify="right", width=6) - for rank, (aname, aff, cnt, _) in enumerate(top, 1): - table.add_row(str(rank), aname, aff, str(cnt)) - console.print(table) - finally: - db.close() + if name: + draft_authors = db.get_authors_for_draft(name) + if not draft_authors: + console.print(f"[yellow]No author data for {name}. Run `ietf authors --fetch` first.[/]") + return + console.print(f"\n[bold]Authors of {name}:[/]") + for a in draft_authors: + console.print(f" - {a.name} ({a.affiliation or 'no affiliation'})") + else: + top = db.top_authors(limit=limit) + if not top: + console.print("[yellow]No author data. Run `ietf authors --fetch` first.[/]") + return + table = Table(title=f"Top {limit} Authors") + table.add_column("#", justify="right", width=4) + table.add_column("Author", style="cyan") + table.add_column("Organization") + table.add_column("Drafts", justify="right", width=6) + for rank, (aname, aff, cnt, _) in enumerate(top, 1): + table.add_row(str(rank), aname, aff, str(cnt)) + console.print(table) @main.command() @click.option("--top", "-n", default=20, help="Top N to show") -def network(top: int): +@pass_cfg_db +def network(cfg, db, top: int): """Show author collaboration network.""" - cfg = _get_config() - db = Database(cfg) - try: - console.print("\n[bold]Top Organizations[/]") - orgs = db.top_orgs(limit=top) - if orgs: - table = Table() - table.add_column("#", justify="right", width=4) - table.add_column("Organization", style="cyan") - table.add_column("Authors", justify="right", width=8) - table.add_column("Drafts", justify="right", width=6) - for rank, (org, auth_cnt, draft_cnt) in enumerate(orgs, 1): - table.add_row(str(rank), org, str(auth_cnt), str(draft_cnt)) - console.print(table) + console.print("\n[bold]Top Organizations[/]") + orgs = db.top_orgs(limit=top) + if orgs: + table = Table() + table.add_column("#", justify="right", width=4) + table.add_column("Organization", style="cyan") + table.add_column("Authors", justify="right", width=8) + table.add_column("Drafts", justify="right", width=6) + for rank, (org, auth_cnt, draft_cnt) in enumerate(orgs, 1): + table.add_row(str(rank), org, str(auth_cnt), str(draft_cnt)) + console.print(table) - console.print("\n[bold]Cross-Org Collaboration[/]") - cross = db.cross_org_collaborations(limit=top) - if cross: - table = Table() - table.add_column("Org A", style="cyan") - table.add_column("Org B", style="cyan") - table.add_column("Shared Drafts", justify="right", width=8) - for org_a, org_b, shared in cross: - table.add_row(org_a, org_b, str(shared)) - console.print(table) - else: - console.print("[yellow]No author data. Run `ietf authors --fetch` first.[/]") - finally: - db.close() + console.print("\n[bold]Cross-Org Collaboration[/]") + cross = db.cross_org_collaborations(limit=top) + if cross: + table = Table() + table.add_column("Org A", style="cyan") + table.add_column("Org B", style="cyan") + table.add_column("Shared Drafts", justify="right", width=8) + for org_a, org_b, shared in cross: + table.add_row(org_a, org_b, str(shared)) + console.print(table) + else: + console.print("[yellow]No author data. 
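The `network` command reports cross-org collaborations — pairs of organizations whose people co-author drafts. The counting it implies can be sketched from (draft, org) rows; hypothetical data here, while the real query lives in `Database.cross_org_collaborations`:

```python
from collections import defaultdict
from itertools import combinations

# (draft_name, normalized org) rows, e.g. from draft_authors JOIN authors
rows = [
    ("draft-a", "ExampleCorp"), ("draft-a", "OtherOrg"),
    ("draft-b", "ExampleCorp"), ("draft-b", "OtherOrg"),
    ("draft-c", "ExampleCorp"),
]

orgs_per_draft: dict[str, set[str]] = defaultdict(set)
for draft, org in rows:
    orgs_per_draft[draft].add(org)

shared: dict[tuple[str, str], int] = defaultdict(int)
for orgs in orgs_per_draft.values():
    for a, b in combinations(sorted(orgs), 2):
        shared[(a, b)] += 1  # one shared draft for this org pair

print(sorted(shared.items(), key=lambda kv: -kv[1]))
# -> [(('ExampleCorp', 'OtherOrg'), 2)]
```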
Run `ietf authors --fetch` first.[/]") # ── ideas ─────────────────────────────────────────────────────────────── @@ -1274,158 +1269,169 @@ def network(top: int): @click.option("--cheap/--quality", default=True, help="Use Haiku (cheap) vs Sonnet (quality)") @click.option("--reextract", is_flag=True, help="Clear existing ideas and re-extract with current prompt") @click.option("--draft", "reextract_draft", default=None, help="Specific draft to re-extract (with --reextract)") +@click.option("--dry-run", is_flag=True, help="Show what would be extracted without making changes") @click.pass_context def ideas(ctx, name: str | None, extract_all: bool, limit: int, batch: int, cheap: bool, - reextract: bool, reextract_draft: str | None): + reextract: bool, reextract_draft: str | None, dry_run: bool): """Extract, score, and filter technical ideas from drafts.""" if ctx.invoked_subcommand is not None: return - from .analyzer import Analyzer + cfg = ctx.obj["cfg"] + db = ctx.obj["db"] - cfg = _get_config() - db = Database(cfg) + if dry_run: + if reextract: + existing = db.idea_count() + if reextract_draft: + ideas_for = db.get_ideas_for_draft(reextract_draft) + console.print(f"[bold yellow]DRY RUN[/]: Would clear [bold]{len(ideas_for)}[/] ideas for {reextract_draft} and re-extract") + else: + console.print(f"[bold yellow]DRY RUN[/]: Would clear all [bold]{existing}[/] ideas and re-extract from up to {limit} drafts") + elif extract_all: + missing = db.drafts_without_ideas(limit=limit) + console.print(f"[bold yellow]DRY RUN[/]: Would extract ideas from [bold]{len(missing)}[/] drafts (batch={batch}, {'cheap' if cheap else 'quality'})") + for d in missing[:20]: + console.print(f" - {d}") + if len(missing) > 20: + console.print(f" ... and {len(missing) - 20} more") + elif name: + existing = db.get_ideas_for_draft(name) + console.print(f"[bold yellow]DRY RUN[/]: Would extract ideas from {name} (currently has {len(existing)} ideas)") + else: + console.print("Use --name DRAFT, --all, or a subcommand: ideas score / ideas filter") + return + + from .analyzer import Analyzer analyzer = Analyzer(cfg, db) - try: - if reextract: - # Clear existing ideas, then re-extract - deleted = db.delete_ideas(draft_name=reextract_draft) - if reextract_draft: - console.print(f"Cleared [bold]{deleted}[/] ideas for {reextract_draft}") - idea_list = analyzer.extract_ideas(reextract_draft, use_cache=True) - if idea_list: - console.print(f"Re-extracted [bold green]{len(idea_list)}[/] ideas:") - for idea in idea_list: - console.print(f" [{idea.get('type', '?')}] [bold]{idea['title']}[/]") - console.print(f" {idea['description']}\n") - else: - console.print("[red]Re-extraction failed or no ideas found[/]") - else: - console.print(f"Cleared [bold]{deleted}[/] ideas from all drafts") - count = analyzer.extract_all_ideas(limit=limit, batch_size=batch, cheap=cheap) - console.print(f"Re-extracted ideas from [bold green]{count}[/] drafts") - elif extract_all: - count = analyzer.extract_all_ideas(limit=limit, batch_size=batch, cheap=cheap) - console.print(f"Extracted ideas from [bold green]{count}[/] drafts") - elif name: - idea_list = analyzer.extract_ideas(name) + if reextract: + # Clear existing ideas, then re-extract + deleted = db.delete_ideas(draft_name=reextract_draft) + if reextract_draft: + console.print(f"Cleared [bold]{deleted}[/] ideas for {reextract_draft}") + idea_list = analyzer.extract_ideas(reextract_draft, use_cache=True) if idea_list: - console.print(f"\n[bold]Ideas from {name}:[/]\n") + console.print(f"Re-extracted [bold 
green]{len(idea_list)}[/] ideas:") for idea in idea_list: console.print(f" [{idea.get('type', '?')}] [bold]{idea['title']}[/]") console.print(f" {idea['description']}\n") else: - console.print("[red]Extraction failed or no ideas found[/]") + console.print("[red]Re-extraction failed or no ideas found[/]") else: - console.print("Use --name DRAFT, --all, or a subcommand: ideas score / ideas filter") - finally: - db.close() + console.print(f"Cleared [bold]{deleted}[/] ideas from all drafts") + count = analyzer.extract_all_ideas(limit=limit, batch_size=batch, cheap=cheap) + console.print(f"Re-extracted ideas from [bold green]{count}[/] drafts") + elif extract_all: + count = analyzer.extract_all_ideas(limit=limit, batch_size=batch, cheap=cheap) + console.print(f"Extracted ideas from [bold green]{count}[/] drafts") + elif name: + idea_list = analyzer.extract_ideas(name) + if idea_list: + console.print(f"\n[bold]Ideas from {name}:[/]\n") + for idea in idea_list: + console.print(f" [{idea.get('type', '?')}] [bold]{idea['title']}[/]") + console.print(f" {idea['description']}\n") + else: + console.print("[red]Extraction failed or no ideas found[/]") + else: + console.print("Use --name DRAFT, --all, or a subcommand: ideas score / ideas filter") @ideas.command("score") @click.option("--cheap/--quality", default=True, help="Use Haiku (cheap) vs Sonnet (quality)") @click.option("--batch", "-b", default=20, help="Ideas per API call (default 20)") -def ideas_score(cheap: bool, batch: int): +@pass_cfg_db +def ideas_score(cfg, db, cheap: bool, batch: int): """Score ideas for novelty (1=generic, 5=genuinely novel).""" from .analyzer import Analyzer - cfg = _get_config() - db = Database(cfg) analyzer = Analyzer(cfg, db) + stats = analyzer.score_idea_novelty(batch_size=batch, cheap=cheap) - try: - stats = analyzer.score_idea_novelty(batch_size=batch, cheap=cheap) + if stats["scored_count"] == 0: + return - if stats["scored_count"] == 0: - return + # Show distribution table + dist = db.idea_score_distribution() + table = Table(title="Novelty Score Distribution") + table.add_column("Score", style="bold", justify="center") + table.add_column("Label", style="dim") + table.add_column("Count", justify="right") + table.add_column("Bar", min_width=30) - # Show distribution table - dist = db.idea_score_distribution() - table = Table(title="Novelty Score Distribution") - table.add_column("Score", style="bold", justify="center") - table.add_column("Label", style="dim") - table.add_column("Count", justify="right") - table.add_column("Bar", min_width=30) + labels = { + 1: "Generic building block", + 2: "Obvious extension", + 3: "Useful but expected", + 4: "Interesting contribution", + 5: "Genuinely novel", + } + max_count = max(dist.values()) if dist else 1 + for score in range(1, 6): + count = dist.get(score, 0) + bar_len = int(30 * count / max_count) if max_count > 0 else 0 + table.add_row( + str(score), labels[score], str(count), + "[green]" + "#" * bar_len + "[/]" + ) - labels = { - 1: "Generic building block", - 2: "Obvious extension", - 3: "Useful but expected", - 4: "Interesting contribution", - 5: "Genuinely novel", - } - max_count = max(dist.values()) if dist else 1 - for score in range(1, 6): - count = dist.get(score, 0) - bar_len = int(30 * count / max_count) if max_count > 0 else 0 - table.add_row( - str(score), labels[score], str(count), - "[green]" + "#" * bar_len + "[/]" - ) - - total = sum(dist.values()) - unscored = db.idea_count() - total - console.print(table) - console.print(f"\nTotal scored: 
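The distribution table scales each histogram bar to the most common score: `bar_len = int(30 * count / max_count)`, so the widest row always gets 30 characters. A tiny standalone rendering with hypothetical counts:

```python
dist = {1: 40, 2: 55, 3: 120, 4: 30, 5: 8}  # hypothetical score counts
max_count = max(dist.values())
for score in range(1, 6):
    count = dist.get(score, 0)
    bar = "#" * int(30 * count / max_count)  # widest row gets 30 chars
    print(f"{score}  {count:4d}  {bar}")
```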
[bold]{total}[/] | Unscored: {unscored} | Avg: [bold]{stats['avg_score']:.1f}[/]") - finally: - db.close() + total = sum(dist.values()) + unscored = db.idea_count() - total + console.print(table) + console.print(f"\nTotal scored: [bold]{total}[/] | Unscored: {unscored} | Avg: [bold]{stats['avg_score']:.1f}[/]") @ideas.command("filter") @click.option("--min-score", "-m", default=2, help="Remove ideas below this score (default 2)") @click.option("--dry-run/--execute", default=True, help="Preview (default) or actually delete") -def ideas_filter(min_score: int, dry_run: bool): +@pass_cfg_db +def ideas_filter(cfg, db, min_score: int, dry_run: bool): """Filter out low-novelty ideas by score threshold.""" - cfg = _get_config() - db = Database(cfg) + candidates = db.ideas_below_score(min_score) + if not candidates: + console.print(f"No ideas with novelty_score < {min_score}.") + return - try: - candidates = db.ideas_below_score(min_score) - if not candidates: - console.print(f"No ideas with novelty_score < {min_score}.") - return + # Show what would be removed + table = Table( + title=f"Ideas with novelty_score < {min_score} " + f"({'DRY RUN' if dry_run else 'WILL DELETE'})" + ) + table.add_column("Score", style="bold", justify="center") + table.add_column("Idea", style="cyan", max_width=40) + table.add_column("Draft", max_width=50) + table.add_column("Description", max_width=60) - # Show what would be removed - table = Table( - title=f"Ideas with novelty_score < {min_score} " - f"({'DRY RUN' if dry_run else 'WILL DELETE'})" + for idea in candidates[:50]: # Show first 50 + table.add_row( + str(idea["novelty_score"]), + idea["title"], + idea["draft_title"], + idea["description"][:60] + ("..." if len(idea["description"]) > 60 else ""), ) - table.add_column("Score", style="bold", justify="center") - table.add_column("Idea", style="cyan", max_width=40) - table.add_column("Draft", max_width=50) - table.add_column("Description", max_width=60) - for idea in candidates[:50]: # Show first 50 - table.add_row( - str(idea["novelty_score"]), - idea["title"], - idea["draft_title"], - idea["description"][:60] + ("..." if len(idea["description"]) > 60 else ""), - ) + console.print(table) - console.print(table) + if len(candidates) > 50: + console.print(f" ... and {len(candidates) - 50} more") - if len(candidates) > 50: - console.print(f" ... 
and {len(candidates) - 50} more") + console.print(f"\nTotal to remove: [bold red]{len(candidates)}[/] / {db.idea_count()} ideas") - console.print(f"\nTotal to remove: [bold red]{len(candidates)}[/] / {db.idea_count()} ideas") - - if not dry_run: - deleted = db.delete_low_score_ideas(min_score) - console.print(f"[bold red]Deleted {deleted} low-novelty ideas.[/]") - console.print(f"Remaining ideas: [bold green]{db.idea_count()}[/]") - else: - console.print("[dim]Use --execute to actually delete.[/]") - finally: - db.close() + if not dry_run: + deleted = db.delete_low_score_ideas(min_score) + console.print(f"[bold red]Deleted {deleted} low-novelty ideas.[/]") + console.print(f"Remaining ideas: [bold green]{db.idea_count()}[/]") + else: + console.print("[dim]Use --execute to actually delete.[/]") @ideas.command("convergence") @click.option("--threshold", "-t", default=0.75, help="SequenceMatcher ratio threshold (0-1)") @click.option("--limit", "-n", default=50, help="Max results to show") @click.option("--list-all", is_flag=True, help="List all convergent idea pairs") -def ideas_convergence(threshold: float, limit: int, list_all: bool): +@pass_cfg_db +def ideas_convergence(cfg, db, threshold: float, limit: int, list_all: bool): """Find cross-org convergent ideas using SequenceMatcher fuzzy matching. Groups ideas by fuzzy title similarity, then filters to ideas where @@ -1435,87 +1441,80 @@ def ideas_convergence(threshold: float, limit: int, list_all: bool): from difflib import SequenceMatcher from .orgs import normalize_org - cfg = _get_config() - db = Database(cfg) + all_ideas = db.all_ideas() + if not all_ideas: + console.print("[yellow]No ideas extracted yet. Run `ietf ideas --all` first.[/]") + return - try: - all_ideas = db.all_ideas() - if not all_ideas: - console.print("[yellow]No ideas extracted yet. 
Run `ietf ideas --all` first.[/]") - return + # Build draft -> org mapping + draft_orgs: dict[str, set[str]] = defaultdict(set) + rows = db.conn.execute( + """SELECT da.draft_name, a.affiliation + FROM draft_authors da + JOIN authors a ON da.person_id = a.person_id + WHERE a.affiliation != ''""" + ).fetchall() + for r in rows: + org = normalize_org(r["affiliation"]) + if org and org != "Independent": + draft_orgs[r["draft_name"]].add(org) - # Build draft -> org mapping - draft_orgs: dict[str, set[str]] = defaultdict(set) - rows = db.conn.execute( - """SELECT da.draft_name, a.affiliation - FROM draft_authors da - JOIN authors a ON da.person_id = a.person_id - WHERE a.affiliation != ''""" - ).fetchall() - for r in rows: - org = normalize_org(r["affiliation"]) - if org and org != "Independent": - draft_orgs[r["draft_name"]].add(org) + # Group similar ideas by fuzzy title matching + idea_groups: list[dict] = [] + for idea in all_ideas: + title_lower = idea["title"].lower().strip() + matched = False + for group in idea_groups: + ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() + if ratio >= threshold: + group["ideas"].append(idea) + group["drafts"].add(idea["draft_name"]) + group["orgs"].update(draft_orgs.get(idea["draft_name"], set())) + matched = True + break + if not matched: + idea_groups.append({ + "canonical": title_lower, + "title": idea["title"], + "ideas": [idea], + "drafts": {idea["draft_name"]}, + "orgs": set(draft_orgs.get(idea["draft_name"], set())), + }) - # Group similar ideas by fuzzy title matching - idea_groups: list[dict] = [] - for idea in all_ideas: - title_lower = idea["title"].lower().strip() - matched = False - for group in idea_groups: - ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() - if ratio >= threshold: - group["ideas"].append(idea) - group["drafts"].add(idea["draft_name"]) - group["orgs"].update(draft_orgs.get(idea["draft_name"], set())) - matched = True - break - if not matched: - idea_groups.append({ - "canonical": title_lower, - "title": idea["title"], - "ideas": [idea], - "drafts": {idea["draft_name"]}, - "orgs": set(draft_orgs.get(idea["draft_name"], set())), - }) + # Filter to cross-org ideas (2+ orgs) + cross_org = [g for g in idea_groups if len(g["orgs"]) >= 2] + cross_org.sort(key=lambda g: (-len(g["orgs"]), -len(g["drafts"]))) - # Filter to cross-org ideas (2+ orgs) - cross_org = [g for g in idea_groups if len(g["orgs"]) >= 2] - cross_org.sort(key=lambda g: (-len(g["orgs"]), -len(g["drafts"]))) + console.print(f"\n[bold]Cross-Organization Idea Convergence[/]") + console.print(f"Threshold: {threshold} | {len(all_ideas)} ideas | " + f"{len(idea_groups)} unique clusters | " + f"[bold green]{len(cross_org)}[/] cross-org convergent\n") - console.print(f"\n[bold]Cross-Organization Idea Convergence[/]") - console.print(f"Threshold: {threshold} | {len(all_ideas)} ideas | " - f"{len(idea_groups)} unique clusters | " - f"[bold green]{len(cross_org)}[/] cross-org convergent\n") + if not cross_org: + console.print("[yellow]No cross-org convergent ideas at this threshold.[/]") + return - if not cross_org: - console.print("[yellow]No cross-org convergent ideas at this threshold.[/]") - return + show_n = len(cross_org) if list_all else min(limit, len(cross_org)) + table = Table(title=f"Cross-Org Convergent Ideas (showing {show_n} of {len(cross_org)})") + table.add_column("#", justify="right", width=4) + table.add_column("Idea", style="bold", max_width=40) + table.add_column("Orgs", justify="right", width=5) + 
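The grouping loop above is a greedy single-pass clustering: each idea joins the first existing group whose canonical title matches at `ratio >= threshold`, otherwise it seeds a new group — which also means results depend on input order. A self-contained illustration of the matching primitive:

```python
from difflib import SequenceMatcher

titles = [
    "Agent identity tokens",
    "agent identity token",   # near-duplicate, should join the first group
    "Capability discovery",
]

groups: list[dict] = []
for t in titles:
    key = t.lower().strip()
    for g in groups:
        if SequenceMatcher(None, key, g["canonical"]).ratio() >= 0.75:
            g["members"].append(t)
            break
    else:
        groups.append({"canonical": key, "members": [t]})

for g in groups:
    print(g["canonical"], "->", g["members"])
# agent identity tokens -> ['Agent identity tokens', 'agent identity token']
# capability discovery -> ['Capability discovery']
```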
table.add_column("Drafts", justify="right", width=6) + table.add_column("Organizations", max_width=50) - show_n = len(cross_org) if list_all else min(limit, len(cross_org)) - table = Table(title=f"Cross-Org Convergent Ideas (showing {show_n} of {len(cross_org)})") - table.add_column("#", justify="right", width=4) - table.add_column("Idea", style="bold", max_width=40) - table.add_column("Orgs", justify="right", width=5) - table.add_column("Drafts", justify="right", width=6) - table.add_column("Organizations", max_width=50) + for rank, g in enumerate(cross_org[:show_n], 1): + org_list = ", ".join(sorted(g["orgs"])[:5]) + if len(g["orgs"]) > 5: + org_list += f" +{len(g['orgs']) - 5}" + table.add_row( + str(rank), g["title"][:40], str(len(g["orgs"])), + str(len(g["drafts"])), org_list, + ) - for rank, g in enumerate(cross_org[:show_n], 1): - org_list = ", ".join(sorted(g["orgs"])[:5]) - if len(g["orgs"]) > 5: - org_list += f" +{len(g['orgs']) - 5}" - table.add_row( - str(rank), g["title"][:40], str(len(g["orgs"])), - str(len(g["drafts"])), org_list, - ) - - console.print(table) - console.print(f"\n[bold]Summary[/]: {len(cross_org)} cross-org convergent ideas " - f"out of {len(idea_groups)} unique clusters " - f"({100 * len(cross_org) / len(idea_groups):.0f}%)") - - finally: - db.close() + console.print(table) + console.print(f"\n[bold]Summary[/]: {len(cross_org)} cross-org convergent ideas " + f"out of {len(idea_groups)} unique clusters " + f"({100 * len(cross_org) / len(idea_groups):.0f}%)") # ── dedup-ideas ───────────────────────────────────────────────────────── @@ -1528,90 +1527,92 @@ def ideas_convergence(threshold: float, limit: int, list_all: bool): help="Preview merges (default) vs actually delete duplicates") @click.option("--draft", "draft_name", default=None, help="Limit to a single draft name") -def dedup_ideas(threshold: float, dry_run: bool, draft_name: str | None): +@pass_cfg_db +def dedup_ideas(cfg, db, threshold: float, dry_run: bool, draft_name: str | None): """Deduplicate similar ideas within each draft using embedding similarity.""" from .analyzer import Analyzer - cfg = _get_config() - db = Database(cfg) analyzer = Analyzer(cfg, db) + mode = "[bold yellow]DRY RUN[/]" if dry_run else "[bold red]EXECUTE[/]" + console.print(f"\n{mode} — Deduplicating ideas (threshold={threshold})") + if draft_name: + console.print(f"Limiting to draft: [bold]{draft_name}[/]") + console.print() - try: - mode = "[bold yellow]DRY RUN[/]" if dry_run else "[bold red]EXECUTE[/]" - console.print(f"\n{mode} — Deduplicating ideas (threshold={threshold})") - if draft_name: - console.print(f"Limiting to draft: [bold]{draft_name}[/]") + result = analyzer.dedup_ideas( + threshold=threshold, dry_run=dry_run, draft_name=draft_name + ) + + if result["examples"]: + table = Table(title="Merge Candidates" if dry_run else "Merged Ideas") + table.add_column("Draft", style="dim", max_width=40) + table.add_column("Keep", style="green") + table.add_column("Drop", style="red") + table.add_column("Similarity", justify="right") + + for ex in result["examples"]: + table.add_row( + ex["draft"].split("/")[-1][:40], + ex["keep"], + ex["drop"], + f"{ex['similarity']:.3f}", + ) + console.print(table) console.print() - result = analyzer.dedup_ideas( - threshold=threshold, dry_run=dry_run, draft_name=draft_name - ) + action = "Would remove" if dry_run else "Removed" + console.print( + f"Ideas before: [bold]{result['total_before']}[/] | " + f"{action}: [bold]{result['merged_count']}[/] | " + f"After: 
[bold]{result['total_after']}[/]"
+    )
 
-        if result["examples"]:
-            table = Table(title="Merge Candidates" if dry_run else "Merged Ideas")
-            table.add_column("Draft", style="dim", max_width=40)
-            table.add_column("Keep", style="green")
-            table.add_column("Drop", style="red")
-            table.add_column("Similarity", justify="right")
-
-            for ex in result["examples"]:
-                table.add_row(
-                    ex["draft"].split("/")[-1][:40],
-                    ex["keep"],
-                    ex["drop"],
-                    f"{ex['similarity']:.3f}",
-                )
-            console.print(table)
-            console.print()
-
-            action = "Would remove" if dry_run else "Removed"
+    if dry_run and result["merged_count"] > 0:
         console.print(
-            f"Ideas before: [bold]{result['total_before']}[/] | "
-            f"{action}: [bold]{result['merged_count']}[/] | "
-            f"After: [bold]{result['total_after']}[/]"
+            "\n[dim]Run with --execute to apply these merges.[/]"
         )
 
-        if dry_run and result["merged_count"] > 0:
-            console.print(
-                "\n[dim]Run with --execute to apply these merges.[/]"
-            )
-    finally:
-        db.close()
-
 
 # ── gaps ────────────────────────────────────────────────────────────────
 
 
 @main.command()
 @click.option("--refresh", is_flag=True, help="Re-run gap analysis even if cached")
-def gaps(refresh: bool):
+@click.option("--dry-run", is_flag=True, help="Show existing gaps without running analysis")
+@pass_cfg_db
+def gaps(cfg, db, refresh: bool, dry_run: bool):
     """Identify gaps in the current draft landscape using Claude."""
-    from .analyzer import Analyzer
     from .reports import Reporter
 
-    cfg = _get_config()
-    db = Database(cfg)
+    if dry_run:
+        existing = db.all_gaps()
+        console.print(f"[bold yellow]DRY RUN[/]: {len(existing)} gaps currently identified")
+        if refresh:
+            console.print("  Would re-run gap analysis via Claude API")
+        for i, gap in enumerate(existing if existing and isinstance(existing[0], dict) else [], 1):
+            sev = gap.get("severity", "medium").upper()
+            console.print(f"  [bold]{i}. {gap['topic']}[/] [{sev}]")
+        return
+
+    from .analyzer import Analyzer
     analyzer = Analyzer(cfg, db)
     reporter = Reporter(cfg, db)
 
-    try:
-        existing = db.all_gaps()
-        if existing and not refresh:
-            console.print(f"[bold]{len(existing)} gaps[/] already identified (use --refresh to re-run)\n")
-        else:
-            gap_list = analyzer.gap_analysis()
-            console.print(f"\nIdentified [bold green]{len(gap_list)}[/] gaps\n")
-            existing = gap_list
+    existing = db.all_gaps()
+    if existing and not refresh:
+        console.print(f"[bold]{len(existing)} gaps[/] already identified (use --refresh to re-run)\n")
+    else:
+        gap_list = analyzer.gap_analysis()
+        console.print(f"\nIdentified [bold green]{len(gap_list)}[/] gaps\n")
+        existing = gap_list
 
-        for i, gap in enumerate(existing if isinstance(existing[0], dict) else [], 1):
-            sev = gap.get("severity", "medium").upper()
-            console.print(f"  [bold]{i}. {gap['topic']}[/] [{sev}]")
-            console.print(f"        {gap['description'][:100]}\n")
+    for i, gap in enumerate(existing if existing and isinstance(existing[0], dict) else [], 1):
+        sev = gap.get("severity", "medium").upper()
+        console.print(f"  [bold]{i}. 
{gap['topic']}[/] [{sev}]") + console.print(f" {gap['description'][:100]}\n") - path = reporter.gaps_report() - console.print(f"Report saved: [bold]{path}[/]") - finally: - db.close() + path = reporter.gaps_report() + console.print(f"Report saved: [bold]{path}[/]") # ── refs ──────────────────────────────────────────────────────────────── @@ -1623,69 +1624,64 @@ def gaps(refresh: bool): @click.option("--top", "-n", default=30, help="Number of top-referenced items to show") @click.option("--type", "ref_type", default="rfc", type=click.Choice(["rfc", "draft", "bcp"]), help="Reference type to show top results for") -def refs(name: str | None, extract: bool, top: int, ref_type: str): +@pass_cfg_db +def refs(cfg, db, name: str | None, extract: bool, top: int, ref_type: str): """Parse and show cross-references (RFCs, drafts, BCPs) in draft texts.""" import re - cfg = _get_config() - db = Database(cfg) - - try: - if extract: - missing = db.drafts_without_refs() - if not missing: - console.print("[green]All drafts with text already have refs extracted.[/]") - else: - console.print(f"Extracting refs from [bold]{len(missing)}[/] drafts...") - extracted = 0 - for draft_name in missing: - draft = db.get_draft(draft_name) - if not draft or not draft.full_text: - continue - found_refs = _extract_refs(draft.full_text, draft.name) - if found_refs: - db.insert_refs(draft_name, found_refs) - extracted += 1 - console.print(f"Extracted refs from [bold green]{extracted}[/] drafts") - - if name: - # Show refs for a specific draft - draft_refs = db.get_refs_for_draft(name) - if not draft_refs: - console.print(f"[yellow]No refs found for {name}. Run `ietf refs --extract` first.[/]") - return - table = Table(title=f"References in {name}") - table.add_column("Type", style="dim", width=6) - table.add_column("Reference", style="cyan") - for rt, rid in sorted(draft_refs): - table.add_row(rt.upper(), rid) - console.print(table) + if extract: + missing = db.drafts_without_refs() + if not missing: + console.print("[green]All drafts with text already have refs extracted.[/]") else: - # Show top-referenced items - stats = db.ref_stats() - if stats["total_refs"] == 0: - console.print("[yellow]No refs extracted yet. Run `ietf refs --extract` first.[/]") - return + console.print(f"Extracting refs from [bold]{len(missing)}[/] drafts...") + extracted = 0 + for draft_name in missing: + draft = db.get_draft(draft_name) + if not draft or not draft.full_text: + continue + found_refs = _extract_refs(draft.full_text, draft.name) + if found_refs: + db.insert_refs(draft_name, found_refs) + extracted += 1 + console.print(f"Extracted refs from [bold green]{extracted}[/] drafts") - console.print(f"\n[bold]Reference Stats[/]: {stats['drafts_with_refs']} drafts, " - f"{stats['total_refs']} total refs " - f"({stats['rfc_refs']} RFC, {stats['draft_refs']} draft, {stats['bcp_refs']} BCP)\n") + if name: + # Show refs for a specific draft + draft_refs = db.get_refs_for_draft(name) + if not draft_refs: + console.print(f"[yellow]No refs found for {name}. Run `ietf refs --extract` first.[/]") + return + table = Table(title=f"References in {name}") + table.add_column("Type", style="dim", width=6) + table.add_column("Reference", style="cyan") + for rt, rid in sorted(draft_refs): + table.add_row(rt.upper(), rid) + console.print(table) + else: + # Show top-referenced items + stats = db.ref_stats() + if stats["total_refs"] == 0: + console.print("[yellow]No refs extracted yet. 
Run `ietf refs --extract` first.[/]") + return - top_items = db.top_referenced(ref_type=ref_type, limit=top) - table = Table(title=f"Top {len(top_items)} Most-Referenced {ref_type.upper()}s") - table.add_column("#", justify="right", width=4) - table.add_column("Reference", style="cyan", width=30) - table.add_column("Count", justify="right", width=6) - table.add_column("Referenced By", max_width=60) - for rank, (rid, cnt, drafts) in enumerate(top_items, 1): - label = f"RFC {rid}" if ref_type == "rfc" else rid - draft_list = ", ".join(d.replace("draft-", "")[:25] for d in drafts[:4]) - if len(drafts) > 4: - draft_list += f" +{len(drafts) - 4}" - table.add_row(str(rank), label, str(cnt), draft_list) - console.print(table) - finally: - db.close() + console.print(f"\n[bold]Reference Stats[/]: {stats['drafts_with_refs']} drafts, " + f"{stats['total_refs']} total refs " + f"({stats['rfc_refs']} RFC, {stats['draft_refs']} draft, {stats['bcp_refs']} BCP)\n") + + top_items = db.top_referenced(ref_type=ref_type, limit=top) + table = Table(title=f"Top {len(top_items)} Most-Referenced {ref_type.upper()}s") + table.add_column("#", justify="right", width=4) + table.add_column("Reference", style="cyan", width=30) + table.add_column("Count", justify="right", width=6) + table.add_column("Referenced By", max_width=60) + for rank, (rid, cnt, drafts) in enumerate(top_items, 1): + label = f"RFC {rid}" if ref_type == "rfc" else rid + draft_list = ", ".join(d.replace("draft-", "")[:25] for d in drafts[:4]) + if len(drafts) > 4: + draft_list += f" +{len(drafts) - 4}" + table.add_row(str(rank), label, str(cnt), draft_list) + console.print(table) def _extract_refs(text: str, self_name: str) -> list[tuple[str, str]]: @@ -1720,123 +1716,117 @@ def _extract_refs(text: str, self_name: str) -> list[tuple[str, str]]: @main.command() @click.option("--category", "-c", help="Filter to a specific category") @click.option("--json-out", is_flag=True, help="Also output JSON for visualization") -def trends(category: str | None, json_out: bool): +@pass_cfg_db +def trends(cfg, db, category: str | None, json_out: bool): """Show category trend analysis — monthly breakdown with growth rates.""" import json as json_mod from collections import defaultdict - cfg = _get_config() - db = Database(cfg) + pairs = db.drafts_with_ratings(limit=500) + all_drafts = db.list_drafts(limit=500, order_by="time ASC") - try: - pairs = db.drafts_with_ratings(limit=500) - all_drafts = db.list_drafts(limit=500, order_by="time ASC") + if not pairs: + console.print("[yellow]No rated drafts. Run `ietf analyze --all` first.[/]") + return - if not pairs: - console.print("[yellow]No rated drafts. 
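`_extract_refs` returns `(ref_type, ref_id)` tuples for the RFC, draft, and BCP mentions in a draft's text, excluding the draft's own name. The body is elided from this hunk; a hedged sketch of what such regex-based extraction could look like (the real patterns may differ):

```python
import re

def extract_refs_sketch(text: str, self_name: str) -> list[tuple[str, str]]:
    """Rough sketch of (ref_type, ref_id) extraction; illustrative only."""
    refs: set[tuple[str, str]] = set()
    for num in re.findall(r"\bRFC[\s-]?(\d{3,5})\b", text, re.IGNORECASE):
        refs.add(("rfc", num))
    for name in re.findall(r"\bdraft(?:-[a-z0-9]+)+", text):
        name = re.sub(r"-\d{2}$", "", name)  # strip revision suffix like -03
        if name != self_name:
            refs.add(("draft", name))
    for num in re.findall(r"\bBCP[\s-]?(\d{1,4})\b", text, re.IGNORECASE):
        refs.add(("bcp", num))
    return sorted(refs)

print(extract_refs_sketch(
    "Updates RFC 9110 and draft-ietf-ai-agent-protocol-03; see BCP 14.",
    "draft-example-self",
))
# [('bcp', '14'), ('draft', 'draft-ietf-ai-agent-protocol'), ('rfc', '9110')]
```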
Run `ietf analyze --all` first.[/]") - return + # Build rating lookup + rating_map = {draft.name: rating for draft, rating in pairs} - # Build rating lookup - rating_map = {draft.name: rating for draft, rating in pairs} + # Collect monthly counts per category + monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + all_cats: set[str] = set() + for d in all_drafts: + month = d.time[:7] if d.time else "unknown" + r = rating_map.get(d.name) + if r: + for c in r.categories: + if category and c.lower() != category.lower(): + continue + monthly[month][c] += 1 + all_cats.add(c) - # Collect monthly counts per category - monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) - all_cats: set[str] = set() - for d in all_drafts: - month = d.time[:7] if d.time else "unknown" - r = rating_map.get(d.name) - if r: - for c in r.categories: - if category and c.lower() != category.lower(): - continue - monthly[month][c] += 1 - all_cats.add(c) + if not all_cats: + console.print(f"[yellow]No data for category '{category}'[/]" if category + else "[yellow]No category data found.[/]") + return - if not all_cats: - console.print(f"[yellow]No data for category '{category}'[/]" if category - else "[yellow]No category data found.[/]") - return + months = sorted(m for m in monthly.keys() if m != "unknown") + cats = sorted(all_cats) - months = sorted(m for m in monthly.keys() if m != "unknown") - cats = sorted(all_cats) - - # Compute cumulative and growth - rows_data = [] - cumulative: dict[str, int] = defaultdict(int) - prev_count: dict[str, int] = defaultdict(int) - - for month in months: - for cat in cats: - count = monthly[month].get(cat, 0) - cumulative[cat] += count - growth = 0.0 - if prev_count[cat] > 0: - growth = ((count - prev_count[cat]) / prev_count[cat]) * 100 - rows_data.append({ - "month": month, - "category": cat, - "count": count, - "cumulative": cumulative[cat], - "growth_rate": growth, - }) - prev_count[cat] = count - - # Display summary table - console.print(f"\n[bold]Category Trends[/] — {len(months)} months, {len(cats)} categories\n") - - # Show per-category totals and recent momentum - table = Table(title="Category Growth Summary") - table.add_column("Category", style="cyan") - table.add_column("Total", justify="right", width=6) - table.add_column("Last 3mo", justify="right", width=8) - table.add_column("Prev 3mo", justify="right", width=8) - table.add_column("Growth", justify="right", width=8) - - recent_months = months[-3:] if len(months) >= 3 else months - prev_months = months[-6:-3] if len(months) >= 6 else [] + # Compute cumulative and growth + rows_data = [] + cumulative: dict[str, int] = defaultdict(int) + prev_count: dict[str, int] = defaultdict(int) + for month in months: for cat in cats: - total = cumulative[cat] - recent = sum(monthly[m].get(cat, 0) for m in recent_months) - prev = sum(monthly[m].get(cat, 0) for m in prev_months) if prev_months else 0 - if prev > 0: - growth_str = f"{((recent - prev) / prev) * 100:+.0f}%" - elif recent > 0: - growth_str = "new" - else: - growth_str = "-" - table.add_row(cat, str(total), str(recent), str(prev) if prev_months else "-", growth_str) + count = monthly[month].get(cat, 0) + cumulative[cat] += count + growth = 0.0 + if prev_count[cat] > 0: + growth = ((count - prev_count[cat]) / prev_count[cat]) * 100 + rows_data.append({ + "month": month, + "category": cat, + "count": count, + "cumulative": cumulative[cat], + "growth_rate": growth, + }) + prev_count[cat] = count - console.print(table) + # Display 
summary table
+    console.print(f"\n[bold]Category Trends[/] — {len(months)} months, {len(cats)} categories\n")
-        # Monthly detail
-        console.print(f"\n[bold]Monthly Breakdown[/]\n")
-        detail_table = Table()
-        detail_table.add_column("Month", style="dim", width=8)
+    # Show per-category totals and recent momentum
+    table = Table(title="Category Growth Summary")
+    table.add_column("Category", style="cyan")
+    table.add_column("Total", justify="right", width=6)
+    table.add_column("Last 3mo", justify="right", width=8)
+    table.add_column("Prev 3mo", justify="right", width=8)
+    table.add_column("Growth", justify="right", width=8)
+
+    recent_months = months[-3:] if len(months) >= 3 else months
+    prev_months = months[-6:-3] if len(months) >= 6 else []
+
+    for cat in cats:
+        total = cumulative[cat]
+        recent = sum(monthly[m].get(cat, 0) for m in recent_months)
+        prev = sum(monthly[m].get(cat, 0) for m in prev_months) if prev_months else 0
+        if prev > 0:
+            growth_str = f"{((recent - prev) / prev) * 100:+.0f}%"
+        elif recent > 0:
+            growth_str = "new"
+        else:
+            growth_str = "-"
+        table.add_row(cat, str(total), str(recent), str(prev) if prev_months else "-", growth_str)
+
+    console.print(table)
+
+    # Monthly detail
+    console.print("\n[bold]Monthly Breakdown[/]\n")
+    detail_table = Table()
+    detail_table.add_column("Month", style="dim", width=8)
+    for cat in cats:
+        detail_table.add_column(cat[:14], justify="right", width=max(6, len(cat[:14])))
+    detail_table.add_column("Total", justify="right", width=6, style="bold")
+
+    for month in months:
+        row = [month]
+        total = 0
         for cat in cats:
-            detail_table.add_column(cat[:14], justify="right", width=max(6, len(cat[:14])))
-        detail_table.add_column("Total", justify="right", width=6, style="bold")
+            c = monthly[month].get(cat, 0)
+            total += c
+            row.append(str(c) if c else "")
+        row.append(str(total))
+        detail_table.add_row(*row)
-        for month in months:
-            row = [month]
-            total = 0
-            for cat in cats:
-                c = monthly[month].get(cat, 0)
-                total += c
-                row.append(str(c) if c else "")
-            row.append(str(total))
-            detail_table.add_row(*row)
+    console.print(detail_table)
-        console.print(detail_table)
-
-        # Optional JSON output
-        if json_out:
-            out_path = Path(cfg.data_dir) / "reports" / "trends.json"
-            out_path.write_text(json_mod.dumps(rows_data, indent=2))
-            console.print(f"\nJSON saved: [bold]{out_path}[/]")
-
-    finally:
-        db.close()
+    # Optional JSON output
+    if json_out:
+        out_path = Path(cfg.data_dir) / "reports" / "trends.json"
+        out_path.write_text(json_mod.dumps(rows_data, indent=2))
+        console.print(f"\nJSON saved: [bold]{out_path}[/]")


# ── status ──────────────────────────────────────────────────────────────

diff --git a/src/ietf_analyzer/config.py b/src/ietf_analyzer/config.py
index 6c9c958..50f1d38 100644
--- a/src/ietf_analyzer/config.py
+++ b/src/ietf_analyzer/config.py
@@ -13,16 +13,15 @@ CONFIG_FILE = DEFAULT_DATA_DIR / "config.json"
 DEFAULT_KEYWORDS = [
     "agent",
     "ai-agent",
-    "llm",
-    "autonomous",
-    "machine-learning",
-    "artificial-intelligence",
-    "mcp",
     "agentic",
+    "autonomous",
+    "mcp",
     "inference",
     "generative",
     "intelligent",
-    "aipref",
+    "large language model",
+    "multi-agent",
+    "trustworth",
 ]

 # Environment variable overrides (env var name -> config field name)
@@ -39,6 +38,7 @@ class Config:
     db_path: str = str(DEFAULT_DATA_DIR / "drafts.db")
     ollama_url: str = "http://localhost:11434"
     ollama_embed_model: str = "nomic-embed-text"
+    ollama_classify_model: str = "llama3.2"
     claude_model: str = "claude-sonnet-4-20250514"
     claude_model_cheap: str = 
"claude-haiku-4-5-20251001" search_keywords: list[str] = field(default_factory=lambda: list(DEFAULT_KEYWORDS)) diff --git a/src/ietf_analyzer/db.py b/src/ietf_analyzer/db.py index 0233700..31611e1 100644 --- a/src/ietf_analyzer/db.py +++ b/src/ietf_analyzer/db.py @@ -326,6 +326,23 @@ class Database: return None return self._row_to_draft(row) + def get_drafts_by_names(self, names: list[str]) -> dict[str, "Draft"]: + """Batch-fetch drafts by name. Returns {name: Draft} dict.""" + if not names: + return {} + result = {} + # SQLite has a variable limit (~999), so chunk if needed + for i in range(0, len(names), 900): + chunk = names[i : i + 900] + placeholders = ",".join("?" for _ in chunk) + rows = self.conn.execute( + f"SELECT * FROM drafts WHERE name IN ({placeholders})", chunk + ).fetchall() + for r in rows: + d = self._row_to_draft(r) + result[d.name] = d + return result + def list_drafts( self, limit: int = 100, diff --git a/src/ietf_analyzer/embeddings.py b/src/ietf_analyzer/embeddings.py index feae4de..1e7b5db 100644 --- a/src/ietf_analyzer/embeddings.py +++ b/src/ietf_analyzer/embeddings.py @@ -2,6 +2,10 @@ from __future__ import annotations +import hashlib +import json +from pathlib import Path + import numpy as np import ollama as ollama_lib from rich.console import Console @@ -111,16 +115,49 @@ class Embedder: return similarities[:top_n] def similarity_matrix(self) -> tuple[list[str], np.ndarray]: - """Compute pairwise similarity matrix for all embedded drafts.""" + """Compute pairwise similarity matrix for all embedded drafts. + + Uses a file-based cache keyed by the hash of embedding draft names. + If the set of embedded drafts hasn't changed, the cached matrix is + reloaded from disk instead of recomputing O(n^2) cosine similarities. + """ all_embeddings = self.db.all_embeddings() names = sorted(all_embeddings.keys()) n = len(names) + + # Build cache key from sorted draft names + names_hash = hashlib.sha256("\n".join(names).encode()).hexdigest()[:16] + cache_dir = Path(self.config.db_path).parent / ".cache" + cache_meta = cache_dir / f"sim_matrix_{names_hash}.json" + cache_npy = cache_dir / f"sim_matrix_{names_hash}.npy" + + # Try loading from cache + if cache_meta.exists() and cache_npy.exists(): + try: + cached_names = json.loads(cache_meta.read_text()) + if cached_names == names: + matrix = np.load(cache_npy) + if matrix.shape == (n, n): + return names, matrix + except Exception: + pass # Cache corrupted, recompute + + # Compute fresh matrix = np.zeros((n, n), dtype=np.float32) for i in range(n): for j in range(i, n): sim = _cosine_similarity(all_embeddings[names[i]], all_embeddings[names[j]]) matrix[i, j] = sim matrix[j, i] = sim + + # Save to cache + try: + cache_dir.mkdir(exist_ok=True) + np.save(cache_npy, matrix) + cache_meta.write_text(json.dumps(names)) + except Exception: + pass # Non-fatal if caching fails + return names, matrix def find_clusters(self, threshold: float = 0.85) -> list[list[str]]: diff --git a/src/ietf_analyzer/readiness.py b/src/ietf_analyzer/readiness.py index 4af7353..6d72af3 100644 --- a/src/ietf_analyzer/readiness.py +++ b/src/ietf_analyzer/readiness.py @@ -100,3 +100,136 @@ def compute_readiness(db, draft_name: str) -> dict: f["contribution"] = round(f["value"] * f["weight"] * 100, 1) return {"score": score, "factors": factors} + + +def compute_readiness_batch(db, draft_names: list[str]) -> dict[str, dict]: + """Batch-compute readiness for multiple drafts using bulk queries. 
+ + Returns {draft_name: {score, factors}} — same format as compute_readiness. + Reduces ~6 queries per draft to ~6 queries total. + """ + if not draft_names: + return {} + + # Batch-load drafts + drafts_map = db.get_drafts_by_names(draft_names) + + # Batch-load ref counts per draft + ref_counts: dict[str, int] = {} + rows = db.conn.execute( + "SELECT draft_name, COUNT(*) as cnt FROM draft_refs GROUP BY draft_name" + ).fetchall() + for r in rows: + ref_counts[r["draft_name"]] = r["cnt"] + + # Max refs across corpus (single query) + max_refs_row = db.conn.execute( + "SELECT MAX(cnt) FROM (SELECT COUNT(*) as cnt FROM draft_refs GROUP BY draft_name)" + ).fetchone() + max_refs = (max_refs_row[0] or 1) if max_refs_row else 1 + + # Batch-load cited-by counts + cited_by_counts: dict[str, int] = {} + rows = db.conn.execute( + "SELECT ref_id, COUNT(DISTINCT draft_name) as cnt FROM draft_refs " + "WHERE ref_type = 'draft' GROUP BY ref_id" + ).fetchall() + for r in rows: + cited_by_counts[r["ref_id"]] = r["cnt"] + + # Batch-load author experience: person_id -> draft count + author_draft_counts: dict[int, int] = {} + rows = db.conn.execute( + "SELECT person_id, COUNT(*) as cnt FROM draft_authors GROUP BY person_id" + ).fetchall() + for r in rows: + author_draft_counts[r["person_id"]] = r["cnt"] + + # Batch-load draft->author mappings + draft_authors: dict[str, list[int]] = {} + rows = db.conn.execute( + "SELECT draft_name, person_id FROM draft_authors" + ).fetchall() + for r in rows: + draft_authors.setdefault(r["draft_name"], []).append(r["person_id"]) + + # Batch-load ratings (momentum) + ratings_map: dict[str, float] = {} + rows = db.conn.execute( + "SELECT draft_name, momentum FROM ratings" + ).fetchall() + for r in rows: + ratings_map[r["draft_name"]] = r["momentum"] + + # Now compute readiness for each draft using pre-loaded data + results = {} + for name in draft_names: + draft = drafts_map.get(name) + if not draft: + results[name] = {"score": 0, "factors": {}} + continue + + factors = {} + + # 1. WG Adopted + wg_val = 1.0 if name.startswith("draft-ietf-") else 0.0 + factors["wg_adopted"] = {"value": wg_val, "weight": 0.25, + "label": "WG Adopted", + "detail": "draft-ietf-*" if wg_val else "individual"} + + # 2. Revision Maturity + try: + rev_num = int(draft.rev) if draft.rev else 0 + except (ValueError, TypeError): + rev_num = 0 + rev_val = min(rev_num / 5.0, 1.0) + factors["revision_maturity"] = {"value": round(rev_val, 3), "weight": 0.15, + "label": "Revision Maturity", + "detail": f"rev {rev_num}"} + + # 3. Reference Density + ref_count = ref_counts.get(name, 0) + ref_val = min(ref_count / max_refs, 1.0) + factors["reference_density"] = {"value": round(ref_val, 3), "weight": 0.15, + "label": "Reference Density", + "detail": f"{ref_count} refs (max {max_refs})"} + + # 4. Cited By Count + cited_by = cited_by_counts.get(name, 0) + cited_val = min(cited_by / 5.0, 1.0) + factors["cited_by_count"] = {"value": round(cited_val, 3), "weight": 0.15, + "label": "Cited By Others", + "detail": f"{cited_by} draft(s)"} + + # 5. Author Experience + person_ids = draft_authors.get(name, []) + if person_ids: + counts = [author_draft_counts.get(pid, 1) for pid in person_ids] + avg_exp = sum(counts) / len(counts) + exp_val = min(avg_exp / 5.0, 1.0) + else: + exp_val = 0.0 + avg_exp = 0 + factors["author_experience"] = {"value": round(exp_val, 3), "weight": 0.15, + "label": "Author Experience", + "detail": f"avg {avg_exp:.1f} drafts/author"} + + # 6. 
Momentum Rating + momentum = ratings_map.get(name) + if momentum is not None: + mom_val = (momentum - 1) / 4.0 + else: + mom_val = 0.0 + factors["momentum_rating"] = {"value": round(mom_val, 3), "weight": 0.15, + "label": "Momentum", + "detail": f"{momentum}/5" if momentum else "unrated"} + + # Compute weighted score + total = sum(f["value"] * f["weight"] for f in factors.values()) + score = round(total * 100, 1) + for f in factors.values(): + f["contribution"] = round(f["value"] * f["weight"] * 100, 1) + + results[name] = {"score": score, "factors": factors} + + return results diff --git a/src/ietf_analyzer/search.py b/src/ietf_analyzer/search.py index 24afb57..88faf53 100644 --- a/src/ietf_analyzer/search.py +++ b/src/ietf_analyzer/search.py @@ -4,6 +4,7 @@ from __future__ import annotations import hashlib import re +import time from collections import defaultdict import numpy as np @@ -50,6 +51,9 @@ class HybridSearch: self.db = db self._embedder = embedder self._ollama_available: bool | None = None + self._embeddings_cache: dict[str, np.ndarray] | None = None + self._embeddings_cache_time: float = 0 + self._EMBEDDINGS_TTL: float = 300 # 5 minutes @property def embedder(self): @@ -79,6 +83,16 @@ class HybridSearch: self._ollama_available = False return self._ollama_available + def _get_all_embeddings(self) -> dict[str, np.ndarray]: + """Return all embeddings, cached with TTL to avoid reloading on every query.""" + now = time.monotonic() + if (self._embeddings_cache is not None + and now - self._embeddings_cache_time < self._EMBEDDINGS_TTL): + return self._embeddings_cache + self._embeddings_cache = self.db.all_embeddings() + self._embeddings_cache_time = now + return self._embeddings_cache + def search(self, query: str, top_k: int = 10) -> list[dict]: """Combine FTS5 keyword search + embedding similarity search. @@ -144,7 +158,7 @@ class HybridSearch: self._ollama_available = False return [] - all_embeddings = self.db.all_embeddings() + all_embeddings = self._get_all_embeddings() if not all_embeddings: return [] diff --git a/src/webui/data.py b/src/webui/data.py index 6482a2b..c0cf915 100644 --- a/src/webui/data.py +++ b/src/webui/data.py @@ -7,11 +7,176 @@ ready for JSON serialization or Jinja2 template rendering. 
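An aside on the `HybridSearch` change above, before the `webui/data.py` hunk: `_get_all_embeddings` is memoize-with-expiry in its simplest form, keeping the last loaded value plus a `time.monotonic()` stamp and reloading once the stamp ages past the TTL. A generic sketch of the same shape (illustration only, not project code; the `db` in the usage comment is hypothetical):

```python
import time
from typing import Callable

def ttl_cached(loader: Callable[[], dict], ttl: float = 300.0) -> Callable[[], dict]:
    """Illustration only: memoize a zero-argument loader for `ttl` seconds."""
    cached: dict | None = None
    loaded_at = 0.0

    def get() -> dict:
        nonlocal cached, loaded_at
        now = time.monotonic()  # monotonic clock, immune to wall-clock adjustments
        if cached is None or now - loaded_at >= ttl:
            cached = loader()
            loaded_at = now
        return cached

    return get

# e.g. get_embeddings = ttl_cached(db.all_embeddings)
```

The cost is bounded staleness: with a 300-second TTL, drafts embedded after the last load can be invisible to semantic search for up to five minutes, which is the trade-off this diff accepts in exchange for skipping a full `all_embeddings()` read on every query.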
from __future__ import annotations import json +import re import sys import time from collections import Counter, defaultdict from functools import lru_cache from pathlib import Path +from typing import TypedDict + +import numpy as np +from sklearn.cluster import AgglomerativeClustering +from sklearn.manifold import TSNE +from sklearn.preprocessing import normalize as sk_normalize + + +# --------------------------------------------------------------------------- +# TypedDicts for common return shapes +# --------------------------------------------------------------------------- + +class OverviewStats(TypedDict): + """High-level dashboard statistics from :func:`get_overview_stats`.""" + total_drafts: int + rated_count: int + author_count: int + idea_count: int + gap_count: int + input_tokens: int + output_tokens: int + false_positive_count: int + + +class DraftListItem(TypedDict): + """Single draft in the paginated listing from :func:`get_drafts_page`.""" + name: str + title: str + date: str | None + url: str + pages: int + group: str + source: str + score: float + novelty: float + maturity: float + overlap: float + momentum: float + relevance: float + categories: list[str] + summary: str + readiness: float + + +class DraftsPage(TypedDict): + """Paginated draft listing from :func:`get_drafts_page`.""" + drafts: list[DraftListItem] + total: int + page: int + per_page: int + pages: int + + +class AuthorInfo(TypedDict): + """Author entry from :func:`get_top_authors`.""" + name: str + affiliation: str + draft_count: int + drafts: list[str] + + +class AuthorNetworkNode(TypedDict): + """Node in the author network graph.""" + id: str + name: str + org: str + draft_count: int + avg_score: float + drafts: list[str] + + +class AuthorNetworkEdge(TypedDict): + """Edge in the author network graph.""" + source: str + target: str + weight: int + + +class AuthorCluster(TypedDict): + """Cluster in the author network.""" + id: int + members: list[str] + org_mix: dict[str, int] + size: int + drafts: list[dict[str, str]] + draft_count: int + + +class AuthorNetwork(TypedDict): + """Full author network from :func:`get_author_network_full`.""" + nodes: list[AuthorNetworkNode] + edges: list[AuthorNetworkEdge] + clusters: list[AuthorCluster] + + +class SimilarityGraphStats(TypedDict): + """Stats sub-dict in similarity graph.""" + node_count: int + edge_count: int + avg_similarity: float + + +class SimilarityGraph(TypedDict): + """Draft similarity network from :func:`get_similarity_graph`.""" + nodes: list[dict] + edges: list[dict] + stats: SimilarityGraphStats + + +class TimelineData(TypedDict): + """Monthly category counts from :func:`get_timeline_data`.""" + months: list[str] + series: dict[str, list[int]] + categories: list[str] + + +class MonitorCost(TypedDict): + """Cost sub-dict in monitor status.""" + input_tokens: int + output_tokens: int + estimated_usd: float + + +class MonitorPipeline(TypedDict): + """Pipeline sub-dict in monitor status.""" + total_drafts: int + rated: int + embedded: int + with_ideas: int + idea_total: int + gap_count: int + + +class MonitorStatus(TypedDict): + """Monitor status from :func:`get_monitor_status`.""" + last_run: dict | None + runs: list[dict] + unprocessed: dict[str, int] + total_runs: int + pipeline: MonitorPipeline + cost: MonitorCost + + +class SearchResults(TypedDict): + """Global search results from :func:`global_search`.""" + drafts: list[dict] + ideas: list[dict] + authors: list[dict] + gaps: list[dict] + + +class CitationGraphStats(TypedDict): + """Stats sub-dict 
in citation graph.""" + node_count: int + edge_count: int + rfc_count: int + draft_count: int + + +class CitationGraph(TypedDict): + """Citation network from :func:`get_citation_graph`.""" + nodes: list[dict] + edges: list[dict] + stats: CitationGraphStats # Add project root to path so we can import ietf_analyzer _project_root = Path(__file__).resolve().parent.parent.parent @@ -20,6 +185,8 @@ if str(_project_root) not in sys.path: from ietf_analyzer.config import Config from ietf_analyzer.db import Database +from ietf_analyzer.readiness import compute_readiness, compute_readiness_batch +from ietf_analyzer.search import HybridSearch def _extract_month(time_str: str | None) -> str: """Normalize a date string to YYYY-MM format.""" @@ -55,7 +222,7 @@ def get_db() -> Database: return Database(config) -def get_overview_stats(db: Database) -> dict: +def get_overview_stats(db: Database) -> OverviewStats: """Return high-level stats for the dashboard home page. Excludes drafts flagged as false positives from rated counts. @@ -204,7 +371,7 @@ def get_drafts_page( sort: str = "score", sort_dir: str = "desc", source: str = "", -) -> dict: +) -> DraftsPage: """Return a paginated, filtered list of drafts with ratings. Returns dict with keys: drafts, total, page, per_page, pages. @@ -262,11 +429,9 @@ def get_drafts_page( start = (page - 1) * per_page page_items = filtered[start : start + per_page] - # Pre-compute readiness for page items (lightweight version) - from ietf_analyzer.readiness import compute_readiness - readiness_cache = {} - for draft, rating in page_items: - readiness_cache[draft.name] = compute_readiness(db, draft.name) + # Pre-compute readiness in batch (~6 queries total instead of ~200) + + readiness_cache = compute_readiness_batch(db, [d.name for d, _ in page_items]) drafts = [] for draft, rating in page_items: @@ -350,7 +515,7 @@ def get_draft_detail(db: Database, name: str) -> dict | None: } # Readiness score - from ietf_analyzer.readiness import compute_readiness + result["readiness"] = compute_readiness(db, name) # Annotation @@ -387,7 +552,7 @@ def get_rating_distributions(db: Database) -> dict: return dims -def get_timeline_data(db: Database) -> dict: +def get_timeline_data(db: Database) -> TimelineData: """Return monthly counts by category for timeline chart.""" pairs = db.drafts_with_ratings(limit=1000) all_drafts = db.list_drafts(limit=1000, order_by="time ASC") @@ -482,7 +647,7 @@ def read_generated_draft(filename: str) -> str | None: return path.read_text(errors="replace") -def get_top_authors(db: Database, limit: int = 30) -> list[dict]: +def get_top_authors(db: Database, limit: int = 30) -> list[AuthorInfo]: """Return top authors by draft count.""" rows = db.top_authors(limit=limit) return [ @@ -561,19 +726,19 @@ def get_coauthor_network(db: Database, min_shared: int = 1) -> dict: return {"nodes": nodes, "edges": edges} -def get_similarity_graph(db: Database, threshold: float = 0.75) -> dict: +def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: """Return draft similarity network (cached).""" return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold)) -def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> dict: +def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: """Return draft similarity network for force-directed graph. 
Returns {nodes: [{name, title, category, score}], edges: [{source, target, similarity}], stats: {node_count, edge_count, avg_similarity}} """ - import numpy as np + embeddings = db.all_embeddings() if len(embeddings) < 2: @@ -639,12 +804,12 @@ def get_cross_org_data(db: Database, limit: int = 20) -> list[dict]: ] -def get_author_network_full(db: Database) -> dict: +def get_author_network_full(db: Database) -> AuthorNetwork: """Return author network (cached for 5 min).""" return _cached("author_network", lambda: _compute_author_network_full(db)) -def _compute_author_network_full(db: Database) -> dict: +def _compute_author_network_full(db: Database) -> AuthorNetwork: """Return enriched co-authorship network with avg scores and cluster info. Returns { @@ -704,6 +869,12 @@ def _compute_author_network_full(db: Database) -> dict: visited: set[str] = set() clusters = [] + # Batch-load all drafts referenced by authors (avoid N+1 in cluster loop) + _all_dn = set() + for _ai in author_info.values(): + _all_dn.update(_ai.get("drafts", [])) + _all_drafts_map = db.get_drafts_by_names(list(_all_dn)) + for node in sorted(node_set): if node in visited: continue @@ -728,7 +899,7 @@ def _compute_author_network_full(db: Database) -> dict: org_mix[org] += 1 for dn in author_info.get(m, {}).get("drafts", []): if dn not in cluster_drafts: - d = db.get_draft(dn) + d = _all_drafts_map.get(dn) cluster_drafts[dn] = d.title[:80] if d else dn clusters.append({ "id": len(clusters), @@ -756,9 +927,7 @@ def _compute_idea_clusters(db: Database) -> dict: a target of ~30 clusters for readable groupings. Enriches each cluster with WG info and category breakdown. """ - import json as _json - import numpy as np - from sklearn.preprocessing import normalize as sk_normalize + embeddings = db.all_idea_embeddings() if not embeddings: @@ -777,8 +946,8 @@ def _compute_idea_clusters(db: Database) -> dict: draft_cats: dict[str, list[str]] = {} for r in rating_rows: try: - draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else [] - except (_json.JSONDecodeError, TypeError): + draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else [] + except (json.JSONDecodeError, TypeError): draft_cats[r["draft_name"]] = [] # Build matrix from embeddings that have matching ideas @@ -792,7 +961,6 @@ def _compute_idea_clusters(db: Database) -> dict: # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size n_target = max(10, min(40, len(idea_ids) // 12)) try: - from sklearn.cluster import AgglomerativeClustering clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward') labels = clustering.fit_predict(matrix_norm) except Exception: @@ -877,7 +1045,6 @@ def _compute_idea_clusters(db: Database) -> dict: # t-SNE for scatter scatter = [] try: - from sklearn.manifold import TSNE perp = min(30, len(idea_ids) - 1) tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500) coords = tsne.fit_transform(matrix_norm) @@ -917,7 +1084,7 @@ def _compute_timeline_animation_data(db: Database) -> dict: animation frames. Each point carries a ``month`` field (YYYY-MM) so the front-end can build cumulative animation frames. 
""" - import numpy as np + embeddings = db.all_embeddings() if len(embeddings) < 5: @@ -935,7 +1102,6 @@ def _compute_timeline_animation_data(db: Database) -> dict: matrix = np.array([embeddings[n] for n in names]) try: - from sklearn.manifold import TSNE tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), random_state=42, max_iter=500) coords = tsne.fit_transform(matrix) @@ -975,7 +1141,7 @@ def _compute_timeline_animation_data(db: Database) -> dict: } -def get_monitor_status(db: Database) -> dict: +def get_monitor_status(db: Database) -> MonitorStatus: """Return monitoring status data for dashboard.""" runs = db.get_monitor_runs(limit=20) last = runs[0] if runs else None @@ -1014,12 +1180,12 @@ def get_monitor_status(db: Database) -> dict: } -def get_citation_graph(db: Database, min_refs: int = 2) -> dict: +def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: """Return citation graph (cached for 5 min).""" return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs)) -def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict: +def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: """Return citation network data for force-directed graph. Returns {nodes: [{id, type, title, influence, ...}], @@ -1131,7 +1297,7 @@ def _compute_citation_graph(db: Database, min_refs: int = 2) -> dict: } -def global_search(db: Database, query: str) -> dict: +def global_search(db: Database, query: str) -> SearchResults: """Search across drafts (FTS5), ideas, authors, and gaps. Returns {drafts: [...], ideas: [...], authors: [...], gaps: [...]}. @@ -1144,7 +1310,6 @@ def global_search(db: Database, query: str) -> dict: # 1. Drafts via FTS5 try: - import re fts_query = re.sub(r'[^\w\s]', '', q) fts_query = re.sub(r'\b(NEAR|OR|AND|NOT)\b', '', fts_query, flags=re.IGNORECASE) fts_query = re.sub(r'\s+', ' ', fts_query).strip() @@ -1242,7 +1407,7 @@ def get_landscape_tsne(db: Database) -> list[dict]: def _compute_landscape_tsne(db: Database) -> list[dict]: """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" - import numpy as np + embeddings = db.all_embeddings() if len(embeddings) < 5: @@ -1260,7 +1425,6 @@ def _compute_landscape_tsne(db: Database) -> list[dict]: matrix = np.array([embeddings[n] for n in names]) try: - from sklearn.manifold import TSNE tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), random_state=42, max_iter=500) coords = tsne.fit_transform(matrix) @@ -1295,7 +1459,7 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None: comparison_text: str | None, } """ - import numpy as np + drafts_data = [] all_ideas: dict[str, list[dict]] = {} @@ -1384,9 +1548,6 @@ def get_comparison_data(db: Database, names: list[str]) -> dict | None: def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: """Search-only (free) — returns sources + cached answer if available.""" - from ietf_analyzer.config import Config - from ietf_analyzer.search import HybridSearch - config = Config.load() searcher = HybridSearch(config, db) return searcher.search_only(question, top_k=top_k) @@ -1394,9 +1555,6 @@ def get_ask_search(db: Database, question: str, top_k: int = 5) -> dict: def get_ask_synthesize(db: Database, question: str, top_k: int = 5, cheap: bool = True) -> dict: """Run Claude synthesis (costs tokens, result is cached permanently).""" - from ietf_analyzer.config import Config - from ietf_analyzer.search import HybridSearch - config = Config.load() 
    searcher = HybridSearch(config, db)
    return searcher.ask(question, top_k=top_k, cheap=cheap)
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
new file mode 100644
index 0000000..80aa87c
--- /dev/null
+++ b/tests/test_analyzer.py
@@ -0,0 +1,164 @@
+"""Tests for pure functions in ietf_analyzer.analyzer (no API calls)."""
+
+from __future__ import annotations
+
+import json
+
+from ietf_analyzer.analyzer import Analyzer
+from ietf_analyzer.models import Rating
+
+
+# ---- _extract_json ----
+
+
+class TestExtractJson:
+    """Test that _extract_json strips markdown fences from model output."""
+
+    @staticmethod
+    def _extract(text: str) -> str:
+        # _extract_json is an instance method that never touches self, so we
+        # pass None as the receiver rather than construct a full Analyzer (needs API key).
+        return Analyzer._extract_json(None, text)
+
+    def test_plain_json(self):
+        raw = '{"key": "value"}'
+        assert self._extract(raw) == '{"key": "value"}'
+
+    def test_json_with_fences(self):
+        raw = '```json\n{"key": "value"}\n```'
+        assert self._extract(raw) == '{"key": "value"}'
+
+    def test_json_with_plain_fences(self):
+        raw = '```\n{"key": "value"}\n```'
+        assert self._extract(raw) == '{"key": "value"}'
+
+    def test_json_with_whitespace(self):
+        raw = ' \n {"key": "value"} \n '
+        assert self._extract(raw) == '{"key": "value"}'
+
+    def test_json_array_with_fences(self):
+        raw = '```json\n[{"a": 1}, {"b": 2}]\n```'
+        result = self._extract(raw)
+        assert json.loads(result) == [{"a": 1}, {"b": 2}]
+
+    def test_multiline_json_with_fences(self):
+        raw = '```json\n{\n "key": "value",\n "num": 42\n}\n```'
+        result = self._extract(raw)
+        parsed = json.loads(result)
+        assert parsed == {"key": "value", "num": 42}
+
+    def test_no_fences_passthrough(self):
+        raw = '[1, 2, 3]'
+        assert self._extract(raw) == '[1, 2, 3]'
+
+    def test_empty_string(self):
+        assert self._extract('') == ''
+
+    def test_fences_with_trailing_whitespace(self):
+        raw = '```json\n{"ok": true}\n``` \n'
+        result = self._extract(raw)
+        assert json.loads(result) == {"ok": True}
+
+
+# ---- _clamp_rating ----
+
+
+class TestClampRating:
+    def test_normal_values(self):
+        assert Analyzer._clamp_rating(3) == 3
+        assert Analyzer._clamp_rating(1) == 1
+        assert Analyzer._clamp_rating(5) == 5
+
+    def test_clamp_high(self):
+        assert Analyzer._clamp_rating(10) == 5
+        assert Analyzer._clamp_rating(99) == 5
+
+    def test_clamp_low(self):
+        assert Analyzer._clamp_rating(0) == 1
+        assert Analyzer._clamp_rating(-5) == 1
+
+    def test_float_truncated(self):
+        assert Analyzer._clamp_rating(3.7) == 3
+        assert Analyzer._clamp_rating(4.9) == 4
+
+    def test_string_number(self):
+        assert Analyzer._clamp_rating("4") == 4
+        assert Analyzer._clamp_rating("1") == 1
+
+    def test_invalid_returns_default(self):
+        assert Analyzer._clamp_rating("abc") == 3
+        assert Analyzer._clamp_rating(None) == 3
+        assert Analyzer._clamp_rating([]) == 3
+
+    def test_custom_default(self):
+        assert Analyzer._clamp_rating("abc", default=2) == 2
+
+    def test_custom_range(self):
+        assert Analyzer._clamp_rating(8, lo=1, hi=10) == 8
+        assert Analyzer._clamp_rating(15, lo=1, hi=10) == 10
+
+
+# ---- _parse_rating ----
+
+
+class TestParseRating:
+    """Test _parse_rating with compact and verbose key formats."""
+
+    @staticmethod
+    def _parse(draft_name: str, data: dict) -> Rating:
+        # _parse_rating calls self._clamp_rating, so we need a real instance;
+        # object.__new__ gives an Analyzer without running __init__ (no API key needed).
+        stub = object.__new__(Analyzer)
+        return stub._parse_rating(draft_name, data)
+
+    def test_compact_keys(self):
+        data = {
+            "s": "A summary",
+            "n": 4, "nn": "novel approach",
+            "m": 3, "mn": "early stage",
+            "o": 2, "on": "minor overlap",
+            "mo": 5, "mon": "strong momentum",
+            "r": 4, "rn": "relevant",
+            "c": ["A2A protocols"],
+        }
+        rating = self._parse("draft-test", data)
+        assert rating.draft_name == "draft-test"
+        assert rating.novelty == 4
+        assert rating.maturity == 3
+        assert rating.overlap == 2
+        assert rating.momentum == 5
+        assert rating.relevance == 4
+        assert rating.summary == "A summary"
+        assert rating.categories == ["A2A protocols"]
+
+    def test_verbose_keys(self):
+        data = {
+            "summary": "A summary",
+            "novelty": 3, "novelty_note": "ok",
+            "maturity": 2, "maturity_note": "early",
+            "overlap": 1, "overlap_note": "unique",
+            "momentum": 4, "momentum_note": "active",
+            "relevance": 5, "relevance_note": "core",
+            "categories": ["AI safety/alignment"],
+        }
+        rating = self._parse("draft-test-2", data)
+        assert rating.novelty == 3
+        assert rating.relevance == 5
+        assert rating.categories == ["AI safety/alignment"]
+
+    def test_missing_keys_use_defaults(self):
+        data = {}
+        rating = self._parse("draft-empty", data)
+        assert rating.novelty == 3  # default
+        assert rating.maturity == 3
+        assert rating.summary == ""
+        assert rating.categories == []
+
+    def test_out_of_range_clamped(self):
+        data = {"n": 99, "m": -1, "o": 0, "mo": 10, "r": 6}
+        rating = self._parse("draft-clamp", data)
+        assert rating.novelty == 5
+        assert rating.maturity == 1
+        assert rating.overlap == 1
+        assert rating.momentum == 5
+        assert rating.relevance == 5
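Between the two test files, it is worth restating what `TestClampRating` pins down, since `_clamp_rating` itself is not part of this diff: coerce with `int()`, fall back to a default on failure, clamp into `[lo, hi]`. A sketch consistent with the tests above, shown as a free function (on `Analyzer` it is called as a static method, and the real implementation may differ):

```python
def _clamp_rating(value, default: int = 3, lo: int = 1, hi: int = 5) -> int:
    """Sketch only: coerce a model-reported rating into the lo..hi range."""
    try:
        n = int(value)  # truncates floats (3.7 -> 3), parses numeric strings ("4" -> 4)
    except (TypeError, ValueError):
        return default  # "abc" (ValueError), None and [] (TypeError) all fall back
    return max(lo, min(hi, n))
```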
diff --git a/tests/test_search.py b/tests/test_search.py
new file mode 100644
index 0000000..14d4203
--- /dev/null
+++ b/tests/test_search.py
@@ -0,0 +1,110 @@
+"""Tests for ietf_analyzer.search — sanitize_fts_query."""
+
+from __future__ import annotations
+
+from ietf_analyzer.search import HybridSearch
+
+
+class TestSanitizeFtsQuery:
+    """Test FTS5 query sanitization against injection and edge cases."""
+
+    def test_plain_query(self):
+        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"
+
+    def test_strips_quotes(self):
+        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
+        assert '"' not in result
+        assert "agent" in result
+
+    def test_strips_parentheses(self):
+        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
+        assert "(" not in result
+        assert ")" not in result
+
+    def test_strips_asterisk(self):
+        result = HybridSearch.sanitize_fts_query("agent*")
+        assert "*" not in result
+        assert "agent" in result
+
+    def test_removes_boolean_OR(self):
+        result = HybridSearch.sanitize_fts_query("agent OR protocol")
+        assert "OR" not in result
+        assert "agent" in result
+        assert "protocol" in result
+
+    def test_removes_boolean_AND(self):
+        result = HybridSearch.sanitize_fts_query("agent AND protocol")
+        assert "AND" not in result
+
+    def test_removes_boolean_NOT(self):
+        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
+        assert "NOT" not in result
+        assert "malicious" in result
+
+    def test_removes_NEAR(self):
+        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
+        assert "NEAR" not in result
+
+    def test_case_insensitive_operators(self):
+        result = HybridSearch.sanitize_fts_query("agent or protocol")
+        assert " or " not in result
+        # "or" as a standalone word should be removed
+        words = result.split()
+        assert "or" not in [w.lower() for w in words]
+
+    def 
test_injection_attempt_column_filter(self): + """FTS5 column filter syntax should be stripped.""" + result = HybridSearch.sanitize_fts_query("title:agent") + # The colon is stripped, leaving just "titleagent" or "title agent" + assert ":" not in result + + def test_injection_attempt_special_chars(self): + result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --') + assert ";" not in result + assert '"' not in result + assert "--" not in result + + def test_empty_query(self): + assert HybridSearch.sanitize_fts_query("") == "" + + def test_only_operators(self): + result = HybridSearch.sanitize_fts_query("OR AND NOT") + assert result.strip() == "" + + def test_only_special_chars(self): + result = HybridSearch.sanitize_fts_query('"*(){}[]') + assert result.strip() == "" + + def test_collapses_whitespace(self): + result = HybridSearch.sanitize_fts_query("agent protocol test") + assert result == "agent protocol test" + + def test_preserves_numbers(self): + result = HybridSearch.sanitize_fts_query("rfc 8259") + assert result == "rfc 8259" + + def test_preserves_underscores(self): + result = HybridSearch.sanitize_fts_query("ai_agent_protocol") + assert result == "ai_agent_protocol" + + def test_unicode_preserved(self): + """Non-ASCII alphanumeric characters should be preserved.""" + result = HybridSearch.sanitize_fts_query("müller agent") + assert "müller" in result or "mller" in result # depends on \w locale + + def test_mixed_injection(self): + """Complex injection attempt with multiple vectors.""" + result = HybridSearch.sanitize_fts_query( + '(agent* NEAR/5 "protocol") OR title:exploit NOT safe' + ) + # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer + # a standalone NEAR operator — it's just a harmless token. + assert "OR" not in result.split() + assert "NOT" not in result.split() + assert "*" not in result + assert '"' not in result + assert "(" not in result + assert ":" not in result + # Core words should survive + assert "agent" in result + assert "protocol" in result
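`sanitize_fts_query` itself is not touched by this diff (only its tests are new), but `global_search` in `src/webui/data.py` above applies the same three regex passes inline: strip punctuation, remove bare boolean operators, collapse whitespace. A minimal free-function sketch consistent with both, assuming the real staticmethod behaves the same way:

```python
import re

def sanitize_fts_query(query: str) -> str:
    """Sketch only: neutralize FTS5 query syntax in user-supplied input."""
    # 1. Drop everything except word characters and whitespace; this removes
    #    quotes, parens, *, :, ; and turns NEAR/5 into the harmless token NEAR5.
    cleaned = re.sub(r"[^\w\s]", "", query)
    # 2. Remove standalone boolean/proximity operators, case-insensitively.
    cleaned = re.sub(r"\b(NEAR|OR|AND|NOT)\b", "", cleaned, flags=re.IGNORECASE)
    # 3. Collapse the runs of whitespace left behind by the removals.
    return re.sub(r"\s+", " ", cleaned).strip()
```

Order matters here: stripping punctuation first is what turns `NEAR/5` into `NEAR5`, which the operator pass then leaves alone, exactly the behavior `test_mixed_injection` documents.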
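Looking back at `compute_readiness_batch` in `src/ietf_analyzer/readiness.py` earlier in this diff: the weights sum to 1.0 (0.25 for WG adoption plus five factors at 0.15 each), so the weighted total maps directly onto a 0-100 score. A worked example with hypothetical factor values, following the same formulas as the batch function:

```python
# Hypothetical values for an imaginary WG-adopted draft at revision 3,
# cited by 2 other drafts, with experienced authors and momentum 5/5.
factors = {
    "wg_adopted":        (1.0, 0.25),  # name starts with "draft-ietf-"
    "revision_maturity": (0.6, 0.15),  # min(3 / 5.0, 1.0)
    "reference_density": (0.2, 0.15),  # ref_count / max_refs (hypothetical ratio)
    "cited_by_count":    (0.4, 0.15),  # min(2 / 5.0, 1.0)
    "author_experience": (1.0, 0.15),  # min(avg_drafts_per_author / 5.0, 1.0), capped
    "momentum_rating":   (1.0, 0.15),  # (5 - 1) / 4.0
}
score = round(sum(v * w for v, w in factors.values()) * 100, 1)
assert score == 73.0  # 25 + 9 + 3 + 6 + 15 + 15
```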