"""Claude-based analysis — summarization, rating, categorization, overlap detection.""" from __future__ import annotations import hashlib import json from datetime import datetime, timezone from pathlib import Path from dotenv import load_dotenv # Load .env from project root (two levels up from this file, or cwd) load_dotenv(Path(__file__).resolve().parent.parent.parent / ".env") load_dotenv() # Also check cwd import anthropic from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn from .config import Config from .db import Database from .models import Draft, Rating console = Console() CATEGORIES_SHORT = [ "A2A protocols", # Agent-to-agent communication protocols "AI safety/alignment", # AI safety / guardrails / alignment "ML traffic mgmt", # ML-based traffic management / optimization "Autonomous netops", # Autonomous network operations "Agent identity/auth", # Identity / authentication for AI agents "Data formats/interop",# Data formats / semantics for AI interop "Policy/governance", # Policy / governance / ethical frameworks "Model serving/inference", # AI model serving / inference protocols "Agent discovery/reg", # Agent discovery / registration "Human-agent interaction", "Other AI/agent", ] # ============================================================================ # METHODOLOGY NOTE — LLM-as-Judge Rating Approach # # Limitations of this rating system (see also data/reports/methodology.md): # # 1. ABSTRACT-ONLY: Ratings are generated from the draft's abstract (truncated # to 2000 chars), not the full text. Maturity and overlap scores in # particular may be unreliable when the abstract omits key details. # # 2. NO HUMAN CALIBRATION: No inter-rater reliability study has been performed. # Claude is the sole judge; scores have not been validated against human # expert ratings. Even a small calibration set (20-30 drafts) would # substantially strengthen confidence in the ratings. # # 3. NO INTRA-RATER CONSISTENCY CHECK: The same draft is never re-rated to # measure Claude's self-consistency. Prompt-hash caching means re-runs # return cached results, so actual consistency is untested. # # 4. OVERLAP SCORE LIMITATION: The overlap dimension asks Claude whether a # draft overlaps with other known work, but Claude rates each draft # independently — it does not have access to the full corpus during rating. # The overlap score reflects Claude's general knowledge, not corpus-specific # similarity. Use embedding-based similarity for corpus-level overlap. # # 5. BATCH EFFECTS: Batch rating (BATCH_PROMPT) processes multiple drafts # together. Position effects and comparison effects are uncontrolled. # Abstracts are also truncated more aggressively (1500 chars vs 2000). # # 6. RELEVANCE INFLATION: The relevance distribution is right-skewed because # keyword-matched drafts tend to score high on relevance by construction. # The corpus likely contains 30-50 false positives from ambiguous keywords # like "agent" (user agent), "autonomous" (autonomous systems), and # "intelligent" (intelligent networking). # # INTERPRETATION: Scores should be treated as RELATIVE RANKINGS within this # corpus, not as absolute quality measures. A score of 4.0 means "above # average for this corpus," not "objectively high quality." # ============================================================================ # Compact prompt — abstract only, saves ~10x tokens vs full-text RATE_PROMPT_COMPACT = """\ Rate this {doc_type}. JSON only. 
{name} | {title} | {time} | {pages}pg Abstract: {abstract} Return JSON: {{"s":"2-3 sentence summary","n":<1-5>,"nn":"novelty note","m":<1-5>,"mn":"maturity note","o":<1-5>,"on":"overlap note","mo":<1-5>,"mon":"momentum note","r":<1-5>,"rn":"relevance note","c":["categories"]}} Rating scale (use the FULL range 1-5, avoid clustering at 3-4): - Novelty: 1=trivial/obvious extension, 2=incremental, 3=useful contribution, 4=notable originality, 5=genuinely novel approach - Maturity: 1=problem statement only, 2=early sketch, 3=defined protocol/mechanism, 4=detailed spec with examples, 5=implementation-ready with test vectors - Overlap: 1=unique approach, 2=minor similarities, 3=shares concepts with 1-2 drafts, 4=significant overlap, 5=near-duplicate of existing work - Momentum: 1=inactive/abandoned, 2=single revision, 3=active development, 4=WG interest/adoption, 5=strong community momentum - Relevance: 1=not about AI/agents (false positive), 2=tangentially related, 3=partially relevant, 4=directly relevant, 5=core AI agent topic Categories: {categories} JSON only, no fences.""" # Batch prompt — rate multiple drafts in one call BATCH_PROMPT = """\ Rate each document below. Return a JSON array with one object per draft, in order. {drafts_block} Per-draft JSON: {{"name":"draft-name","s":"2-3 sentence summary","n":<1-5>,"nn":"novelty note","m":<1-5>,"mn":"maturity note","o":<1-5>,"on":"overlap with known drafts","mo":<1-5>,"mon":"momentum note","r":<1-5>,"rn":"relevance note","c":["categories"]}} Scale: 1=very low..5=very high. Overlap: 1=unique,5=heavy overlap. Categories: {categories} Return ONLY a JSON array, no fences.""" COMPARE_PROMPT = """\ Compare these documents — overlaps, unique ideas, complementary vs competing vs redundant. {drafts_section} Be specific about concrete mechanisms and design choices.""" EXTRACT_IDEAS_PROMPT = """\ Extract discrete technical ideas and mechanisms from this {doc_type}. Return a JSON array. Each element: {{"title":"short name","description":"1-2 sentences","type":"mechanism|protocol|pattern|requirement|architecture|extension"}} {name} | {title} | {pages}pg Abstract: {abstract} {text_excerpt} Return 1-4 ideas. Extract only TOP-LEVEL novel contributions. Do NOT list sub-features, optimizations, variants, or extensions as separate ideas. If a draft defines one protocol with multiple features, that is ONE idea, not several. Each idea must be independently novel — could it be its own draft? If not, merge it with the parent idea. Only include CONCRETE, NOVEL technical contributions — not restatements of the abstract or general goals. If the draft has no substantive technical ideas (e.g. it is a problem statement, administrative document, or off-topic), return an empty array []. JSON array only, no fences.""" BATCH_IDEAS_PROMPT = """\ Extract ideas from each document below. Return a JSON object mapping document name -> array of ideas. Per idea: {{"title":"short name","description":"1 sentence","type":"mechanism|protocol|pattern|requirement|architecture|extension"}} {drafts_block} 1-4 ideas per draft. Extract only TOP-LEVEL novel contributions. Do NOT list sub-features, optimizations, variants, or extensions as separate ideas. If a draft defines one protocol with multiple features, that is ONE idea, not several. Each idea must be independently novel — could it be its own draft? If not, merge it with the parent idea. Only include CONCRETE, NOVEL technical contributions. If a draft has no substantive ideas, map it to an empty array. 
Do not pad with restatements of the abstract. Return ONLY a JSON object like {{"draft-name":[...], ...}}, no fences.""" # ============================================================================ # GAP ANALYSIS METHODOLOGY NOTE # # This is a SINGLE-SHOT LLM analysis: Claude receives compressed statistics # about the landscape (category counts, top ideas, overlap summary) and # generates gaps in one pass. Limitations: # # 1. No systematic coverage analysis against a reference taxonomy. A rigorous # approach would compare the corpus against an explicit reference architecture # (e.g., NIST AI RMF, FIPA agent platform model, or a custom agent ecosystem # reference model) to identify gaps systematically rather than relying on # Claude's general knowledge. # # 2. The overlap_summary fed to the prompt is category-level only — it does not # tell Claude which specific technical areas overlap within categories. # # 3. Evidence quality varies: some gaps cite specific data ("only N drafts"), # others are based on Claude's inference about what is missing. # # 4. Gap severity is assigned by Claude in a single pass without defined # thresholds (what makes "critical" vs "high" is implicit). # # Strengthening options: ground against a reference architecture, run multiple # independent gap analyses and intersect results, have domain experts validate. # ============================================================================ GAP_ANALYSIS_PROMPT = """\ You are analyzing the landscape of {total} IETF Internet-Drafts related to AI agents and autonomous systems. ## Categories and Draft Counts {category_summary} ## Most Common Technical Ideas {top_ideas} ## Known Overlap Clusters (groups of highly similar drafts) {overlap_summary} Identify 8-15 GAPS — areas, problems, or technical challenges NOT adequately addressed by existing drafts. Return a JSON array: [{{"topic":"short topic name","description":"2-3 sentence description","category":"closest category or new","severity":"critical|high|medium|low","evidence":"what suggests this gap matters"}}] Focus on: 1. Problems mentioned but not solved 2. Missing infrastructure pieces 3. Security/privacy/safety issues not addressed 4. Interoperability gaps between competing proposals 5. Real-world deployment concerns ignored JSON array only, no fences.""" SCORE_NOVELTY_PROMPT = """\ Rate each idea's novelty/originality on a 1-5 scale. 1 = Generic building block anyone would include (e.g. "Agent Gateway", "Certificate Authority") 2 = Obvious extension of existing work, minimal originality 3 = Useful and relevant but expected given the problem space 4 = Interesting contribution with some original thinking 5 = Genuinely novel mechanism, protocol, or architectural insight Ideas to score: {ideas_block} Return ONLY a JSON object mapping idea ID to score, like {{"123": 3, "456": 1, ...}}. 
No fences, no explanation.""" def _prompt_hash(text: str) -> str: return hashlib.sha256(text.encode()).hexdigest()[:16] def _doc_type_label(source: str) -> str: """Return a human-readable document type based on source.""" labels = { "ietf": "IETF draft", "w3c": "W3C specification", } return labels.get(source, f"{source} document") class Analyzer: def __init__(self, config: Config | None = None, db: Database | None = None): self.config = config or Config.load() self.db = db or Database(self.config) try: self.client = anthropic.Anthropic() except Exception: console.print( "[red bold]No Anthropic API key found.[/]\n" "Set ANTHROPIC_API_KEY environment variable or run:\n" " export ANTHROPIC_API_KEY=sk-ant-..." ) raise SystemExit(1) @staticmethod def _clamp_rating(value, default: int = 3, lo: int = 1, hi: int = 5) -> int: """Clamp a rating value to [lo, hi] integers.""" try: return max(lo, min(hi, int(value))) except (ValueError, TypeError): return default def _parse_rating(self, draft_name: str, data: dict) -> Rating: """Parse a rating from compact JSON keys.""" return Rating( draft_name=draft_name, novelty=self._clamp_rating(data.get("n", data.get("novelty", 3))), maturity=self._clamp_rating(data.get("m", data.get("maturity", 3))), overlap=self._clamp_rating(data.get("o", data.get("overlap", 3))), momentum=self._clamp_rating(data.get("mo", data.get("momentum", 3))), relevance=self._clamp_rating(data.get("r", data.get("relevance", 3))), summary=data.get("s", data.get("summary", "")), novelty_note=data.get("nn", data.get("novelty_note", "")), maturity_note=data.get("mn", data.get("maturity_note", "")), overlap_note=data.get("on", data.get("overlap_note", "")), momentum_note=data.get("mon", data.get("momentum_note", "")), relevance_note=data.get("rn", data.get("relevance_note", "")), categories=data.get("c", data.get("categories", [])), rated_at=datetime.now(timezone.utc).isoformat(), ) def _call_claude(self, prompt: str, max_tokens: int = 512, cheap: bool = False) -> tuple[str, int, int]: """Call Claude and return (text, input_tokens, output_tokens). Args: cheap: If True, use claude_model_cheap (Haiku) for lower cost. 
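
        Example (illustrative sketch — callers in this module typically pair this
        with _extract_json and json.loads; the prompt text below is made up):

            text, in_tok, out_tok = self._call_claude(
                "Summarize this draft. JSON only.", max_tokens=256, cheap=True
            )
            data = json.loads(self._extract_json(text))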
""" model = self.config.claude_model_cheap if cheap else self.config.claude_model resp = self.client.messages.create( model=model, max_tokens=max_tokens, messages=[{"role": "user", "content": prompt}], ) text = resp.content[0].text.strip() return text, resp.usage.input_tokens, resp.usage.output_tokens def _extract_json(self, text: str) -> str: """Strip markdown fences if present.""" text = text.strip() if text.startswith("```"): text = text.split("\n", 1)[1] if text.rstrip().endswith("```"): text = text.rstrip()[:-3] return text.strip() def rate_draft(self, draft_name: str, use_cache: bool = True) -> Rating | None: """Analyze and rate a single draft.""" draft = self.db.get_draft(draft_name) if draft is None: console.print(f"[red]Draft not found: {draft_name}[/]") return None prompt = RATE_PROMPT_COMPACT.format( doc_type=_doc_type_label(draft.source), name=draft.name, title=draft.title, time=draft.date, pages=draft.pages or "?", abstract=draft.abstract[:2000], categories=", ".join(CATEGORIES_SHORT), ) phash = _prompt_hash(prompt) # Check cache if use_cache: cached = self.db.get_cached_response(draft_name, phash) if cached: try: data = json.loads(cached) rating = self._parse_rating(draft_name, data) self.db.upsert_rating(rating) draft.categories = rating.categories self.db.upsert_draft(draft) return rating except (json.JSONDecodeError, KeyError): pass # Re-analyze if cache is corrupt try: text, in_tok, out_tok = self._call_claude(prompt, max_tokens=512) text = self._extract_json(text) data = json.loads(text) # Cache the raw response self.db.cache_response( draft_name, phash, self.config.claude_model, prompt, text, in_tok, out_tok, ) except (json.JSONDecodeError, anthropic.APIError, IndexError, KeyError) as e: console.print(f"[red]Failed {draft_name}: {e}[/]") return None rating = self._parse_rating(draft_name, data) self.db.upsert_rating(rating) draft.categories = rating.categories self.db.upsert_draft(draft) return rating def rate_batch(self, drafts: list[Draft], batch_size: int = 5) -> int: """Rate multiple drafts in batched API calls to save tokens.""" count = 0 for i in range(0, len(drafts), batch_size): batch = drafts[i:i + batch_size] # Build batch prompt drafts_block = "" for d in batch: drafts_block += f"\n---\n{d.name} | {d.title} | {d.date} | {d.pages or '?'}pg\nAbstract: {d.abstract[:1500]}\n" prompt = BATCH_PROMPT.format( drafts_block=drafts_block, categories=", ".join(CATEGORIES_SHORT), ) phash = _prompt_hash(prompt) try: text, in_tok, out_tok = self._call_claude( prompt, max_tokens=400 * len(batch) ) text = self._extract_json(text) results = json.loads(text) if not isinstance(results, list): results = [results] for j, data in enumerate(results): draft_name = data.get("name", batch[j].name if j < len(batch) else None) if not draft_name: continue # Cache each result individually self.db.cache_response( draft_name, _prompt_hash(f"batch-{phash}-{draft_name}"), self.config.claude_model, f"batch[{i}]", json.dumps(data), in_tok // len(results), out_tok // len(results), ) rating = self._parse_rating(draft_name, data) self.db.upsert_rating(rating) draft = self.db.get_draft(draft_name) if draft: draft.categories = rating.categories self.db.upsert_draft(draft) count += 1 except (json.JSONDecodeError, anthropic.APIError) as e: console.print(f"[red]Batch {i//batch_size+1} failed: {e}[/]") # Fallback: rate individually for d in batch: r = self.rate_draft(d.name) if r: count += 1 return count def rate_all_unrated(self, limit: int = 300, batch_size: int = 5) -> int: """Rate all drafts that 
haven't been rated yet, using batching.""" unrated = self.db.unrated_drafts(limit=limit) if not unrated: console.print("All drafts already rated.") return 0 console.print(f"Rating [bold]{len(unrated)}[/] drafts in batches of {batch_size}...") count = 0 failures: list[tuple[str, str]] = [] with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), console=console, ) as progress: task = progress.add_task("Analyzing...", total=len(unrated)) for i in range(0, len(unrated), batch_size): batch = unrated[i:i + batch_size] names = ", ".join(d.name.split("-")[-1][:12] for d in batch) progress.update(task, description=f"Batch: {names}") try: n = self.rate_batch(batch, batch_size=batch_size) count += n except Exception as e: batch_names = [d.name for d in batch] for bn in batch_names: failures.append((bn, str(e))) console.print(f"[red]Batch failed: {e}[/]") progress.advance(task, advance=len(batch)) in_tok, out_tok = self.db.total_tokens_used() total_attempted = len(unrated) console.print( f"Rated [bold green]{count}[/] drafts " f"| Total tokens used: {in_tok:,} in + {out_tok:,} out" ) if failures: console.print( f"[yellow]Processed {count}/{total_attempted} drafts, " f"{len(failures)} failure(s):[/]" ) for name, err in failures[:20]: console.print(f" [red]{name}[/]: {err}") return count def extract_ideas(self, draft_name: str, use_cache: bool = True) -> list[dict] | None: """Extract technical ideas from a single draft.""" draft = self.db.get_draft(draft_name) if draft is None: console.print(f"[red]Draft not found: {draft_name}[/]") return None text_excerpt = "" if draft.full_text: text_excerpt = draft.full_text[:3000] prompt = EXTRACT_IDEAS_PROMPT.format( doc_type=_doc_type_label(draft.source), name=draft.name, title=draft.title, pages=draft.pages or "?", abstract=draft.abstract[:2000], text_excerpt=text_excerpt, ) phash = _prompt_hash("ideas-" + prompt) if use_cache: cached = self.db.get_cached_response(draft_name, phash) if cached: try: ideas = json.loads(cached) if isinstance(ideas, list): self.db.insert_ideas(draft_name, ideas) return ideas except (json.JSONDecodeError, KeyError): pass try: text, in_tok, out_tok = self._call_claude(prompt, max_tokens=1024) text = self._extract_json(text) ideas = json.loads(text) if not isinstance(ideas, list): ideas = [ideas] self.db.cache_response( draft_name, phash, self.config.claude_model, prompt, text, in_tok, out_tok, ) self.db.insert_ideas(draft_name, ideas) return ideas except (json.JSONDecodeError, anthropic.APIError) as e: console.print(f"[red]Failed ideas for {draft_name}: {e}[/]") return None def extract_ideas_batch(self, draft_names: list[str], cheap: bool = True) -> int: """Extract ideas from multiple drafts in a single API call. Uses batching to share prompt overhead — ~5x fewer API calls, ~3x fewer tokens than individual extraction. 
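
        Example (illustrative sketch; the draft names below are hypothetical and
        must already exist in the local database):

            analyzer = Analyzer()
            n = analyzer.extract_ideas_batch(
                ["draft-example-agent-protocol-00", "draft-example-agent-auth-01"],
                cheap=True,
            )
            # n counts drafts that yielded at least one idea, not total ideas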
""" drafts = [] for name in draft_names: d = self.db.get_draft(name) if d: drafts.append(d) if not drafts: return 0 # Build compact batch block — abstract only (no full text for batch) drafts_block = "" for d in drafts: drafts_block += f"\n---\n{d.name} | {d.title}\nAbstract: {d.abstract[:800]}\n" prompt = BATCH_IDEAS_PROMPT.format(drafts_block=drafts_block) phash = _prompt_hash(prompt) try: text, in_tok, out_tok = self._call_claude( prompt, max_tokens=400 * len(drafts), cheap=cheap ) text = self._extract_json(text) results = json.loads(text) if not isinstance(results, dict): # Fallback: if it returned a list, try to match by order if isinstance(results, list) and len(results) == len(drafts): results = {d.name: r for d, r in zip(drafts, results)} else: return 0 count = 0 for d in drafts: ideas = results.get(d.name, []) if not isinstance(ideas, list): ideas = [ideas] if ideas else [] self.db.cache_response( d.name, _prompt_hash(f"batch-ideas-{phash}-{d.name}"), self.config.claude_model_cheap if cheap else self.config.claude_model, f"batch-ideas[{d.name}]", json.dumps(ideas), in_tok // len(drafts), out_tok // len(drafts), ) self.db.insert_ideas(d.name, ideas) if ideas: count += 1 return count except (json.JSONDecodeError, anthropic.APIError) as e: console.print(f"[red]Batch ideas failed: {e}[/]") return 0 def extract_all_ideas(self, limit: int = 300, batch_size: int = 5, cheap: bool = True) -> int: """Extract ideas from all drafts that don't have them yet. Args: batch_size: Number of drafts per API call (default 5). Set to 1 to use individual calls with full text. cheap: Use Haiku model for ~10x lower cost (default True). """ missing = self.db.drafts_without_ideas(limit=limit) if not missing: console.print("All drafts already have extracted ideas.") return 0 model_label = "Haiku" if cheap else "Sonnet" if batch_size > 1: console.print( f"Extracting ideas from [bold]{len(missing)}[/] drafts " f"(batches of {batch_size}, {model_label})..." 
) else: console.print(f"Extracting ideas from [bold]{len(missing)}[/] drafts ({model_label})...") count = 0 failures: list[tuple[str, str]] = [] with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), console=console, ) as progress: task = progress.add_task("Extracting ideas...", total=len(missing)) if batch_size > 1: for i in range(0, len(missing), batch_size): batch = missing[i:i + batch_size] names = ", ".join(n.split("-")[-1][:10] for n in batch) progress.update(task, description=f"Batch: {names}") try: n = self.extract_ideas_batch(batch, cheap=cheap) count += n except Exception as e: for bn in batch: failures.append((bn, str(e))) console.print(f"[red]Batch failed: {e}[/]") progress.advance(task, advance=len(batch)) else: for name in missing: progress.update(task, description=f"Ideas: {name.split('-')[-1][:15]}") try: result = self.extract_ideas(name) if result: count += 1 except Exception as e: failures.append((name, str(e))) console.print(f"[red]Failed {name}: {e}[/]") progress.advance(task) total_attempted = len(missing) in_tok, out_tok = self.db.total_tokens_used() console.print( f"Extracted ideas from [bold green]{count}[/] drafts " f"({self.db.idea_count()} total ideas) " f"| Tokens: {in_tok:,} in + {out_tok:,} out" ) if failures: console.print( f"[yellow]Processed {count}/{total_attempted} drafts, " f"{len(failures)} failure(s):[/]" ) for name, err in failures[:20]: console.print(f" [red]{name}[/]: {err}") return count def gap_analysis(self) -> list[dict]: """Analyze the full landscape and identify gaps.""" # Build compressed landscape summary pairs = self.db.drafts_with_ratings(limit=500) total = self.db.count_drafts() # Category summary from collections import defaultdict cat_counts: dict[str, int] = defaultdict(int) for _, rating in pairs: for c in rating.categories: cat_counts[c] += 1 category_summary = "\n".join(f"- {c}: {n} drafts" for c, n in sorted(cat_counts.items(), key=lambda x: x[1], reverse=True)) # Top ideas (if available) all_ideas = self.db.all_ideas() idea_freq: dict[str, int] = defaultdict(int) for idea in all_ideas: idea_freq[idea["title"]] += 1 top_ideas_list = sorted(idea_freq.items(), key=lambda x: x[1], reverse=True)[:20] if top_ideas_list: top_ideas = "\n".join(f"- {title} ({count} drafts)" for title, count in top_ideas_list) else: top_ideas = "(No idea extraction data available yet)" # Overlap summary — use clusters report if it exists overlap_summary = "Multiple clusters of near-duplicate drafts exist, particularly in:\n" for c, n in sorted(cat_counts.items(), key=lambda x: x[1], reverse=True)[:5]: overlap_summary += f"- {c} ({n} drafts, high internal overlap)\n" prompt = GAP_ANALYSIS_PROMPT.format( total=total, category_summary=category_summary, top_ideas=top_ideas, overlap_summary=overlap_summary, ) phash = _prompt_hash(prompt) # Check cache cached = self.db.get_cached_response("_landscape_", phash) if cached: try: gaps = json.loads(cached) if isinstance(gaps, list): self.db.insert_gaps(gaps) return gaps except (json.JSONDecodeError, KeyError): pass try: text, in_tok, out_tok = self._call_claude(prompt, max_tokens=4096) text = self._extract_json(text) gaps = json.loads(text) if not isinstance(gaps, list): gaps = [gaps] self.db.cache_response( "_landscape_", phash, self.config.claude_model, prompt, text, in_tok, out_tok, ) self.db.insert_gaps(gaps) return gaps except (json.JSONDecodeError, anthropic.APIError) as e: console.print(f"[red]Gap analysis failed: {e}[/]") return [] def 
compare_drafts(self, draft_names: list[str], use_cache: bool = True) -> dict: """Compare multiple drafts and return structured comparison. Returns dict with keys: text, drafts (list of names that were compared), or a dict with key 'error' on failure. """ valid_names = [] parts = [] for name in draft_names: draft = self.db.get_draft(name) if draft is None: console.print(f"[yellow]Skipping unknown draft: {name}[/]") continue valid_names.append(name) parts.append(f"### {draft.title}\n**{name}**\n{draft.abstract}") if len(parts) < 2: return {"error": "Need at least 2 valid drafts to compare.", "drafts": valid_names} prompt = COMPARE_PROMPT.format( drafts_section="\n\n---\n\n".join(parts) ) phash = _prompt_hash(prompt) cache_key = "_compare_" + "_".join(sorted(valid_names)) # Check cache if use_cache: cached = self.db.get_cached_response(cache_key, phash) if cached: return {"text": cached, "drafts": valid_names} try: text, in_tok, out_tok = self._call_claude(prompt, max_tokens=2048) # Cache the result self.db.cache_response( cache_key, phash, self.config.claude_model, prompt, text, in_tok, out_tok, ) return {"text": text, "drafts": valid_names} except anthropic.APIError as e: return {"error": f"API error: {e}", "drafts": valid_names} def dedup_ideas(self, threshold: float = 0.85, dry_run: bool = True, draft_name: str | None = None) -> dict: """Deduplicate ideas within each draft using embedding similarity. For each draft, computes pairwise cosine similarity of idea embeddings. Ideas above the threshold are merged (keeping the one with the longer description). Args: threshold: Cosine similarity threshold for merging (default 0.85). dry_run: If True, report what would be merged without deleting. draft_name: If provided, only dedup ideas for this draft. Returns: Dict with keys: total_before, total_after, merged_count, examples. 
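
        Example (illustrative sketch; start with dry_run=True and inspect the
        reported pairs before deleting anything — the 0.85 default is a judgment
        call, not a validated threshold):

            report = analyzer.dedup_ideas(threshold=0.9, dry_run=True)
            print(f"{report['merged_count']} of {report['total_before']} ideas would be merged")
            for ex in report["examples"]:
                print(ex["draft"], ex["keep"], "<-", ex["drop"], ex["similarity"])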
""" import numpy as np import ollama as ollama_lib client = ollama_lib.Client(host=self.config.ollama_url) # Get list of drafts to process if draft_name: draft_names = [draft_name] else: rows = self.db.conn.execute( "SELECT DISTINCT draft_name FROM ideas ORDER BY draft_name" ).fetchall() draft_names = [r["draft_name"] for r in rows] total_before = 0 merged_count = 0 examples = [] ids_to_delete = [] for dname in draft_names: ideas = self.db.get_ideas_for_draft(dname) if len(ideas) < 2: total_before += len(ideas) continue total_before += len(ideas) # Embed each idea: "title: description" texts = [f"{idea['title']}: {idea['description']}" for idea in ideas] try: resp = client.embed( model=self.config.ollama_embed_model, input=texts ) vectors = [ np.array(v, dtype=np.float32) for v in resp["embeddings"] ] except Exception as e: console.print(f"[red]Failed to embed ideas for {dname}: {e}[/]") continue # Track which ideas are already marked for deletion in this draft deleted_in_draft = set() # Compare all pairs within this draft for i in range(len(ideas)): if ideas[i]["id"] in deleted_in_draft: continue for j in range(i + 1, len(ideas)): if ideas[j]["id"] in deleted_in_draft: continue # Cosine similarity dot = np.dot(vectors[i], vectors[j]) norm = np.linalg.norm(vectors[i]) * np.linalg.norm(vectors[j]) sim = float(dot / norm) if norm > 0 else 0.0 if sim >= threshold: # Keep the idea with the longer description keep = ideas[i] if len(ideas[i]["description"]) >= len(ideas[j]["description"]) else ideas[j] drop = ideas[j] if keep is ideas[i] else ideas[i] ids_to_delete.append(drop["id"]) deleted_in_draft.add(drop["id"]) merged_count += 1 if len(examples) < 20: examples.append({ "draft": dname, "keep": keep["title"], "drop": drop["title"], "similarity": round(sim, 3), }) if not dry_run: for idea_id in ids_to_delete: self.db.delete_idea(idea_id) total_after = total_before - merged_count return { "total_before": total_before, "total_after": total_after, "merged_count": merged_count, "examples": examples, } def score_idea_novelty(self, batch_size: int = 20, cheap: bool = True) -> dict: """Score all unscored ideas for novelty (1-5) using Claude. Args: batch_size: Number of ideas per API call (default 20). cheap: Use Haiku model for lower cost (default True). Returns: Dict with keys: scored_count, avg_score, distribution. """ unscored = self.db.ideas_with_drafts(unscored_only=True) if not unscored: console.print("All ideas already scored.") return {"scored_count": 0, "avg_score": 0.0, "distribution": {}} model_label = "Haiku" if cheap else "Sonnet" console.print( f"Scoring [bold]{len(unscored)}[/] ideas for novelty " f"(batches of {batch_size}, {model_label})..." 
) scored_count = 0 all_scores: list[int] = [] with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), console=console, ) as progress: task = progress.add_task("Scoring novelty...", total=len(unscored)) for i in range(0, len(unscored), batch_size): batch = unscored[i:i + batch_size] progress.update(task, description=f"Batch {i // batch_size + 1}") # Build ideas block for prompt ideas_block = "" for idea in batch: ideas_block += ( f"\n---\nID: {idea['id']}\n" f"Draft: {idea['draft_title']}\n" f"Idea: {idea['title']}\n" f"Description: {idea['description']}\n" ) prompt = SCORE_NOVELTY_PROMPT.format(ideas_block=ideas_block) phash = _prompt_hash(prompt) # Check cache cached = self.db.get_cached_response("_novelty_score_", phash) if cached: try: scores = json.loads(cached) if isinstance(scores, dict): batch_scores = {int(k): int(v) for k, v in scores.items()} self.db.update_idea_scores_bulk(batch_scores) scored_count += len(batch_scores) all_scores.extend(batch_scores.values()) progress.advance(task, advance=len(batch)) continue except (json.JSONDecodeError, KeyError, ValueError): pass try: text, in_tok, out_tok = self._call_claude( prompt, max_tokens=50 * len(batch), cheap=cheap ) text = self._extract_json(text) scores = json.loads(text) if not isinstance(scores, dict): console.print(f"[red]Batch {i // batch_size + 1}: unexpected response format[/]") progress.advance(task, advance=len(batch)) continue # Cache the raw response self.db.cache_response( "_novelty_score_", phash, self.config.claude_model_cheap if cheap else self.config.claude_model, prompt, text, in_tok, out_tok, ) # Parse and store scores batch_scores = {} for k, v in scores.items(): try: idea_id = int(k) score = max(1, min(5, int(v))) batch_scores[idea_id] = score except (ValueError, TypeError): continue self.db.update_idea_scores_bulk(batch_scores) scored_count += len(batch_scores) all_scores.extend(batch_scores.values()) except (json.JSONDecodeError, anthropic.APIError) as e: console.print(f"[red]Batch {i // batch_size + 1} failed: {e}[/]") progress.advance(task, advance=len(batch)) # Build distribution distribution: dict[int, int] = {} for s in all_scores: distribution[s] = distribution.get(s, 0) + 1 avg = sum(all_scores) / len(all_scores) if all_scores else 0.0 in_tok, out_tok = self.db.total_tokens_used() console.print( f"Scored [bold green]{scored_count}[/] ideas " f"(avg: {avg:.1f}) | Tokens: {in_tok:,} in + {out_tok:,} out" ) return {"scored_count": scored_count, "avg_score": round(avg, 2), "distribution": distribution}
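

# ============================================================================
# USAGE SKETCH (illustrative only, not executed on import)
#
# One plausible end-to-end pass over the corpus, based on the methods above:
#
#     analyzer = Analyzer()
#     analyzer.rate_all_unrated(limit=300, batch_size=5)
#     analyzer.extract_all_ideas(limit=300, batch_size=5, cheap=True)
#     analyzer.dedup_ideas(threshold=0.85, dry_run=False)
#     analyzer.score_idea_novelty(batch_size=20, cheap=True)
#     gaps = analyzer.gap_analysis()
# ============================================================================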