Idea quality pipeline, web UI features, academic paper
- Tighten idea extraction prompts (1-4 ideas, no sub-features), reducing 1,907 ideas to 468 across 434 drafts (78% reduction)
- Add embedding-based dedup (`ietf dedup-ideas`) for same-draft similarity
- Add novelty scoring (`ietf ideas score`) and filtering (`ietf ideas filter`) using Claude to rate ideas 1-5, removing 49 generic building blocks
- Final count: 419 high-quality ideas (avg 1.1/draft)
- Web UI: gap explorer with live draft generation and pre-generated demos
- Web UI: D3.js author collaboration network (498 nodes, 1142 edges, 68 clusters, org filtering, interactive zoom/pan)
- Academic paper: 15-page LaTeX workshop paper analyzing the 434-draft AI agent standards landscape
- Save improvement ideas backlog to data/reports/improvement-ideas.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -77,7 +77,7 @@ Abstract: {abstract}
|
||||
|
||||
{text_excerpt}
|
||||
|
||||
Return 0-8 ideas. Only include CONCRETE, NOVEL technical contributions — not restatements of the abstract or general goals. If the draft has no substantive technical ideas (e.g. it is a problem statement, administrative document, or off-topic), return an empty array [].
|
||||
Return 1-4 ideas. Extract only TOP-LEVEL novel contributions. Do NOT list sub-features, optimizations, variants, or extensions as separate ideas. If a draft defines one protocol with multiple features, that is ONE idea, not several. Each idea must be independently novel — could it be its own draft? If not, merge it with the parent idea. Only include CONCRETE, NOVEL technical contributions — not restatements of the abstract or general goals. If the draft has no substantive technical ideas (e.g. it is a problem statement, administrative document, or off-topic), return an empty array [].
|
||||
JSON array only, no fences."""
|
||||
|
||||
BATCH_IDEAS_PROMPT = """\
|
||||
@@ -86,7 +86,7 @@ Per idea: {{"title":"short name","description":"1 sentence","type":"mechanism|pr
|
||||
|
||||
{drafts_block}
|
||||
|
||||
0-8 ideas per draft. Only include CONCRETE, NOVEL technical contributions. If a draft has no substantive ideas, map it to an empty array. Do not pad with restatements of the abstract.
|
||||
1-4 ideas per draft. Extract only TOP-LEVEL novel contributions. Do NOT list sub-features, optimizations, variants, or extensions as separate ideas. If a draft defines one protocol with multiple features, that is ONE idea, not several. Each idea must be independently novel — could it be its own draft? If not, merge it with the parent idea. Only include CONCRETE, NOVEL technical contributions. If a draft has no substantive ideas, map it to an empty array. Do not pad with restatements of the abstract.
|
||||
Return ONLY a JSON object like {{"draft-name":[...], ...}}, no fences."""
|
||||
|
||||
GAP_ANALYSIS_PROMPT = """\
|
||||
@@ -115,6 +115,21 @@ Focus on:
|
||||
|
||||
JSON array only, no fences."""
|
||||
|
||||
SCORE_NOVELTY_PROMPT = """\
|
||||
Rate each idea's novelty/originality on a 1-5 scale.
|
||||
|
||||
1 = Generic building block anyone would include (e.g. "Agent Gateway", "Certificate Authority")
|
||||
2 = Obvious extension of existing work, minimal originality
|
||||
3 = Useful and relevant but expected given the problem space
|
||||
4 = Interesting contribution with some original thinking
|
||||
5 = Genuinely novel mechanism, protocol, or architectural insight
|
||||
|
||||
Ideas to score:
|
||||
{ideas_block}
|
||||
|
||||
Return ONLY a JSON object mapping idea ID to score, like {{"123": 3, "456": 1, ...}}.
|
||||
No fences, no explanation."""
|
||||
|
||||
|
||||
def _prompt_hash(text: str) -> str:
|
||||
return hashlib.sha256(text.encode()).hexdigest()[:16]
|
||||
@@ -558,3 +573,222 @@ class Analyzer:
|
||||
return text
|
||||
except anthropic.APIError as e:
|
||||
return f"Error: {e}"
|
||||
|
||||
def dedup_ideas(self, threshold: float = 0.85, dry_run: bool = True,
                draft_name: str | None = None) -> dict:
    """Deduplicate ideas within each draft using embedding similarity.

    For each draft, computes pairwise cosine similarity of idea embeddings
    (via the configured Ollama embedding model). Ideas at or above the
    threshold are merged, keeping the one with the longer description.

    Args:
        threshold: Cosine similarity threshold for merging (default 0.85).
        dry_run: If True, report what would be merged without deleting.
        draft_name: If provided, only dedup ideas for this draft.

    Returns:
        Dict with keys: total_before, total_after, merged_count, examples
        (up to 20 sample merges with keep/drop titles and similarity).
    """
    # Local imports keep heavy/optional deps out of module import time.
    import numpy as np
    import ollama as ollama_lib

    client = ollama_lib.Client(host=self.config.ollama_url)

    # Get list of drafts to process
    if draft_name:
        draft_names = [draft_name]
    else:
        rows = self.db.conn.execute(
            "SELECT DISTINCT draft_name FROM ideas ORDER BY draft_name"
        ).fetchall()
        draft_names = [r["draft_name"] for r in rows]

    total_before = 0
    merged_count = 0
    examples = []
    ids_to_delete = []

    for dname in draft_names:
        ideas = self.db.get_ideas_for_draft(dname)
        if len(ideas) < 2:
            # Nothing to compare, but still count toward the total.
            total_before += len(ideas)
            continue

        total_before += len(ideas)

        # Embed each idea: "title: description"
        texts = [f"{idea['title']}: {idea['description']}" for idea in ideas]
        try:
            resp = client.embed(
                model=self.config.ollama_embed_model, input=texts
            )
            vectors = [
                np.array(v, dtype=np.float32)
                for v in resp["embeddings"]
            ]
        except Exception as e:
            # Best-effort: skip this draft rather than abort the whole run.
            console.print(f"[red]Failed to embed ideas for {dname}: {e}[/]")
            continue

        # Track which ideas are already marked for deletion in this draft
        deleted_in_draft = set()

        # Compare all pairs within this draft
        for i in range(len(ideas)):
            if ideas[i]["id"] in deleted_in_draft:
                continue
            for j in range(i + 1, len(ideas)):
                if ideas[j]["id"] in deleted_in_draft:
                    continue

                # Cosine similarity (guard against zero-norm vectors)
                dot = np.dot(vectors[i], vectors[j])
                norm = np.linalg.norm(vectors[i]) * np.linalg.norm(vectors[j])
                sim = float(dot / norm) if norm > 0 else 0.0

                if sim >= threshold:
                    # Keep the idea with the longer description
                    if len(ideas[i]["description"]) >= len(ideas[j]["description"]):
                        keep, drop = ideas[i], ideas[j]
                    else:
                        keep, drop = ideas[j], ideas[i]

                    ids_to_delete.append(drop["id"])
                    deleted_in_draft.add(drop["id"])
                    merged_count += 1

                    if len(examples) < 20:
                        examples.append({
                            "draft": dname,
                            "keep": keep["title"],
                            "drop": drop["title"],
                            "similarity": round(sim, 3),
                        })

                    # Fix: if the anchor idea i was the one dropped, stop
                    # using it as the comparison anchor. Previously the
                    # inner loop kept comparing against the deleted idea,
                    # so later ideas could be removed for matching an idea
                    # that was itself being deleted.
                    if drop is ideas[i]:
                        break

    if not dry_run:
        for idea_id in ids_to_delete:
            self.db.delete_idea(idea_id)

    total_after = total_before - merged_count
    return {
        "total_before": total_before,
        "total_after": total_after,
        "merged_count": merged_count,
        "examples": examples,
    }
|
||||
|
||||
def score_idea_novelty(self, batch_size: int = 20, cheap: bool = True) -> dict:
    """Score all unscored ideas for novelty (1-5) using Claude.

    Ideas are scored in batches; each batch is one API call whose raw JSON
    response is cached by prompt hash, so re-runs are free. Failed batches
    are logged and skipped (best-effort); already-scored ideas are never
    re-scored because the query fetches unscored ideas only.

    Args:
        batch_size: Number of ideas per API call (default 20).
        cheap: Use Haiku model for lower cost (default True).

    Returns:
        Dict with keys: scored_count, avg_score, distribution
        (distribution maps score value -> count).
    """
    unscored = self.db.ideas_with_drafts(unscored_only=True)
    if not unscored:
        console.print("All ideas already scored.")
        return {"scored_count": 0, "avg_score": 0.0, "distribution": {}}

    model_label = "Haiku" if cheap else "Sonnet"
    console.print(
        f"Scoring [bold]{len(unscored)}[/] ideas for novelty "
        f"(batches of {batch_size}, {model_label})..."
    )

    scored_count = 0
    all_scores: list[int] = []

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Scoring novelty...", total=len(unscored))

        for i in range(0, len(unscored), batch_size):
            batch = unscored[i:i + batch_size]
            progress.update(task, description=f"Batch {i // batch_size + 1}")

            # Build ideas block for prompt (format expected by
            # SCORE_NOVELTY_PROMPT: "---"-separated records).
            ideas_block = ""
            for idea in batch:
                ideas_block += (
                    f"\n---\nID: {idea['id']}\n"
                    f"Draft: {idea['draft_title']}\n"
                    f"Idea: {idea['title']}\n"
                    f"Description: {idea['description']}\n"
                )

            prompt = SCORE_NOVELTY_PROMPT.format(ideas_block=ideas_block)
            phash = _prompt_hash(prompt)

            # Check cache: a hit skips the API call entirely. A corrupt
            # cached entry falls through to a fresh API call.
            cached = self.db.get_cached_response("_novelty_score_", phash)
            if cached:
                try:
                    scores = json.loads(cached)
                    if isinstance(scores, dict):
                        batch_scores = {int(k): int(v) for k, v in scores.items()}
                        self.db.update_idea_scores_bulk(batch_scores)
                        scored_count += len(batch_scores)
                        all_scores.extend(batch_scores.values())
                        progress.advance(task, advance=len(batch))
                        continue
                except (json.JSONDecodeError, KeyError, ValueError):
                    pass

            try:
                # ~50 output tokens per idea is enough for an ID->score map.
                text, in_tok, out_tok = self._call_claude(
                    prompt, max_tokens=50 * len(batch), cheap=cheap
                )
                text = self._extract_json(text)
                scores = json.loads(text)

                if not isinstance(scores, dict):
                    console.print(f"[red]Batch {i // batch_size + 1}: unexpected response format[/]")
                    progress.advance(task, advance=len(batch))
                    continue

                # Cache the raw response (only well-formed dicts reach here).
                self.db.cache_response(
                    "_novelty_score_", phash,
                    self.config.claude_model_cheap if cheap else self.config.claude_model,
                    prompt, text, in_tok, out_tok,
                )

                # Parse and store scores; clamp to 1-5 and silently drop
                # entries with non-integer keys or values.
                batch_scores = {}
                for k, v in scores.items():
                    try:
                        idea_id = int(k)
                        score = max(1, min(5, int(v)))
                        batch_scores[idea_id] = score
                    except (ValueError, TypeError):
                        continue

                self.db.update_idea_scores_bulk(batch_scores)
                scored_count += len(batch_scores)
                all_scores.extend(batch_scores.values())

            except (json.JSONDecodeError, anthropic.APIError) as e:
                # Best-effort: a failed batch is reported and skipped; the
                # ideas remain unscored and will be retried on the next run.
                console.print(f"[red]Batch {i // batch_size + 1} failed: {e}[/]")

            progress.advance(task, advance=len(batch))

    # Build distribution (score value -> count)
    distribution: dict[int, int] = {}
    for s in all_scores:
        distribution[s] = distribution.get(s, 0) + 1

    avg = sum(all_scores) / len(all_scores) if all_scores else 0.0

    # NOTE(review): total_tokens_used appears to be a cumulative DB total,
    # not just this run's usage — confirm before reading the log that way.
    in_tok, out_tok = self.db.total_tokens_used()
    console.print(
        f"Scored [bold green]{scored_count}[/] ideas "
        f"(avg: {avg:.1f}) | Tokens: {in_tok:,} in + {out_tok:,} out"
    )
    return {"scored_count": scored_count, "avg_score": round(avg, 2), "distribution": distribution}
|
||||
|
||||
Reference in New Issue
Block a user