Run pipeline, write Post 08, commit untracked files
Pipeline: - Extract ideas for 38 new drafts → 462 ideas total - Convergence analysis: 132 cross-org convergent ideas (33% rate) - Fetch authors for 102 drafts → 709 authors (up from 403) - Refresh gap analysis: 12 gaps across full 474-draft corpus - Update verified counts with new totals Post 08: - Complete rewrite of "Agents Building the Agent Analysis" (2,953 words) - Covers 3 phases: writing team → review cycle → fix cycle - Meta-irony table mapping team coordination to IETF gap names - Specific examples from dev journal (SQL injection, consent conflation, ideas mismatch) Untracked files committed: - scripts/: backfill-wg-names, classify-unrated, compare-classifiers, download-relevant-text, run-webui - src/ietf_analyzer/classifier.py: two-stage Ollama classifier - src/webui/: analytics (GDPR-compliant), auth, obsidian_export - tests/test_obsidian_export.py (10 tests) - data/reports/: wg-analysis, generated draft for gap #37 Housekeeping: - .gitignore: exclude LaTeX artifacts, stale DBs, analytics.db Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
86
scripts/compare-classifiers.py
Normal file
86
scripts/compare-classifiers.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare Ollama classifier vs Claude ratings to find disagreements."""
|
||||
|
||||
import sqlite3
|
||||
import sys
|
||||
sys.path.insert(0, "src")
|
||||
|
||||
from ietf_analyzer.classifier import Classifier
|
||||
from ietf_analyzer.config import Config
|
||||
|
||||
# Rated drafts joined with their Claude ratings; `composite` averages the five
# rating axes with `overlap` inverted (5 - overlap).
RATED_DRAFTS_SQL = """
    SELECT d.name, d.title, d.abstract, r.relevance, r.false_positive,
           r.novelty, r.maturity, r.overlap, r.momentum,
           (r.novelty + r.maturity + (5 - r.overlap) + r.momentum + r.relevance) / 5.0 as composite
    FROM drafts d JOIN ratings r ON d.name = r.draft_name
    WHERE d.abstract IS NOT NULL AND d.abstract != ''
    ORDER BY d.name
"""

RULE = "=" * 70


def claude_considers_relevant(row):
    """Return True when Claude's stored rating marks the draft as relevant.

    Claude's view: ``false_positive`` set OR ``relevance <= 2`` means
    "not really relevant"; everything else counts as relevant.

    Args:
        row: Mapping with ``false_positive`` and ``relevance`` keys.
    """
    return not row["false_positive"] and row["relevance"] >= 3


def disagreement_record(row, sim, method):
    """Build the report entry for a draft on which the two classifiers disagree.

    Args:
        row: Mapping holding the draft and rating columns selected by the query.
        sim: Similarity value returned by the classifier for this draft.
        method: Method label returned by the classifier for this draft.

    Returns:
        A dict with the fields printed in the disagreement listings.
    """
    return {
        "name": row["name"],
        # The query only guarantees a non-NULL abstract; a NULL title must not
        # crash the slice.
        "title": (row["title"] or "")[:60],
        "sim": sim,
        "method": method,
        "relevance": row["relevance"],
        "fp": row["false_positive"],
        "composite": row["composite"],
    }


def summarize_counts(total, agree, wasted, missed, claude_relevant):
    """Derive the pre-filter summary figures from the raw tallies.

    Args:
        total: Number of drafts compared.
        agree: Drafts on which both classifiers gave the same verdict.
        wasted: Drafts Ollama passed but Claude rated not relevant.
        missed: Drafts Ollama rejected but Claude rated relevant.
        claude_relevant: Drafts Claude rated relevant.

    Returns:
        A dict with ``agreement_pct``, ``passed`` (drafts Ollama lets through),
        ``rejected`` (Claude calls saved) and ``correctly_rejected``.
    """
    # Passed = Claude-relevant drafts Ollama kept, plus the ones it wrongly kept.
    passed = (claude_relevant - missed) + wasted
    rejected = total - passed
    return {
        # Guard the empty corpus instead of dividing by zero.
        "agreement_pct": 100 * agree / total if total else 0.0,
        "passed": passed,
        "rejected": rejected,
        "correctly_rejected": rejected - missed,
    }


def compare(cfg, rows):
    """Classify every row with the Ollama classifier and print the comparison.

    Args:
        cfg: Configuration object passed to ``Classifier``.
        rows: Rated drafts as returned by ``RATED_DRAFTS_SQL``.
    """
    print(f"Comparing Ollama classifier vs Claude ratings on {len(rows)} drafts...\n")
    if not rows:
        print("No rated drafts with a non-empty abstract; nothing to compare.")
        return

    agree = 0
    wasted = []  # Ollama says relevant, Claude says FP
    missed = []  # Ollama says irrelevant, Claude says relevant

    with Classifier(cfg) as clf:
        for done, r in enumerate(rows, start=1):
            is_rel, sim, method = clf.classify(r["title"], r["abstract"])
            claude_relevant = claude_considers_relevant(r)

            if bool(is_rel) == claude_relevant:
                agree += 1
            elif is_rel:
                wasted.append(disagreement_record(r, sim, method))
            else:
                missed.append(disagreement_record(r, sim, method))

            if done % 50 == 0:
                print(f"  Processed {done}/{len(rows)}...")

    total_relevant_by_claude = sum(1 for r in rows if claude_considers_relevant(r))
    total_fp_by_claude = len(rows) - total_relevant_by_claude
    stats = summarize_counts(
        len(rows), agree, len(wasted), len(missed), total_relevant_by_claude
    )

    print(f"\n{RULE}")
    print(f"AGREEMENT: {agree}/{len(rows)} ({stats['agreement_pct']:.1f}%)")
    print(RULE)

    print(f"\nOllama=RELEVANT but Claude=NOT relevant ({len(wasted)}):")
    print("  (These are cases where Ollama wastes Claude tokens on irrelevant drafts)")
    for d in sorted(wasted, key=lambda x: x["sim"], reverse=True)[:15]:
        fp_label = " [FP]" if d["fp"] else ""
        print(f"  sim={d['sim']:.3f} ({d['method']:18s}) rel={d['relevance']}{fp_label} | {d['name']}")
        print(f"    {d['title']}")

    print(f"\nOllama=IRRELEVANT but Claude=RELEVANT ({len(missed)}):")
    print("  (These are cases where Ollama would have incorrectly filtered out good drafts)")
    for d in sorted(missed, key=lambda x: x["relevance"], reverse=True)[:15]:
        print(f"  sim={d['sim']:.3f} ({d['method']:18s}) rel={d['relevance']} comp={d['composite']:.1f} | {d['name']}")
        print(f"    {d['title']}")

    # Summary stats
    print(f"\n{RULE}")
    print(f"Claude thinks: {total_relevant_by_claude} relevant, {total_fp_by_claude} not relevant")
    print(f"Ollama would let through: {stats['passed']} (saves {stats['rejected']} Claude calls)")
    print("\nToken savings if Ollama pre-filters:")
    print(f"  Correctly rejected: {stats['correctly_rejected']} drafts")
    print(f"  Incorrectly rejected (missed): {len(missed)} drafts")
    print(f"  Incorrectly passed (wasted): {len(wasted)} drafts")


def main():
    """Load the rated drafts from the configured database and run the comparison."""
    cfg = Config.load()
    conn = sqlite3.connect(cfg.db_path)
    try:
        conn.row_factory = sqlite3.Row
        # Get all rated drafts with their Claude ratings
        rows = conn.execute(RATED_DRAFTS_SQL).fetchall()
        compare(cfg, rows)
    finally:
        # Release the connection even if classification or reporting fails.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user