#!/usr/bin/env python3
"""Compare Ollama classifier vs Claude ratings to find disagreements."""

import sqlite3
import sys
from contextlib import closing
from dataclasses import dataclass, field
from operator import itemgetter

# Claude rating threshold: relevance >= 3 (and not flagged false_positive)
# counts as "relevant"; relevance <= 2 counts as "not really relevant".
MIN_RELEVANT_SCORE = 3
# How many disagreements of each kind are listed in the report.
TOP_N = 15
# Emit a progress line after every this-many classified drafts.
PROGRESS_EVERY = 50
# Titles are truncated to this many characters in the report.
TITLE_WIDTH = 60

# All rated drafts that have a non-empty abstract, with Claude's ratings and
# a composite score (overlap is inverted so that higher is always better).
RATED_DRAFTS_SQL = """
    SELECT d.name, d.title, d.abstract, r.relevance, r.false_positive,
           r.novelty, r.maturity, r.overlap, r.momentum,
           (r.novelty + r.maturity + (5 - r.overlap) + r.momentum + r.relevance) / 5.0 as composite
    FROM drafts d
    JOIN ratings r ON d.name = r.draft_name
    WHERE d.abstract IS NOT NULL AND d.abstract != ''
    ORDER BY d.name
"""


@dataclass
class Comparison:
    """Confusion-matrix style tally of Ollama verdicts against Claude ratings."""

    both_relevant: int = 0  # Ollama relevant, Claude relevant
    both_irrelevant: int = 0  # Ollama irrelevant, Claude not relevant
    wasted: list = field(default_factory=list)  # Ollama relevant, Claude not
    missed: list = field(default_factory=list)  # Ollama irrelevant, Claude relevant

    @property
    def total(self) -> int:
        """Number of drafts compared."""
        return self.both_relevant + self.both_irrelevant + len(self.wasted) + len(self.missed)

    @property
    def agree(self) -> int:
        """Number of drafts on which both sides reach the same verdict."""
        return self.both_relevant + self.both_irrelevant

    @property
    def passed(self) -> int:
        """Number of drafts Ollama classifies as relevant (would reach Claude)."""
        return self.both_relevant + len(self.wasted)

    @property
    def rejected(self) -> int:
        """Number of drafts Ollama filters out (Claude calls saved)."""
        return self.both_irrelevant + len(self.missed)


def is_claude_relevant(row) -> bool:
    """Return True when Claude's rating marks the draft as relevant.

    A draft is relevant when it is not flagged ``false_positive`` and its
    ``relevance`` score is at least ``MIN_RELEVANT_SCORE``.
    """
    return bool(not row["false_positive"] and row["relevance"] >= MIN_RELEVANT_SCORE)


def compare_ratings(rows, classify, progress_every=PROGRESS_EVERY) -> Comparison:
    """Classify every row and tally agreement with Claude's ratings.

    Args:
        rows: Sequence of mappings (e.g. ``sqlite3.Row``) providing the keys
            ``name``, ``title``, ``abstract``, ``relevance``,
            ``false_positive`` and ``composite``.
        classify: Callable ``(title, abstract) -> (is_relevant, sim, method)``.
        progress_every: Print a progress line after each multiple of this
            many rows.

    Returns:
        A ``Comparison`` holding agreement counts and the disagreement details.
    """
    result = Comparison()
    for count, row in enumerate(rows, start=1):
        is_rel, sim, method = classify(row["title"], row["abstract"])
        ollama_relevant = bool(is_rel)
        claude_relevant = is_claude_relevant(row)

        if ollama_relevant and claude_relevant:
            result.both_relevant += 1
        elif not ollama_relevant and not claude_relevant:
            result.both_irrelevant += 1
        else:
            entry = {
                "name": row["name"],
                # A NULL title must not abort the whole run.
                "title": (row["title"] or "")[:TITLE_WIDTH],
                "sim": sim,
                "method": method,
                "relevance": row["relevance"],
                "fp": row["false_positive"],
                "composite": row["composite"],
            }
            if ollama_relevant:
                result.wasted.append(entry)
            else:
                result.missed.append(entry)

        if count % progress_every == 0:
            print(f" Processed {count}/{len(rows)}...")
    return result


def format_report(result: Comparison) -> list:
    """Render the comparison as a list of strings, one per ``print`` call."""
    total = result.total
    # Guard against an empty result set instead of dividing by zero.
    agreement_pct = 100 * result.agree / total if total else 0.0
    bar = "=" * 70

    lines = [
        f"\n{bar}",
        f"AGREEMENT: {result.agree}/{total} ({agreement_pct:.1f}%)",
        bar,
        f"\nOllama=RELEVANT but Claude=NOT relevant ({len(result.wasted)}):",
        " (These are cases where Ollama wastes Claude tokens on irrelevant drafts)",
    ]
    # Most confident wrong "relevant" verdicts first.
    for d in sorted(result.wasted, key=itemgetter("sim"), reverse=True)[:TOP_N]:
        fp_label = " [FP]" if d["fp"] else ""
        lines.append(
            f" sim={d['sim']:.3f} ({d['method']:18s}) rel={d['relevance']}{fp_label} | {d['name']}"
        )
        lines.append(f" {d['title']}")

    lines.append(f"\nOllama=IRRELEVANT but Claude=RELEVANT ({len(result.missed)}):")
    lines.append(" (These are cases where Ollama would have incorrectly filtered out good drafts)")
    # Drafts Claude rated highest are the most costly misses.
    for d in sorted(result.missed, key=itemgetter("relevance"), reverse=True)[:TOP_N]:
        lines.append(
            f" sim={d['sim']:.3f} ({d['method']:18s}) rel={d['relevance']} comp={d['composite']:.1f} | {d['name']}"
        )
        lines.append(f" {d['title']}")

    claude_relevant_total = result.both_relevant + len(result.missed)
    claude_not_relevant_total = result.both_irrelevant + len(result.wasted)
    lines.extend([
        f"\n{bar}",
        f"Claude thinks: {claude_relevant_total} relevant, {claude_not_relevant_total} not relevant",
        # Passed = everything Ollama calls relevant; saved = everything it rejects.
        f"Ollama would let through: {result.passed} (saves {result.rejected} Claude calls)",
        "\nToken savings if Ollama pre-filters:",
        f" Correctly rejected: {result.both_irrelevant} drafts",
        f" Incorrectly rejected (missed): {len(result.missed)} drafts",
        f" Incorrectly passed (wasted): {len(result.wasted)} drafts",
    ])
    return lines


def main() -> None:
    """Load rated drafts, run the classifier over them and print the report."""
    # The project package lives under ./src, so the path must be extended
    # before importing it; importing here also keeps the helpers above usable
    # without the package installed.
    sys.path.insert(0, "src")
    from ietf_analyzer.classifier import Classifier
    from ietf_analyzer.config import Config

    cfg = Config.load()

    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(cfg.db_path)) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(RATED_DRAFTS_SQL).fetchall()

    print(f"Comparing Ollama classifier vs Claude ratings on {len(rows)} drafts...\n")

    with Classifier(cfg) as clf:
        result = compare_ratings(rows, clf.classify)

    for line in format_report(result):
        print(line)


if __name__ == "__main__":
    main()