#!/usr/bin/env python3
"""Compare Claude Haiku vs Ollama as pre-classifiers, using Claude Sonnet ratings as ground truth."""
import sqlite3
import hashlib
import json
import sys
import time

sys.path.insert(0, "src")
import anthropic

from ietf_analyzer.config import Config

cfg = Config.load()
conn = sqlite3.connect(cfg.db_path)
conn.row_factory = sqlite3.Row

HAIKU_PROMPT = """\
You are classifying IETF Internet-Drafts for an AI/agent standards tracker.

A draft is RELEVANT if it relates to ANY of these topics:
- AI agents, autonomous agents, multi-agent systems
- Agent identity, authentication, authorization, discovery
- Agent-to-agent (A2A) communication protocols
- Large language models (LLMs), generative AI
- Machine learning in network operations
- AI safety, alignment, trustworthiness
- Model Context Protocol (MCP), agentic workflows
- OAuth/JWT/credentials for agents or AI systems
- Autonomous network operations using AI
- Intelligent network management or traffic handling

A draft is NOT relevant if it only covers:
- Pure cryptography without AI/agent context
- General networking protocols (BGP, DNS, TLS) without AI
- Email, HTTP, or web standards without AI/agent features
- Remote attestation (RATS) unless specifically for AI agents
- Accessibility guidelines for user agents (browsers)

Title: {title}
Abstract: {abstract}

Is this draft relevant to AI agents or related topics? Answer ONLY "yes" or "no"."""

client = anthropic.Anthropic()


def haiku_classify(title, abstract):
    """Classify with Haiku, using llm_cache to avoid repeat calls."""
    prompt = HAIKU_PROMPT.format(title=title, abstract=abstract[:2000])
    cache_key = hashlib.sha256(f"haiku-classify:{prompt}".encode()).hexdigest()

    # Check cache
    cached = conn.execute(
        "SELECT response_json FROM llm_cache WHERE prompt_hash=?", (cache_key,)
    ).fetchone()
    if cached:
        return cached["response_json"].strip().lower().startswith("yes"), True

    resp = client.messages.create(
        model=cfg.claude_model_cheap,
        max_tokens=10,
        messages=[{"role": "user", "content": prompt}],
    )
    answer = resp.content[0].text.strip().lower()

    # Cache it
    conn.execute(
        "INSERT OR REPLACE INTO llm_cache (draft_name, prompt_hash, request_json, response_json, model, input_tokens, output_tokens) VALUES (?,?,?,?,?,?,?)",
        ("_classify_", cache_key, prompt[:500], answer, cfg.claude_model_cheap,
         resp.usage.input_tokens, resp.usage.output_tokens),
    )
    conn.commit()
    return answer.startswith("yes"), False


# Get all rated drafts
rows = conn.execute("""
    SELECT d.name, d.title, d.abstract, r.relevance, r.false_positive
    FROM drafts d JOIN ratings r ON d.name = r.draft_name
    WHERE d.abstract IS NOT NULL AND d.abstract != ''
    ORDER BY d.name
""").fetchall()

print(f"Classifying {len(rows)} drafts with Haiku...\n")

haiku_agree = 0
haiku_fp = []  # Haiku=yes, Claude=no
haiku_fn = []  # Haiku=no, Claude=yes
total_tokens_in = 0
total_tokens_out = 0
cached_count = 0
api_count = 0

for i, r in enumerate(rows):
    claude_relevant = not r["false_positive"] and r["relevance"] >= 3
    haiku_relevant, was_cached = haiku_classify(r["title"], r["abstract"])
    if was_cached:
        cached_count += 1
    else:
        api_count += 1
        if api_count % 20 == 0:
            time.sleep(1)  # rate limit

    if haiku_relevant == claude_relevant:
        haiku_agree += 1
    elif haiku_relevant and not claude_relevant:
        haiku_fp.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})
    else:
        haiku_fn.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})

    if (i + 1) % 50 == 0:
        print(f"  Processed {i + 1}/{len(rows)} ({cached_count} cached, {api_count} API calls)...")

print(f"\n{'='*70}")
print(f"HAIKU AGREEMENT with Claude Sonnet: {haiku_agree}/{len(rows)} ({100*haiku_agree/len(rows):.1f}%)")
print(f"API calls: {api_count}, Cached: {cached_count}")
print(f"{'='*70}")

print(f"\nHaiku=RELEVANT but Sonnet=NOT ({len(haiku_fp)}):")
for d in haiku_fp[:10]:
    fp = " [FP]" if d["fp"] else ""
    print(f"  rel={d['rel']}{fp} | {d['name']}: {d['title']}")

print(f"\nHaiku=IRRELEVANT but Sonnet=RELEVANT ({len(haiku_fn)}):")
for d in haiku_fn[:10]:
    print(f"  rel={d['rel']} | {d['name']}: {d['title']}")

# Cost estimate
avg_tokens_per_call = 800  # ~800 input tokens per classification
cost_per_draft = (avg_tokens_per_call * 0.80 + 50 * 4.0) / 1_000_000  # Haiku pricing
print(f"\n{'='*70}")
print(f"Cost estimate: ~${cost_per_draft:.5f}/draft = ~${cost_per_draft * len(rows):.3f} for {len(rows)} drafts")
print(f"Ollama cost: $0 (but 66.9% agreement)")
print(f"Haiku cost: ~${cost_per_draft * len(rows):.3f} ({100*haiku_agree/len(rows):.1f}% agreement)")

conn.close()
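

# --- Reference sketch: the Ollama side of the comparison --------------------
# The docstring and the final "Ollama cost: $0 (but 66.9% agreement)" line
# refer to a local-model pre-classifier that this script does not re-run.
# The function below is an illustrative, minimal sketch of what that path
# could look like; it is NOT called above. It assumes a local Ollama server
# on its default /api/generate endpoint, and the model tag is a placeholder,
# not necessarily what produced the 66.9% figure.
def ollama_classify(title, abstract, model="llama3.1:8b"):
    """Classify a draft with a local Ollama model; True if it answers 'yes'."""
    import urllib.request  # stdlib; imported locally so the sketch stays optional

    prompt = HAIKU_PROMPT.format(title=title, abstract=abstract[:2000])
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",  # Ollama's default endpoint
        data=json.dumps({"model": model, "prompt": prompt, "stream": False}).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as f:
        answer = json.loads(f.read())["response"].strip().lower()
    return answer.startswith("yes")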