Files
ietf-draft-analyzer/scripts/compare-haiku-classifier.py
Christian Nennemann 1ec1f69bee v0.3.0: Publication-ready release with blog site, paper update, and polish
Release prep:
- Version bump to 0.3.0 (pyproject.toml, cli.py)
- Rewrite README.md with current stats (475 drafts, 713 authors, 501 ideas)
- Add CONTRIBUTING.md with dev setup and code conventions

Blog site:
- Add scripts/build-site.py (markdown → HTML with clean CSS, dark mode, nav)
- Generate static site in docs/blog/ (10 pages)
- Ready for GitHub Pages deployment

Academic paper (paper/main.tex):
- Update all counts: 474→475 drafts, 557→710 authors, 1907→462 ideas, 11→12 gaps
- Add false-positive filtering methodology (113 excluded, 361 relevant)
- Add cross-org convergence analysis (132 ideas, 33% rate)
- Add GDPR compliance gap to gap table
- Add LLM-as-judge caveats to rating methodology and limitations
- Add FIPA, IEEE P3394, W3C WoT to related work with bibliography entries
- Fix safety ratio to show monthly variation (1.5:1 to 21:1)

Pipeline:
- Fetch 1 new draft (475 total), 3 new authors (713 total)
- Fix 16 ruff lint errors across test files
- All 106 tests pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 17:54:43 +01:00

136 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""Compare Claude Haiku vs Ollama as pre-classifiers, using Claude Sonnet ratings as ground truth."""
import sqlite3
import hashlib
import json
import sys
import time
# Make the in-repo package importable when the script is run from the repo
# root; must happen before the ietf_analyzer import below.
sys.path.insert(0, "src")
import anthropic
from ietf_analyzer.config import Config
# Module-level shared state: one config and one SQLite connection, reused by
# haiku_classify() for both cache lookups and cache inserts.
cfg = Config.load()
conn = sqlite3.connect(cfg.db_path)
conn.row_factory = sqlite3.Row  # allows column access by name, e.g. row["title"]
# Yes/no relevance prompt for the cheap model. The include/exclude criteria
# mirror the topics the Sonnet rater scores, so agreement is meaningful.
# NOTE: the rendered prompt is hashed into the llm_cache key (see
# haiku_classify), so editing this text invalidates all cached answers.
HAIKU_PROMPT = """\
You are classifying IETF Internet-Drafts for an AI/agent standards tracker.
A draft is RELEVANT if it relates to ANY of these topics:
- AI agents, autonomous agents, multi-agent systems
- Agent identity, authentication, authorization, discovery
- Agent-to-agent (A2A) communication protocols
- Large language models (LLMs), generative AI
- Machine learning in network operations
- AI safety, alignment, trustworthiness
- Model Context Protocol (MCP), agentic workflows
- OAuth/JWT/credentials for agents or AI systems
- Autonomous network operations using AI
- Intelligent network management or traffic handling
A draft is NOT relevant if it only covers:
- Pure cryptography without AI/agent context
- General networking protocols (BGP, DNS, TLS) without AI
- Email, HTTP, or web standards without AI/agent features
- Remote attestation (RATS) unless specifically for AI agents
- Accessibility guidelines for user agents (browsers)
Title: {title}
Abstract: {abstract}
Is this draft relevant to AI agents or related topics? Answer ONLY "yes" or "no"."""
# Picks up credentials from the environment (standard anthropic SDK behavior).
client = anthropic.Anthropic()
def haiku_classify(title, abstract):
    """Return (is_relevant, was_cached) for one draft, judged by Haiku.

    Answers are memoized in the llm_cache table, keyed on a SHA-256 of the
    fully rendered prompt, so repeat runs make no API calls for known drafts.
    """
    prompt = HAIKU_PROMPT.format(title=title, abstract=abstract[:2000])
    key = hashlib.sha256(f"haiku-classify:{prompt}".encode()).hexdigest()
    # Fast path: this exact prompt was answered on a previous run.
    row = conn.execute(
        "SELECT response_json FROM llm_cache WHERE prompt_hash=?", (key,)
    ).fetchone()
    if row is not None:
        verdict = row["response_json"].strip().lower()
        return verdict.startswith("yes"), True
    # Cache miss: ask the model for a bare yes/no (max_tokens=10 keeps it terse).
    reply = client.messages.create(
        model=cfg.claude_model_cheap,
        max_tokens=10,
        messages=[{"role": "user", "content": prompt}],
    )
    verdict = reply.content[0].text.strip().lower()
    # Persist answer and token usage so the next run hits the fast path.
    conn.execute(
        "INSERT OR REPLACE INTO llm_cache (draft_name, prompt_hash, request_json, response_json, model, input_tokens, output_tokens) VALUES (?,?,?,?,?,?,?)",
        ("_classify_", key, prompt[:500], verdict, cfg.claude_model_cheap, reply.usage.input_tokens, reply.usage.output_tokens),
    )
    conn.commit()
    return verdict.startswith("yes"), False
# --- Compare Haiku against Sonnet ground truth ------------------------------
# Ground truth: a draft is relevant per Sonnet if it is not flagged as a
# false positive AND scored relevance >= 3.
# (Fix: removed total_tokens_in/total_tokens_out, which were initialized but
# never updated or read anywhere in the script.)
rows = conn.execute("""
SELECT d.name, d.title, d.abstract, r.relevance, r.false_positive
FROM drafts d JOIN ratings r ON d.name = r.draft_name
WHERE d.abstract IS NOT NULL AND d.abstract != ''
ORDER BY d.name
""").fetchall()
print(f"Classifying {len(rows)} drafts with Haiku...\n")
haiku_agree = 0
haiku_fp = []  # Haiku=yes, Claude=no (Haiku false positives)
haiku_fn = []  # Haiku=no, Claude=yes (Haiku false negatives)
cached_count = 0
api_count = 0
for i, r in enumerate(rows):
    claude_relevant = not r["false_positive"] and r["relevance"] >= 3
    haiku_relevant, was_cached = haiku_classify(r["title"], r["abstract"])
    if was_cached:
        cached_count += 1
    else:
        api_count += 1
        if api_count % 20 == 0:
            time.sleep(1)  # crude rate limit: brief pause after every 20 live calls
    if haiku_relevant == claude_relevant:
        haiku_agree += 1
    elif haiku_relevant and not claude_relevant:
        haiku_fp.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})
    else:
        haiku_fn.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})
    if (i + 1) % 50 == 0:
        print(f" Processed {i+1}/{len(rows)} ({cached_count} cached, {api_count} API calls)...")
# --- Report agreement, disagreements, and a rough cost estimate -------------
total = len(rows)
# Guard the percentage against an empty ratings table (the original divided
# by len(rows) unconditionally, raising ZeroDivisionError on an empty DB).
agree_pct = 100 * haiku_agree / total if total else 0.0
print(f"\n{'='*70}")
print(f"HAIKU AGREEMENT with Claude Sonnet: {haiku_agree}/{total} ({agree_pct:.1f}%)")
print(f"API calls: {api_count}, Cached: {cached_count}")
print(f"{'='*70}")
# Disagreements, capped at 10 each so the report stays readable.
print(f"\nHaiku=RELEVANT but Sonnet=NOT ({len(haiku_fp)}):")
for d in haiku_fp[:10]:
    fp = " [FP]" if d["fp"] else ""
    print(f" rel={d['rel']}{fp} | {d['name']}: {d['title']}")
print(f"\nHaiku=IRRELEVANT but Sonnet=RELEVANT ({len(haiku_fn)}):")
for d in haiku_fn[:10]:
    print(f" rel={d['rel']} | {d['name']}: {d['title']}")
# Cost estimate: assumes ~800 input + ~50 output tokens per classification
# at Haiku pricing ($0.80/M input, $4.00/M output).
avg_tokens_per_call = 800
cost_per_draft = (avg_tokens_per_call * 0.80 + 50 * 4.0) / 1_000_000
print(f"\n{'='*70}")
print(f"Cost estimate: ~${cost_per_draft:.5f}/draft = ~${cost_per_draft * total:.3f} for {total} drafts")
print("Ollama cost: $0 (but 66.9% agreement)")  # was an f-string with no placeholders
print(f"Haiku cost: ~${cost_per_draft * total:.3f} ({agree_pct:.1f}% agreement)")
conn.close()