v0.3.0: Publication-ready release with blog site, paper update, and polish
Release prep: - Version bump to 0.3.0 (pyproject.toml, cli.py) - Rewrite README.md with current stats (475 drafts, 713 authors, 501 ideas) - Add CONTRIBUTING.md with dev setup and code conventions Blog site: - Add scripts/build-site.py (markdown → HTML with clean CSS, dark mode, nav) - Generate static site in docs/blog/ (10 pages) - Ready for GitHub Pages deployment Academic paper (paper/main.tex): - Update all counts: 474→475 drafts, 557→710 authors, 1907→462 ideas, 11→12 gaps - Add false-positive filtering methodology (113 excluded, 361 relevant) - Add cross-org convergence analysis (132 ideas, 33% rate) - Add GDPR compliance gap to gap table - Add LLM-as-judge caveats to rating methodology and limitations - Add FIPA, IEEE P3394, W3C WoT to related work with bibliography entries - Fix safety ratio to show monthly variation (1.5:1 to 21:1) Pipeline: - Fetch 1 new draft (475 total), 3 new authors (713 total) - Fix 16 ruff lint errors across test files - All 106 tests pass Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
135
scripts/compare-haiku-classifier.py
Normal file
135
scripts/compare-haiku-classifier.py
Normal file
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare Claude Haiku vs Ollama as pre-classifiers, using Claude Sonnet ratings as ground truth."""
|
||||
|
||||
import sqlite3
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
# Make the local package (src/ietf_analyzer) importable when the script is
# run from the repository root without an editable install.
sys.path.insert(0, "src")
|
||||
import anthropic
|
||||
from ietf_analyzer.config import Config
|
||||
|
||||
# Module-level shared state: project config and a SQLite connection, used by
# haiku_classify() and the comparison loop below.
cfg = Config.load()
conn = sqlite3.connect(cfg.db_path)
conn.row_factory = sqlite3.Row  # enables r["column"] access on result rows
||||
# Yes/no classification prompt for the cheap model. {title} and {abstract}
# are filled per draft; the model is instructed to answer only "yes" or "no",
# which haiku_classify() checks with startswith("yes").
HAIKU_PROMPT = """\
You are classifying IETF Internet-Drafts for an AI/agent standards tracker.

A draft is RELEVANT if it relates to ANY of these topics:
- AI agents, autonomous agents, multi-agent systems
- Agent identity, authentication, authorization, discovery
- Agent-to-agent (A2A) communication protocols
- Large language models (LLMs), generative AI
- Machine learning in network operations
- AI safety, alignment, trustworthiness
- Model Context Protocol (MCP), agentic workflows
- OAuth/JWT/credentials for agents or AI systems
- Autonomous network operations using AI
- Intelligent network management or traffic handling

A draft is NOT relevant if it only covers:
- Pure cryptography without AI/agent context
- General networking protocols (BGP, DNS, TLS) without AI
- Email, HTTP, or web standards without AI/agent features
- Remote attestation (RATS) unless specifically for AI agents
- Accessibility guidelines for user agents (browsers)

Title: {title}

Abstract: {abstract}

Is this draft relevant to AI agents or related topics? Answer ONLY "yes" or "no"."""
||||
# Anthropic API client; reads ANTHROPIC_API_KEY from the environment.
client = anthropic.Anthropic()
||||
def haiku_classify(title, abstract):
    """Classify one draft with the cheap Claude model, via the llm_cache table.

    Returns a ``(relevant, was_cached)`` pair: ``relevant`` is True when the
    model answers "yes"; ``was_cached`` is True when the answer was served
    from the cache instead of a fresh API call.
    """
    prompt = HAIKU_PROMPT.format(title=title, abstract=abstract[:2000])
    cache_key = hashlib.sha256(f"haiku-classify:{prompt}".encode()).hexdigest()

    # Cache hit: stored answers are already stripped/lowercased (see below).
    hit = conn.execute(
        "SELECT response_json FROM llm_cache WHERE prompt_hash=?", (cache_key,)
    ).fetchone()
    if hit is not None:
        return hit["response_json"].strip().lower().startswith("yes"), True

    reply = client.messages.create(
        model=cfg.claude_model_cheap,
        max_tokens=10,
        messages=[{"role": "user", "content": prompt}],
    )
    answer = reply.content[0].text.strip().lower()

    # Persist so re-runs skip the API call; request text is truncated to 500
    # chars to keep cache rows small.
    record = (
        "_classify_",
        cache_key,
        prompt[:500],
        answer,
        cfg.claude_model_cheap,
        reply.usage.input_tokens,
        reply.usage.output_tokens,
    )
    conn.execute(
        "INSERT OR REPLACE INTO llm_cache (draft_name, prompt_hash, request_json, response_json, model, input_tokens, output_tokens) VALUES (?,?,?,?,?,?,?)",
        record,
    )
    conn.commit()

    return answer.startswith("yes"), False
|
||||
# ---- Compare Haiku's yes/no against Sonnet's stored ratings ----

# Ground truth: every draft with a non-empty abstract that Sonnet has rated.
rows = conn.execute("""
    SELECT d.name, d.title, d.abstract, r.relevance, r.false_positive
    FROM drafts d JOIN ratings r ON d.name = r.draft_name
    WHERE d.abstract IS NOT NULL AND d.abstract != ''
    ORDER BY d.name
""").fetchall()

# Guard: the percentage math below divides by len(rows).
if not rows:
    print("No rated drafts with abstracts found; nothing to compare.")
    conn.close()
    sys.exit(0)

print(f"Classifying {len(rows)} drafts with Haiku...\n")

haiku_agree = 0
haiku_fp = []  # Haiku=yes, Claude=no
haiku_fn = []  # Haiku=no, Claude=yes
cached_count = 0
api_count = 0

for i, r in enumerate(rows):
    # Sonnet ground truth: relevant iff rated >= 3 and not flagged as a
    # false positive during review.
    claude_relevant = not r["false_positive"] and r["relevance"] >= 3
    haiku_relevant, was_cached = haiku_classify(r["title"], r["abstract"])

    if was_cached:
        cached_count += 1
    else:
        api_count += 1
        if api_count % 20 == 0:
            time.sleep(1)  # rate limit

    if haiku_relevant == claude_relevant:
        haiku_agree += 1
    elif haiku_relevant and not claude_relevant:
        haiku_fp.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})
    else:
        haiku_fn.append({"name": r["name"], "title": r["title"][:60], "rel": r["relevance"], "fp": r["false_positive"]})

    if (i + 1) % 50 == 0:
        print(f" Processed {i+1}/{len(rows)} ({cached_count} cached, {api_count} API calls)...")

print(f"\n{'='*70}")
print(f"HAIKU AGREEMENT with Claude Sonnet: {haiku_agree}/{len(rows)} ({100*haiku_agree/len(rows):.1f}%)")
print(f"API calls: {api_count}, Cached: {cached_count}")
print(f"{'='*70}")

# Disagreement detail (capped at 10 entries per direction).
print(f"\nHaiku=RELEVANT but Sonnet=NOT ({len(haiku_fp)}):")
for d in haiku_fp[:10]:
    fp = " [FP]" if d["fp"] else ""
    print(f" rel={d['rel']}{fp} | {d['name']}: {d['title']}")

print(f"\nHaiku=IRRELEVANT but Sonnet=RELEVANT ({len(haiku_fn)}):")
for d in haiku_fn[:10]:
    print(f" rel={d['rel']} | {d['name']}: {d['title']}")

# Cost estimate: ~800 input and ~50 output tokens per classification at
# Haiku per-MTok pricing ($0.80 in / $4.00 out).
avg_tokens_per_call = 800
cost_per_draft = (avg_tokens_per_call * 0.80 + 50 * 4.0) / 1_000_000
print(f"\n{'='*70}")
print(f"Cost estimate: ~${cost_per_draft:.5f}/draft = ~${cost_per_draft * len(rows):.3f} for {len(rows)} drafts")
print("Ollama cost: $0 (but 66.9% agreement)")
print(f"Haiku cost: ~${cost_per_draft * len(rows):.3f} ({100*haiku_agree/len(rows):.1f}% agreement)")

conn.close()
|
||||
Reference in New Issue
Block a user