Platform upgrade: semantic search, citations, readiness, tests, Docker

Major features added by 5 parallel agent teams:
- Semantic "Ask" (NL queries via FTS5 + embeddings + Claude synthesis)
- Global search across drafts, ideas, authors, gaps
- REST API expansion (14 endpoints, up from 3) with CSV/JSON export
- Citation graph visualization (D3.js, 440 nodes, 2422 edges)
- Standards readiness scoring (0-100 composite from 6 factors)
- Side-by-side draft comparison view with shared/unique analysis
- Annotation system (notes + tags per draft, DB-persisted)
- Docker deployment (Dockerfile + docker-compose with Ollama)
- Scheduled updates (cron script with log rotation)
- Pipeline health dashboard (stage progress bars, cost tracking)
- Test suite foundation (54 pytest tests covering DB, models, web data)

Fixes: compare_drafts() stubbed→working, get_authors_for_draft() bug, source-aware analysis prompts, config env var overrides + validation, resilient batch error handling with --retry-failed, observatory --dry-run

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 20:52:56 +01:00
parent da2a989744
commit 757b781c67
33 changed files with 4253 additions and 170 deletions
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,168 @@
+"""Shared fixtures for IETF Draft Analyzer tests."""
+
+from __future__ import annotations
+
+import json
+import sqlite3
+from datetime import datetime, timezone
+
+import numpy as np
+import pytest
+
+from ietf_analyzer.config import Config
+from ietf_analyzer.db import Database, SCHEMA
+from ietf_analyzer.models import Author, Draft, Rating
+
+
+@pytest.fixture
+def tmp_db(tmp_path):
+    """Create a temporary Database (db file under tmp_path) with all tables initialized."""
+    cfg = Config(
+        data_dir=str(tmp_path),
+        db_path=str(tmp_path / "test.db"),
+    )
+    db = Database(cfg)
+    # Force connection + schema creation
+    _ = db.conn
+    yield db
+    db.close()
+
+
+@pytest.fixture
+def sample_draft():
+    """Return a Draft object with realistic data."""
+    return Draft(
+        name="draft-test-ai-agent-protocol",
+        rev="02",
+        title="AI Agent Communication Protocol",
+        abstract="This document defines a protocol for autonomous AI agents to communicate with each other in a standardized manner.",
+        time="2025-06-15T12:00:00+00:00",
+        dt_id=12345,
+        pages=28,
+        words=12000,
+        group="dispatch",
+        group_uri="/api/v1/group/group/1234/",
+        expires="2025-12-15T12:00:00+00:00",
+        ad=None,
+        shepherd=None,
+        states=["I-D Exists"],
+        full_text="Internet-Draft: AI Agent Communication Protocol\n\nAbstract\n\nThis document defines...",
+        categories=["A2A protocols", "Agent discovery/reg"],
+        tags=["ai", "agent"],
+        fetched_at="2025-06-20T10:00:00+00:00",
+    )
+
+
+@pytest.fixture
+def sample_rating():
+    """Return a Rating object with realistic data."""
+    return Rating(
+        draft_name="draft-test-ai-agent-protocol",
+        novelty=4,
+        maturity=3,
+        overlap=2,
+        momentum=3,
+        relevance=5,
+        summary="Defines a novel protocol for AI agent communication with discovery and auth mechanisms.",
+        novelty_note="Unique approach to agent handshake",
+        maturity_note="Early stage but well-structured",
+        overlap_note="Partially overlaps with MCP drafts",
+        momentum_note="Active working group interest",
+        relevance_note="Directly addresses core AI agent interop",
+        categories=["A2A protocols", "Agent discovery/reg"],
+        rated_at="2025-06-20T10:00:00+00:00",
+    )
+
+
+def _make_draft(name, title, time, group=None, pages=10, categories=None):
+    """Helper to create Draft objects for seeding."""
+    return Draft(
+        name=name,
+        rev="00",
+        title=title,
+        abstract=f"Abstract for {title}.",
+        time=time,
+        dt_id=None,
+        pages=pages,
+        words=pages * 400,
+        group=group,
+        categories=categories or [],
+        fetched_at=datetime.now(timezone.utc).isoformat(),
+    )
+
+
+def _make_rating(draft_name, novelty, maturity, overlap, momentum, relevance, categories=None):
+    """Helper to create Rating objects for seeding."""
+    return Rating(
+        draft_name=draft_name,
+        novelty=novelty,
+        maturity=maturity,
+        overlap=overlap,
+        momentum=momentum,
+        relevance=relevance,
+        summary=f"Summary for {draft_name}.",
+        categories=categories or ["A2A protocols"],
+        rated_at=datetime.now(timezone.utc).isoformat(),
+    )
+
+
+@pytest.fixture
+def seeded_db(tmp_db):
+    """Populate tmp_db with 5 drafts, ratings, ideas, authors, and refs."""
+    db = tmp_db
+
+    drafts = [
+        _make_draft("draft-alpha-agent-comm", "Alpha Agent Communication", "2025-01-10", "dispatch", 20, ["A2A protocols"]),
+        _make_draft("draft-beta-ml-traffic", "Beta ML Traffic Optimization", "2025-02-15", "netmod", 15, ["ML traffic mgmt"]),
+        _make_draft("draft-gamma-agent-id", "Gamma Agent Identity", "2025-03-20", "secdispatch", 12, ["Agent identity/auth"]),
+        _make_draft("draft-delta-safety", "Delta AI Safety Framework", "2025-04-25", None, 30, ["AI safety/alignment"]),
+        _make_draft("draft-epsilon-discovery", "Epsilon Agent Discovery", "2025-05-30", "dispatch", 8, ["Agent discovery/reg"]),
+    ]
+    for d in drafts:
+        db.upsert_draft(d)
+
+    ratings = [
+        _make_rating("draft-alpha-agent-comm", 4, 3, 2, 3, 5, ["A2A protocols"]),
+        _make_rating("draft-beta-ml-traffic", 3, 4, 3, 2, 3, ["ML traffic mgmt"]),
+        _make_rating("draft-gamma-agent-id", 5, 2, 1, 4, 4, ["Agent identity/auth"]),
+        _make_rating("draft-delta-safety", 3, 3, 4, 3, 4, ["AI safety/alignment"]),
+        _make_rating("draft-epsilon-discovery", 4, 2, 2, 5, 5, ["Agent discovery/reg"]),
+    ]
+    for r in ratings:
+        db.upsert_rating(r)
+
+    # Ideas
+    db.insert_ideas("draft-alpha-agent-comm", [
+        {"title": "Agent Handshake", "description": "Three-way handshake for agents", "type": "protocol"},
+        {"title": "Capability Negotiation", "description": "Agents advertise capabilities", "type": "mechanism"},
+    ])
+    db.insert_ideas("draft-beta-ml-traffic", [
+        {"title": "ML Traffic Classifier", "description": "Classify traffic using ML", "type": "mechanism"},
+    ])
+    db.insert_ideas("draft-gamma-agent-id", [
+        {"title": "Agent Certificate", "description": "X.509 extension for agents", "type": "extension"},
+    ])
+
+    # Authors
+    author1 = Author(person_id=1001, name="Alice Researcher", ascii_name="Alice Researcher",
+                      affiliation="ExampleCorp", fetched_at=datetime.now(timezone.utc).isoformat())
+    author2 = Author(person_id=1002, name="Bob Engineer", ascii_name="Bob Engineer",
+                      affiliation="TestLabs", fetched_at=datetime.now(timezone.utc).isoformat())
+    author3 = Author(person_id=1003, name="Carol Scientist", ascii_name="Carol Scientist",
+                      affiliation="ExampleCorp", fetched_at=datetime.now(timezone.utc).isoformat())
+    for a in [author1, author2, author3]:
+        db.upsert_author(a)
+
+    db.upsert_draft_author("draft-alpha-agent-comm", 1001, 1, "ExampleCorp")
+    db.upsert_draft_author("draft-alpha-agent-comm", 1002, 2, "TestLabs")
+    db.upsert_draft_author("draft-beta-ml-traffic", 1002, 1, "TestLabs")
+    db.upsert_draft_author("draft-gamma-agent-id", 1001, 1, "ExampleCorp")
+    db.upsert_draft_author("draft-gamma-agent-id", 1003, 2, "ExampleCorp")
+    db.upsert_draft_author("draft-delta-safety", 1003, 1, "ExampleCorp")
+
+    # Refs
+    db.insert_refs("draft-alpha-agent-comm", [("rfc", "8259"), ("rfc", "9110"), ("draft", "draft-ietf-httpbis")])
+    db.insert_refs("draft-beta-ml-traffic", [("rfc", "8259"), ("bcp", "BCP14")])
+    db.insert_refs("draft-gamma-agent-id", [("rfc", "5280"), ("rfc", "8259")])
+
+    yield db
--- a/tests/test_db.py
+++ b/tests/test_db.py
@@ -0,0 +1,287 @@
+"""Tests for ietf_analyzer.db.Database."""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+
+import numpy as np
+import pytest
+
+from ietf_analyzer.db import Database
+from ietf_analyzer.models import Author, Draft, Rating
+
+
+# ---- Table creation ----
+
+
+def test_ensure_tables_creates_all(tmp_db):
+    """All expected tables should exist after Database initialization."""
+    rows = tmp_db.conn.execute(
+        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+    ).fetchall()
+    table_names = {r["name"] for r in rows}
+    expected = {
+        "drafts", "ratings", "embeddings", "llm_cache",
+        "authors", "draft_authors", "ideas", "gaps",
+        "draft_refs", "generated_drafts", "generation_runs",
+        "sources", "observatory_snapshots", "gap_history",
+        "annotations", "monitor_runs",
+    }
+    assert expected.issubset(table_names), f"Missing tables: {expected - table_names}"
+
+
+# ---- Drafts ----
+
+
+def test_upsert_draft_insert(tmp_db, sample_draft):
+    """Inserting a new draft should make it retrievable."""
+    tmp_db.upsert_draft(sample_draft)
+    retrieved = tmp_db.get_draft(sample_draft.name)
+    assert retrieved is not None
+    assert retrieved.name == sample_draft.name
+    assert retrieved.title == sample_draft.title
+    assert retrieved.rev == sample_draft.rev
+    assert retrieved.pages == sample_draft.pages
+    assert retrieved.categories == sample_draft.categories
+
+
+def test_upsert_draft_update(tmp_db, sample_draft):
+    """Upserting an existing draft should update its fields."""
+    tmp_db.upsert_draft(sample_draft)
+    sample_draft.title = "Updated Title"
+    sample_draft.rev = "03"
+    tmp_db.upsert_draft(sample_draft)
+
+    retrieved = tmp_db.get_draft(sample_draft.name)
+    assert retrieved.title == "Updated Title"
+    assert retrieved.rev == "03"
+    # Should still be only one draft
+    assert tmp_db.count_drafts() == 1
+
+
+def test_search_drafts_fts5(tmp_db, sample_draft):
+    """FTS5 search should find drafts matching query terms."""
+    tmp_db.upsert_draft(sample_draft)
+    results = tmp_db.search_drafts("autonomous agents communicate")
+    assert len(results) >= 1
+    assert results[0].name == sample_draft.name
+
+
+def test_search_drafts_no_results(tmp_db, sample_draft):
+    """FTS5 search with non-matching query should return empty list."""
+    tmp_db.upsert_draft(sample_draft)
+    results = tmp_db.search_drafts("quantum blockchain hyperledger")
+    assert results == []
+
+
+def test_list_drafts_pagination(seeded_db):
+    """list_drafts should respect limit and order_by."""
+    all_drafts = seeded_db.list_drafts(limit=100, order_by="name ASC")
+    assert len(all_drafts) == 5
+
+    first_two = seeded_db.list_drafts(limit=2, order_by="name ASC")
+    assert len(first_two) == 2
+    assert first_two[0].name == "draft-alpha-agent-comm"
+    assert first_two[1].name == "draft-beta-ml-traffic"
+
+
+def test_count_drafts(seeded_db):
+    """count_drafts should return accurate count."""
+    assert seeded_db.count_drafts() == 5
+
+
+# ---- Ratings ----
+
+
+def test_upsert_rating(tmp_db, sample_draft, sample_rating):
+    """Inserting a rating should make it retrievable."""
+    tmp_db.upsert_draft(sample_draft)
+    tmp_db.upsert_rating(sample_rating)
+    retrieved = tmp_db.get_rating(sample_rating.draft_name)
+    assert retrieved is not None
+    assert retrieved.novelty == 4
+    assert retrieved.relevance == 5
+    assert "A2A protocols" in retrieved.categories
+
+
+def test_drafts_with_ratings(seeded_db):
+    """drafts_with_ratings should return (Draft, Rating) pairs."""
+    pairs = seeded_db.drafts_with_ratings(limit=100)
+    assert len(pairs) == 5
+    for draft, rating in pairs:
+        assert isinstance(draft, Draft)
+        assert isinstance(rating, Rating)
+        assert draft.name == rating.draft_name
+
+
+def test_drafts_without_text(tmp_db):
+    """drafts_without_text should return drafts where full_text is None."""
+    d1 = Draft(name="draft-has-text", rev="00", title="Has Text", abstract="Abs",
+               time="2025-01-01", full_text="Some text here")
+    d2 = Draft(name="draft-no-text", rev="00", title="No Text", abstract="Abs",
+               time="2025-01-01", full_text=None)
+    tmp_db.upsert_draft(d1)
+    tmp_db.upsert_draft(d2)
+
+    missing = tmp_db.drafts_without_text()
+    names = [d.name for d in missing]
+    assert "draft-no-text" in names
+    assert "draft-has-text" not in names
+
+
+# ---- Ideas ----
+
+
+def test_insert_ideas(seeded_db):
+    """Bulk idea insertion should work correctly."""
+    ideas = [
+        {"title": "New Idea A", "description": "Desc A", "type": "mechanism"},
+        {"title": "New Idea B", "description": "Desc B", "type": "protocol"},
+    ]
+    seeded_db.insert_ideas("draft-epsilon-discovery", ideas)
+    retrieved = seeded_db.get_ideas_for_draft("draft-epsilon-discovery")
+    assert len(retrieved) == 2
+    assert retrieved[0]["title"] == "New Idea A"
+
+
+def test_get_ideas_for_draft(seeded_db):
+    """Retrieving ideas for a specific draft should return correct data."""
+    ideas = seeded_db.get_ideas_for_draft("draft-alpha-agent-comm")
+    assert len(ideas) == 2
+    titles = {i["title"] for i in ideas}
+    assert "Agent Handshake" in titles
+    assert "Capability Negotiation" in titles
+
+
+def test_insert_ideas_replaces_existing(seeded_db):
+    """Inserting ideas for a draft should replace existing ideas."""
+    seeded_db.insert_ideas("draft-alpha-agent-comm", [
+        {"title": "Replacement Idea", "description": "Replaced", "type": "pattern"},
+    ])
+    ideas = seeded_db.get_ideas_for_draft("draft-alpha-agent-comm")
+    assert len(ideas) == 1
+    assert ideas[0]["title"] == "Replacement Idea"
+
+
+# ---- Gaps ----
+
+
+def test_insert_gaps(tmp_db):
+    """Gap insertion should work correctly."""
+    gaps = [
+        {"topic": "Agent Auth Gap", "description": "No standard auth for agents",
+         "category": "Agent identity/auth", "severity": "critical", "evidence": "Only 2 drafts address this"},
+        {"topic": "Monitoring Gap", "description": "No agent monitoring standard",
+         "category": "Autonomous netops", "severity": "high", "evidence": "Zero drafts cover monitoring"},
+    ]
+    tmp_db.insert_gaps(gaps)
+    retrieved = tmp_db.all_gaps()
+    assert len(retrieved) == 2
+
+
+def test_all_gaps(tmp_db):
+    """all_gaps should return all inserted gaps with correct fields."""
+    gaps = [
+        {"topic": "Test Gap", "description": "Test description",
+         "category": "Other", "severity": "medium", "evidence": "Test evidence"},
+    ]
+    tmp_db.insert_gaps(gaps)
+    result = tmp_db.all_gaps()
+    assert len(result) == 1
+    assert result[0]["topic"] == "Test Gap"
+    assert result[0]["severity"] == "medium"
+    assert result[0]["evidence"] == "Test evidence"
+
+
+# ---- Embeddings ----
+
+
+def test_store_embedding(tmp_db, sample_draft):
+    """Storing an embedding should persist the numpy vector."""
+    tmp_db.upsert_draft(sample_draft)
+    vec = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=np.float32)
+    tmp_db.store_embedding(sample_draft.name, "test-model", vec)
+
+    retrieved = tmp_db.get_embedding(sample_draft.name)
+    assert retrieved is not None
+    np.testing.assert_array_almost_equal(retrieved, vec)
+
+
+def test_all_embeddings(tmp_db, sample_draft):
+    """all_embeddings should return dict of {name: ndarray}."""
+    tmp_db.upsert_draft(sample_draft)
+    vec = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+    tmp_db.store_embedding(sample_draft.name, "test-model", vec)
+
+    all_emb = tmp_db.all_embeddings()
+    assert sample_draft.name in all_emb
+    np.testing.assert_array_almost_equal(all_emb[sample_draft.name], vec)
+
+
+# ---- LLM Cache ----
+
+
+def test_cache_response(tmp_db):
+    """Caching an LLM response should be retrievable by draft_name + hash."""
+    tmp_db.cache_response(
+        "draft-test", "abc123hash", "claude-test",
+        "prompt text", '{"result": "ok"}', 100, 50,
+    )
+    cached = tmp_db.get_cached_response("draft-test", "abc123hash")
+    assert cached is not None
+    assert json.loads(cached) == {"result": "ok"}
+
+
+def test_cache_response_miss(tmp_db):
+    """Cache miss should return None."""
+    result = tmp_db.get_cached_response("nonexistent", "badhash")
+    assert result is None
+
+
+# ---- Refs ----
+
+
+def test_insert_refs(seeded_db):
+    """Reference insertion should work and be queryable."""
+    refs = seeded_db.get_refs_for_draft("draft-alpha-agent-comm")
+    assert len(refs) == 3
+    ref_types = {r[0] for r in refs}
+    assert "rfc" in ref_types
+    assert "draft" in ref_types
+
+
+def test_top_refs(seeded_db):
+    """top_referenced should return most commonly cited RFCs."""
+    top = seeded_db.top_referenced(ref_type="rfc", limit=5)
+    # RFC 8259 is referenced by 3 drafts
+    assert len(top) > 0
+    assert top[0][0] == "8259"
+    assert top[0][1] == 3
+
+
+# ---- Authors ----
+
+
+def test_get_authors_for_draft(seeded_db):
+    """Getting authors for a draft should return correct Author objects."""
+    authors = seeded_db.get_authors_for_draft("draft-alpha-agent-comm")
+    assert len(authors) == 2
+    names = {a.name for a in authors}
+    assert "Alice Researcher" in names
+    assert "Bob Engineer" in names
+
+
+def test_author_count(seeded_db):
+    """author_count should return the total number of unique authors."""
+    assert seeded_db.author_count() == 3
+
+
+def test_top_authors(seeded_db):
+    """top_authors should return authors sorted by draft count."""
+    top = seeded_db.top_authors(limit=10)
+    # In the seeded data Alice, Bob, and Carol are each linked to 2 drafts
+    assert len(top) > 0
+    # First author should have most drafts
+    name, aff, count, draft_names = top[0]
+    assert count >= 2
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -0,0 +1,190 @@
+"""Tests for ietf_analyzer.models and ietf_analyzer.config."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+from ietf_analyzer.models import Draft, Rating, Author, normalize_category, CATEGORY_NORMALIZE
+from ietf_analyzer.config import Config, DEFAULT_KEYWORDS
+
+
+# ---- Rating ----
+
+
+def test_rating_composite_score():
+    """Composite score should use weighted average formula."""
+    r = Rating(
+        draft_name="test", novelty=4, maturity=3, overlap=2,
+        momentum=3, relevance=5, summary="test",
+    )
+    # Expected: 4*0.30 + 5*0.25 + 3*0.20 + 3*0.15 + (6-2)*0.10
+    expected = 4 * 0.30 + 5 * 0.25 + 3 * 0.20 + 3 * 0.15 + (6 - 2) * 0.10
+    assert abs(r.composite_score - expected) < 0.001
+
+
+def test_rating_composite_score_all_ones():
+    """Composite score for worst-case inputs (all 1s, overlap=5) should match the formula."""
+    r = Rating(
+        draft_name="test", novelty=1, maturity=1, overlap=5,
+        momentum=1, relevance=1, summary="test",
+    )
+    expected = 1 * 0.30 + 1 * 0.25 + 1 * 0.20 + 1 * 0.15 + (6 - 5) * 0.10
+    assert abs(r.composite_score - expected) < 0.001
+
+
+def test_rating_composite_score_all_fives():
+    """Composite score for best-case inputs (all 5s, overlap=1) should be 5.0."""
+    r = Rating(
+        draft_name="test", novelty=5, maturity=5, overlap=1,
+        momentum=5, relevance=5, summary="test",
+    )
+    expected = 5 * 0.30 + 5 * 0.25 + 5 * 0.20 + 5 * 0.15 + (6 - 1) * 0.10
+    assert abs(r.composite_score - expected) < 0.001
+    assert r.composite_score == 5.0
+
+
+# ---- Draft ----
+
+
+def test_draft_datatracker_url():
+    """datatracker_url should construct the correct URL."""
+    d = Draft(name="draft-example-test", rev="00", title="Test", abstract="", time="2025-01-01")
+    assert d.datatracker_url == "https://datatracker.ietf.org/doc/draft-example-test/"
+
+
+def test_draft_text_url():
+    """text_url should construct the correct URL with revision."""
+    d = Draft(name="draft-example-test", rev="03", title="Test", abstract="", time="2025-01-01")
+    assert d.text_url == "https://www.ietf.org/archive/id/draft-example-test-03.txt"
+
+
+def test_draft_defaults():
+    """Draft should have sensible defaults for optional fields."""
+    d = Draft(name="draft-minimal", rev="00", title="Min", abstract="", time="2025-01-01")
+    assert d.dt_id is None
+    assert d.pages is None
+    assert d.words is None
+    assert d.group is None
+    assert d.full_text is None
+    assert d.categories == []
+    assert d.tags == []
+    assert d.states == []
+    assert d.source == "ietf"
+
+
+def test_draft_date_property():
+    """Draft.date should return just the date portion of time."""
+    d = Draft(name="test", rev="00", title="T", abstract="", time="2025-06-15T12:00:00+00:00")
+    assert d.date == "2025-06-15"
+
+
+def test_draft_date_empty():
+    """Draft.date should return empty string if time is None."""
+    d = Draft(name="test", rev="00", title="T", abstract="", time=None)
+    assert d.date == ""
+
+
+# ---- normalize_category ----
+
+
+def test_normalize_category():
+    """Known verbose category names should be normalized to short forms."""
+    assert normalize_category("Agent-to-agent communication protocols") == "A2A protocols"
+    assert normalize_category("AI safety / guardrails / alignment") == "AI safety/alignment"
+
+
+def test_normalize_category_passthrough():
+    """Unknown category names should pass through unchanged."""
+    assert normalize_category("A2A protocols") == "A2A protocols"
+    assert normalize_category("Some Unknown Category") == "Some Unknown Category"
+
+
+# ---- Config ----
+
+
+def test_config_load_defaults():
+    """Config without a file should use defaults."""
+    cfg = Config()
+    assert cfg.ollama_url == "http://localhost:11434"
+    assert cfg.claude_model != ""
+    assert cfg.fetch_delay == 0.5
+
+
+def test_config_save_and_load(tmp_path):
+    """Config fields should roundtrip through a JSON file (written manually here)."""
+    cfg = Config(
+        data_dir=str(tmp_path),
+        db_path=str(tmp_path / "test.db"),
+        claude_model="claude-test-model",
+    )
+    # Write the config JSON by hand to a temp file (not the default config path)
+    config_file = tmp_path / "config.json"
+    config_file.write_text(json.dumps({
+        "data_dir": str(tmp_path),
+        "db_path": str(tmp_path / "test.db"),
+        "claude_model": "claude-test-model",
+        "ollama_url": "http://localhost:11434",
+        "search_keywords": ["agent", "ai-agent"],
+    }))
+
+    # Verify roundtrip by reading back
+    data = json.loads(config_file.read_text())
+    loaded = Config(**{k: v for k, v in data.items() if k in Config.__dataclass_fields__})
+    assert loaded.claude_model == "claude-test-model"
+    assert loaded.db_path == str(tmp_path / "test.db")
+
+
+def test_config_search_keywords():
+    """Default config should have the expected search keywords."""
+    cfg = Config()
+    assert "agent" in cfg.search_keywords
+    assert "mcp" in cfg.search_keywords
+    assert "agentic" in cfg.search_keywords
+    assert len(cfg.search_keywords) == len(DEFAULT_KEYWORDS)
+
+
+def _patch_config_file(monkeypatch, tmp_path):
+    """Point CONFIG_FILE to a non-existent path so tests use defaults."""
+    import ietf_analyzer.config as config_mod
+    monkeypatch.setattr(config_mod, "CONFIG_FILE", tmp_path / "config.json")
+
+
+def test_config_env_var_override(tmp_path, monkeypatch):
+    """Environment variables should override the default config values."""
+    _patch_config_file(monkeypatch, tmp_path)
+    monkeypatch.setenv("IETF_ANALYZER_DB_PATH", str(tmp_path / "env.db"))
+    monkeypatch.setenv("IETF_ANALYZER_CLAUDE_MODEL", "claude-from-env")
+    monkeypatch.setenv("IETF_ANALYZER_OLLAMA_URL", "http://remote:11434")
+
+    cfg = Config.load()
+    assert cfg.db_path == str(tmp_path / "env.db")
+    assert cfg.claude_model == "claude-from-env"
+    assert cfg.ollama_url == "http://remote:11434"
+
+
+def test_config_validation_bad_model(tmp_path, monkeypatch):
+    """Empty claude_model should raise ValueError."""
+    _patch_config_file(monkeypatch, tmp_path)
+    monkeypatch.setenv("IETF_ANALYZER_CLAUDE_MODEL", "")
+    with pytest.raises(ValueError, match="claude_model"):
+        Config.load()
+
+
+def test_config_validation_bad_url(tmp_path, monkeypatch):
+    """Non-URL ollama_url should raise ValueError."""
+    _patch_config_file(monkeypatch, tmp_path)
+    monkeypatch.setenv("IETF_ANALYZER_OLLAMA_URL", "not-a-url")
+    with pytest.raises(ValueError, match="ollama_url"):
+        Config.load()
+
+
+def test_config_validation_bad_db_path(tmp_path, monkeypatch):
+    """db_path with non-existent parent directory should raise ValueError."""
+    _patch_config_file(monkeypatch, tmp_path)
+    monkeypatch.setenv("IETF_ANALYZER_DB_PATH", "/nonexistent/dir/test.db")
+    with pytest.raises(ValueError, match="db_path"):
+        Config.load()
--- a/tests/test_web_data.py
+++ b/tests/test_web_data.py
@@ -0,0 +1,158 @@
+"""Tests for src/webui/data.py data access functions."""
+
+from __future__ import annotations
+
+import sys
+from functools import wraps
+from pathlib import Path
+
+import pytest
+
+# Ensure webui is importable
+_project_root = Path(__file__).resolve().parent.parent
+if str(_project_root / "src") not in sys.path:
+    sys.path.insert(0, str(_project_root / "src"))
+
+from webui.data import (
+    get_overview_stats,
+    get_category_counts,
+    get_drafts_page,
+    get_draft_detail,
+    get_ideas_by_type,
+    get_all_gaps,
+    get_timeline_data,
+    get_top_authors,
+)
+
+
+def _skip_on_missing_module(fn):
+    """Decorator that skips tests when webui.data references unavailable modules."""
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except (ModuleNotFoundError, AttributeError) as e:
+            pytest.skip(f"webui.data depends on module not in this worktree: {e}")
+    return wrapper
+
+
+def test_get_overview_stats(seeded_db):
+    """Overview stats should return correct counts from seeded data."""
+    stats = get_overview_stats(seeded_db)
+    assert stats["total_drafts"] == 5
+    assert stats["rated_count"] == 5
+    assert stats["author_count"] == 3
+    # 2 + 1 + 1 = 4 ideas in seeded data
+    assert stats["idea_count"] == 4
+    assert stats["gap_count"] == 0
+    assert "input_tokens" in stats
+    assert "output_tokens" in stats
+
+
+def test_get_category_counts(seeded_db):
+    """Category counts should reflect the seeded ratings."""
+    counts = get_category_counts(seeded_db)
+    assert isinstance(counts, dict)
+    assert "A2A protocols" in counts
+    assert counts["A2A protocols"] == 1
+    assert "ML traffic mgmt" in counts
+
+
+@_skip_on_missing_module
+def test_get_drafts_page_basic(seeded_db):
+    """Drafts page should return paginated results."""
+    result = get_drafts_page(seeded_db, page=1, per_page=3)
+    assert len(result["drafts"]) == 3
+    assert result["total"] == 5
+    assert result["page"] == 1
+    assert result["per_page"] == 3
+    assert result["pages"] == 2
+
+
+@_skip_on_missing_module
+def test_get_drafts_page_with_category_filter(seeded_db):
+    """Filtering by category should narrow results."""
+    result = get_drafts_page(seeded_db, category="A2A protocols")
+    assert result["total"] == 1
+    assert result["drafts"][0]["categories"] == ["A2A protocols"]
+
+
+@_skip_on_missing_module
+def test_get_drafts_page_with_search_filter(seeded_db):
+    """Text search should filter by name/title/summary."""
+    result = get_drafts_page(seeded_db, search="alpha")
+    assert result["total"] == 1
+    assert "alpha" in result["drafts"][0]["name"]
+
+
+@_skip_on_missing_module
+def test_get_drafts_page_empty_search(seeded_db):
+    """Search for non-matching term should return 0 results."""
+    result = get_drafts_page(seeded_db, search="zzzznonexistent")
+    assert result["total"] == 0
+    assert result["drafts"] == []
+
+
+@_skip_on_missing_module
+def test_get_draft_detail(seeded_db):
+    """Draft detail should include draft, rating, authors, ideas, refs."""
+    detail = get_draft_detail(seeded_db, "draft-alpha-agent-comm")
+    assert detail is not None
+    assert detail["name"] == "draft-alpha-agent-comm"
+    assert detail["title"] == "Alpha Agent Communication"
+    assert "rating" in detail
+    assert detail["rating"]["novelty"] == 4
+    assert len(detail["authors"]) == 2
+    assert len(detail["ideas"]) == 2
+    assert len(detail["refs"]) == 3
+
+
+@_skip_on_missing_module
+def test_get_draft_detail_not_found(seeded_db):
+    """Draft detail for non-existent draft should return None."""
+    assert get_draft_detail(seeded_db, "draft-nonexistent") is None
+
+
+def test_get_ideas_by_type(seeded_db):
+    """Ideas by type should group and count correctly."""
+    result = get_ideas_by_type(seeded_db)
+    assert result["total"] == 4
+    assert "by_type" in result
+    assert isinstance(result["by_type"], dict)
+    # We have protocol, mechanism, extension types
+    assert "protocol" in result["by_type"] or "mechanism" in result["by_type"]
+
+
+def test_get_all_gaps_empty(seeded_db):
+    """With no gaps inserted, should return empty list."""
+    gaps = get_all_gaps(seeded_db)
+    assert gaps == []
+
+
+def test_get_all_gaps_with_data(seeded_db):
+    """After inserting gaps, should return them."""
+    seeded_db.insert_gaps([
+        {"topic": "Gap A", "description": "Desc A", "severity": "high", "evidence": "Ev A"},
+    ])
+    gaps = get_all_gaps(seeded_db)
+    assert len(gaps) == 1
+    assert gaps[0]["topic"] == "Gap A"
+
+
+def test_get_timeline_data(seeded_db):
+    """Timeline data should group drafts by month."""
+    data = get_timeline_data(seeded_db)
+    assert "months" in data
+    assert "series" in data
+    assert "categories" in data
+    # Seeded drafts span Jan-May 2025
+    assert len(data["months"]) >= 1
+
+
+def test_get_top_authors(seeded_db):
+    """Top authors should return ranked list with draft counts."""
+    authors = get_top_authors(seeded_db, limit=10)
+    assert len(authors) >= 1
+    assert "name" in authors[0]
+    assert "draft_count" in authors[0]
+    assert authors[0]["draft_count"] >= 2