# ietf-draft-analyzer/tests/test_db.py

"""Tests for ietf_analyzer.db.Database."""

from __future__ import annotations

import json
from datetime import datetime, timezone

import numpy as np
import pytest

from ietf_analyzer.db import Database
from ietf_analyzer.models import Author, Draft, Rating


# ---- Table creation ----


def test_ensure_tables_creates_all(tmp_db):
    """Check that sqlite_master lists every table this test expects.

    Extra tables are tolerated; only absent ones cause a failure, and the
    failure message names them.
    """
    required = {
        "drafts",
        "ratings",
        "embeddings",
        "llm_cache",
        "authors",
        "draft_authors",
        "ideas",
        "gaps",
        "draft_refs",
        "generated_drafts",
        "generation_runs",
        "sources",
        "observatory_snapshots",
        "gap_history",
        "annotations",
        "monitor_runs",
    }
    cursor = tmp_db.conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
    )
    present = set()
    for record in cursor.fetchall():
        present.add(record["name"])

    missing = required - present
    assert not missing, f"Missing tables: {missing}"


# ---- Drafts ----


def test_upsert_draft_insert(tmp_db, sample_draft):
    """A draft stored with upsert_draft can be read back with matching fields."""
    tmp_db.upsert_draft(sample_draft)

    stored = tmp_db.get_draft(sample_draft.name)
    assert stored is not None
    # Compare the same five attributes, in the same order, on both objects.
    for attr in ("name", "title", "rev", "pages", "categories"):
        assert getattr(stored, attr) == getattr(sample_draft, attr)


def test_upsert_draft_update(tmp_db, sample_draft):
    """Upserting an existing draft should update its fields.

    The draft is stored, its title and rev are changed on the same object,
    and it is upserted again. The stored row must show the new values and
    the draft count must stay at one.
    """
    tmp_db.upsert_draft(sample_draft)
    sample_draft.title = "Updated Title"
    sample_draft.rev = "03"
    tmp_db.upsert_draft(sample_draft)

    retrieved = tmp_db.get_draft(sample_draft.name)
    # Guard first so a missing row fails as a clear assertion rather than
    # an AttributeError on None.
    assert retrieved is not None
    assert retrieved.title == "Updated Title"
    assert retrieved.rev == "03"
    # Should still be only one draft
    assert tmp_db.count_drafts() == 1


def test_search_drafts_fts5(tmp_db, sample_draft):
    """Searching for terms matching the stored draft returns it as the first hit."""
    tmp_db.upsert_draft(sample_draft)

    hits = tmp_db.search_drafts("autonomous agents communicate")

    assert len(hits) >= 1
    best_match = hits[0]
    assert best_match.name == sample_draft.name


def test_search_drafts_no_results(tmp_db, sample_draft):
    """A query that matches nothing in the stored draft yields an empty list."""
    tmp_db.upsert_draft(sample_draft)
    assert tmp_db.search_drafts("quantum blockchain hyperledger") == []


def test_list_drafts_pagination(seeded_db):
    """list_drafts honours both the limit and the requested ordering."""
    everything = seeded_db.list_drafts(limit=100, order_by="name ASC")
    assert len(everything) == 5

    page = seeded_db.list_drafts(limit=2, order_by="name ASC")
    assert len(page) == 2
    # With ascending name order the first two seeded drafts are alpha, then beta.
    first, second = page[0], page[1]
    assert first.name == "draft-alpha-agent-comm"
    assert second.name == "draft-beta-ml-traffic"


def test_count_drafts(seeded_db):
    """The seeded database reports exactly five drafts."""
    total = seeded_db.count_drafts()
    assert total == 5


# ---- Ratings ----


def test_upsert_rating(tmp_db, sample_draft, sample_rating):
    """A rating stored with upsert_rating can be fetched by its draft name."""
    # The draft is stored before its rating, as in every rating test here.
    tmp_db.upsert_draft(sample_draft)
    tmp_db.upsert_rating(sample_rating)

    stored = tmp_db.get_rating(sample_rating.draft_name)

    assert stored is not None
    assert stored.novelty == 4
    assert stored.relevance == 5
    stored_categories = stored.categories
    assert "A2A protocols" in stored_categories


def test_drafts_with_ratings(seeded_db):
    """drafts_with_ratings yields five (Draft, Rating) pairs that belong together."""
    results = seeded_db.drafts_with_ratings(limit=100)
    assert len(results) == 5

    for entry in results:
        draft_obj, rating_obj = entry
        assert isinstance(draft_obj, Draft)
        assert isinstance(rating_obj, Rating)
        # Each rating must point back at the draft it is paired with.
        assert draft_obj.name == rating_obj.draft_name


def test_drafts_without_text(tmp_db):
    """Only the draft stored with full_text=None is reported as missing text."""
    # Fields shared by both drafts; they differ only in name, title and full_text.
    common = {"rev": "00", "abstract": "Abs", "time": "2025-01-01"}
    with_text = Draft(
        name="draft-has-text",
        title="Has Text",
        full_text="Some text here",
        **common,
    )
    without_text = Draft(
        name="draft-no-text",
        title="No Text",
        full_text=None,
        **common,
    )
    for draft in (with_text, without_text):
        tmp_db.upsert_draft(draft)

    reported = [entry.name for entry in tmp_db.drafts_without_text()]

    assert "draft-no-text" in reported
    assert "draft-has-text" not in reported


# ---- Ideas ----


def test_insert_ideas(seeded_db):
    """Two ideas inserted for a draft are both returned, first one first."""
    target = "draft-epsilon-discovery"
    idea_a = {"title": "New Idea A", "description": "Desc A", "type": "mechanism"}
    idea_b = {"title": "New Idea B", "description": "Desc B", "type": "protocol"}

    seeded_db.insert_ideas(target, [idea_a, idea_b])
    stored = seeded_db.get_ideas_for_draft(target)

    assert len(stored) == 2
    leading = stored[0]
    assert leading["title"] == "New Idea A"


def test_get_ideas_for_draft(seeded_db):
    """The seeded alpha draft exposes exactly its two expected idea titles."""
    found = seeded_db.get_ideas_for_draft("draft-alpha-agent-comm")
    assert len(found) == 2

    seen_titles = set()
    for idea in found:
        seen_titles.add(idea["title"])

    for wanted in ("Agent Handshake", "Capability Negotiation"):
        assert wanted in seen_titles


def test_insert_ideas_replaces_existing(seeded_db):
    """Inserting ideas for a draft that already has some leaves only the new ones."""
    draft_name = "draft-alpha-agent-comm"
    replacement = {
        "title": "Replacement Idea",
        "description": "Replaced",
        "type": "pattern",
    }

    seeded_db.insert_ideas(draft_name, [replacement])
    remaining = seeded_db.get_ideas_for_draft(draft_name)

    assert len(remaining) == 1
    only_idea = remaining[0]
    assert only_idea["title"] == "Replacement Idea"


# ---- Gaps ----


def test_insert_gaps(tmp_db):
    """Two gaps inserted into an empty database are both returned by all_gaps."""
    auth_gap = {
        "topic": "Agent Auth Gap",
        "description": "No standard auth for agents",
        "category": "Agent identity/auth",
        "severity": "critical",
        "evidence": "Only 2 drafts address this",
    }
    monitoring_gap = {
        "topic": "Monitoring Gap",
        "description": "No agent monitoring standard",
        "category": "Autonomous netops",
        "severity": "high",
        "evidence": "Zero drafts cover monitoring",
    }

    tmp_db.insert_gaps([auth_gap, monitoring_gap])

    stored = tmp_db.all_gaps()
    assert len(stored) == 2


def test_all_gaps(tmp_db):
    """A single inserted gap comes back with its topic, severity and evidence intact."""
    gap = {
        "topic": "Test Gap",
        "description": "Test description",
        "category": "Other",
        "severity": "medium",
        "evidence": "Test evidence",
    }
    tmp_db.insert_gaps([gap])

    stored = tmp_db.all_gaps()
    assert len(stored) == 1

    row = stored[0]
    assert row["topic"] == "Test Gap"
    assert row["severity"] == "medium"
    assert row["evidence"] == "Test evidence"


# ---- Embeddings ----


def test_store_embedding(tmp_db, sample_draft):
    """A float32 vector stored for a draft is returned (almost) unchanged."""
    tmp_db.upsert_draft(sample_draft)
    expected = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=np.float32)

    tmp_db.store_embedding(sample_draft.name, "test-model", expected)
    loaded = tmp_db.get_embedding(sample_draft.name)

    assert loaded is not None
    # Approximate comparison, since these are floating-point values.
    np.testing.assert_array_almost_equal(loaded, expected)


def test_all_embeddings(tmp_db, sample_draft):
    """all_embeddings maps the draft name to the vector that was stored for it."""
    tmp_db.upsert_draft(sample_draft)
    expected = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    tmp_db.store_embedding(sample_draft.name, "test-model", expected)

    by_name = tmp_db.all_embeddings()

    assert sample_draft.name in by_name
    stored_vector = by_name[sample_draft.name]
    np.testing.assert_array_almost_equal(stored_vector, expected)


# ---- LLM Cache ----


def test_cache_response(tmp_db):
    """A cached LLM response is found again under the same draft name and hash."""
    # Local names for the two lookup arguments; the real parameter names
    # of cache_response are not visible from this test.
    lookup_name, lookup_hash = "draft-test", "abc123hash"

    tmp_db.cache_response(
        lookup_name,
        lookup_hash,
        "claude-test",
        "prompt text",
        '{"result": "ok"}',
        100,
        50,
    )
    payload = tmp_db.get_cached_response(lookup_name, lookup_hash)

    assert payload is not None
    # The cached value is JSON text and must decode to the stored object.
    assert json.loads(payload) == {"result": "ok"}


def test_cache_response_miss(tmp_db):
    """Looking up a key that was never cached yields None."""
    assert tmp_db.get_cached_response("nonexistent", "badhash") is None


# ---- Refs ----


def test_insert_refs(seeded_db):
    """References already in the seeded database can be queried per draft.

    Nothing is inserted here; the test reads what the seeded_db fixture
    provides for the alpha draft.
    """
    references = seeded_db.get_refs_for_draft("draft-alpha-agent-comm")
    assert len(references) == 3

    # The first element of each reference is compared against ref-type strings.
    kinds = set()
    for ref in references:
        kinds.add(ref[0])

    for wanted in ("rfc", "draft"):
        assert wanted in kinds


def test_top_refs(seeded_db):
    """top_referenced puts the most-cited RFC of the seeded data first."""
    ranking = seeded_db.top_referenced(ref_type="rfc", limit=5)
    assert len(ranking) > 0

    # RFC 8259 is referenced by 3 drafts
    leader = ranking[0]
    assert leader[0] == "8259"
    assert leader[1] == 3


# ---- Authors ----


def test_get_authors_for_draft(seeded_db):
    """The seeded alpha draft has exactly its two expected authors."""
    found = seeded_db.get_authors_for_draft("draft-alpha-agent-comm")
    assert len(found) == 2

    author_names = set()
    for author in found:
        author_names.add(author.name)

    for wanted in ("Alice Researcher", "Bob Engineer"):
        assert wanted in author_names


def test_author_count(seeded_db):
    """The seeded database reports three authors in total."""
    unique_authors = seeded_db.author_count()
    assert unique_authors == 3


def test_top_authors(seeded_db):
    """top_authors should list the author with the most drafts first.

    Each row is unpacked as four values; the third is the draft count.
    The other three are presumably name, affiliation and draft names
    (taken from the original local names, so confirm against top_authors).
    """
    top = seeded_db.top_authors(limit=10)
    # Alice and Bob each have 2 drafts, Carol has 2 as well
    assert len(top) > 0

    # Only the count is inspected; the unpacking still enforces the row shape.
    _name, _affiliation, count, _draft_names = top[0]
    assert count >= 2
    # First author should have most drafts: no other row may exceed it.
    assert count == max(row_count for _, _, row_count, _ in top)