Complete remaining medium/low issues: performance, CLI, types, CI, tests

Performance:
- Batch readiness computation (~200 queries → ~6 per page)
- Batch draft lookup in author network (N+1 → single query)
- File-based similarity matrix cache (.npy + metadata sidecar)
- 5-minute TTL embedding cache for search queries

CLI quality:
- Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle
- Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands
- Move 15+ in-function imports to top of data.py

Types & documentation:
- Add 16 TypedDicts to data.py, annotate 12 function return types
- Add ethics section to Post 06 (premature standardization, power asymmetry)
- Add EU AI Act Article 43 conformity mapping to Post 06
- Add NIS2 and CRA references to Post 04

CI & testing:
- Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest)
- Add API documentation for all 20 endpoints (data/reports/api-docs.md)
- Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 14:06:54 +01:00
parent e7527ad68e
commit 20c45a7eba
14 changed files with 2305 additions and 1238 deletions

112
tests/test_search.py Normal file
View File

@@ -0,0 +1,112 @@
"""Tests for ietf_analyzer.search — sanitize_fts_query."""
from __future__ import annotations
import pytest
from ietf_analyzer.search import HybridSearch
class TestSanitizeFtsQuery:
    """Test FTS5 query sanitization against injection and edge cases.

    Every test calls ``HybridSearch.sanitize_fts_query`` directly on the class
    and inspects the returned string.
    """

    def test_plain_query(self):
        """A query with no special syntax is returned unchanged."""
        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"

    def test_strips_quotes(self):
        """Double quotes are removed while the quoted word survives."""
        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
        assert '"' not in result
        assert "agent" in result

    def test_strips_parentheses(self):
        """Grouping parentheses are removed."""
        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
        assert "(" not in result
        assert ")" not in result

    def test_strips_asterisk(self):
        """The prefix-match asterisk is removed, the term itself is kept."""
        result = HybridSearch.sanitize_fts_query("agent*")
        assert "*" not in result
        assert "agent" in result

    def test_removes_boolean_OR(self):
        """The OR operator is dropped; both operands remain."""
        result = HybridSearch.sanitize_fts_query("agent OR protocol")
        assert "OR" not in result
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_AND(self):
        """The AND operator is dropped."""
        result = HybridSearch.sanitize_fts_query("agent AND protocol")
        assert "AND" not in result

    def test_removes_boolean_NOT(self):
        """The NOT operator is dropped; the negated term remains."""
        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
        assert "NOT" not in result
        assert "malicious" in result

    def test_removes_NEAR(self):
        """The NEAR operator is dropped."""
        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
        assert "NEAR" not in result

    def test_case_insensitive_operators(self):
        """Lower-case operator words are removed as well."""
        result = HybridSearch.sanitize_fts_query("agent or protocol")
        assert " or " not in result
        # "or" as standalone word should be removed
        words = result.split()
        assert "or" not in [w.lower() for w in words]

    def test_injection_attempt_column_filter(self):
        """FTS5 column filter syntax should be stripped."""
        result = HybridSearch.sanitize_fts_query("title:agent")
        # The colon is stripped, leaving just "titleagent" or "title agent"
        assert ":" not in result

    def test_injection_attempt_special_chars(self):
        """SQL-style injection punctuation does not survive sanitization."""
        result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --')
        assert ";" not in result
        assert '"' not in result
        assert "--" not in result

    def test_empty_query(self):
        """An empty query stays empty."""
        assert HybridSearch.sanitize_fts_query("") == ""

    def test_only_operators(self):
        """A query made only of operator words sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query("OR AND NOT")
        assert result.strip() == ""

    def test_only_special_chars(self):
        """A query made only of special characters sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query('"*(){}[]')
        assert result.strip() == ""

    def test_collapses_whitespace(self):
        """Runs of several spaces between words collapse to a single space."""
        # The input deliberately contains multi-space runs; a single-spaced
        # input would pass even if no collapsing happened at all.
        result = HybridSearch.sanitize_fts_query("agent   protocol    test")
        assert result == "agent protocol test"

    def test_preserves_numbers(self):
        """Digits are kept."""
        result = HybridSearch.sanitize_fts_query("rfc 8259")
        assert result == "rfc 8259"

    def test_preserves_underscores(self):
        """Underscores inside a token are kept."""
        result = HybridSearch.sanitize_fts_query("ai_agent_protocol")
        assert result == "ai_agent_protocol"

    def test_unicode_preserved(self):
        """Non-ASCII alphanumeric characters should be preserved."""
        result = HybridSearch.sanitize_fts_query("müller agent")
        # Either outcome is accepted here: the umlaut kept, or dropped.
        assert "müller" in result or "mller" in result  # depends on \w locale

    def test_mixed_injection(self):
        """Complex injection attempt with multiple vectors."""
        result = HybridSearch.sanitize_fts_query(
            '(agent* NEAR/5 "protocol") OR title:exploit NOT safe'
        )
        # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer
        # a standalone NEAR operator — it's just a harmless token.
        tokens = result.split()
        assert "OR" not in tokens
        assert "NOT" not in tokens
        assert "*" not in result
        assert '"' not in result
        assert "(" not in result
        # Closing parenthesis checked too, matching test_strips_parentheses.
        assert ")" not in result
        assert ":" not in result
        # Core words should survive
        assert "agent" in result
        assert "protocol" in result