Performance: - Batch readiness computation (~200 queries → ~6 per page) - Batch draft lookup in author network (N+1 → single query) - File-based similarity matrix cache (.npy + metadata sidecar) - 5-minute TTL embedding cache for search queries CLI quality: - Add pass_cfg_db decorator, convert ~30 commands to shared config/db lifecycle - Add --dry-run to analyze, embed, embed-ideas, ideas, gaps commands - Move 15+ in-function imports to top of data.py Types & documentation: - Add 16 TypedDicts to data.py, annotate 12 function return types - Add ethics section to Post 06 (premature standardization, power asymmetry) - Add EU AI Act Article 43 conformity mapping to Post 06 - Add NIS2 and CRA references to Post 04 CI & testing: - Add GitHub Actions CI workflow (Python 3.11+3.12, ruff, pytest) - Add API documentation for all 20 endpoints (data/reports/api-docs.md) - Add 41 new tests (test_analyzer.py, test_search.py) — 64 total pass Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
113 lines
4.1 KiB
Python
113 lines
4.1 KiB
Python
"""Tests for ietf_analyzer.search — sanitize_fts_query."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from ietf_analyzer.search import HybridSearch
|
|
|
|
|
|
class TestSanitizeFtsQuery:
    """Test FTS5 query sanitization against injection and edge cases.

    Every test calls ``HybridSearch.sanitize_fts_query`` directly on the
    class (no instance is created) and inspects the returned string.
    """

    def test_plain_query(self):
        """A query made only of plain words is returned unchanged."""
        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"

    def test_strips_quotes(self):
        """Double quotes are removed while the quoted term survives."""
        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
        assert '"' not in result
        assert "agent" in result

    def test_strips_parentheses(self):
        """Grouping parentheses are removed."""
        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
        assert "(" not in result
        assert ")" not in result

    def test_strips_asterisk(self):
        """The prefix-match asterisk is removed while the term survives."""
        result = HybridSearch.sanitize_fts_query("agent*")
        assert "*" not in result
        assert "agent" in result

    def test_removes_boolean_OR(self):
        """The OR operator is dropped and both operands are kept."""
        result = HybridSearch.sanitize_fts_query("agent OR protocol")
        assert "OR" not in result
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_AND(self):
        """The AND operator is dropped and both operands are kept."""
        result = HybridSearch.sanitize_fts_query("agent AND protocol")
        assert "AND" not in result
        # Mirror the OR test: removing the operator must not eat the terms.
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_NOT(self):
        """The NOT operator is dropped and the negated term is kept."""
        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
        assert "NOT" not in result
        assert "malicious" in result

    def test_removes_NEAR(self):
        """The NEAR operator is dropped and both operands are kept."""
        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
        assert "NEAR" not in result
        # Mirror the OR test: removing the operator must not eat the terms.
        assert "agent" in result
        assert "protocol" in result

    def test_case_insensitive_operators(self):
        """Lower-case operator keywords are removed as well."""
        result = HybridSearch.sanitize_fts_query("agent or protocol")
        assert " or " not in result
        # "or" as standalone word should be removed
        words = result.split()
        assert "or" not in [w.lower() for w in words]

    def test_injection_attempt_column_filter(self):
        """FTS5 column filter syntax should be stripped."""
        result = HybridSearch.sanitize_fts_query("title:agent")
        # The colon is stripped, leaving just "titleagent" or "title agent"
        assert ":" not in result

    def test_injection_attempt_special_chars(self):
        """SQL-style punctuation (quote, semicolon, comment dashes) is removed."""
        result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --')
        assert ";" not in result
        assert '"' not in result
        assert "--" not in result

    def test_empty_query(self):
        """An empty query stays empty."""
        assert HybridSearch.sanitize_fts_query("") == ""

    def test_only_operators(self):
        """A query consisting solely of operators sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query("OR AND NOT")
        assert result.strip() == ""

    def test_only_special_chars(self):
        """A query consisting solely of special characters sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query('"*(){}[]')
        assert result.strip() == ""

    def test_collapses_whitespace(self):
        """Runs of spaces between terms collapse to a single space."""
        # The input must contain repeated spaces; a single-spaced query would
        # pass even if no collapsing happened at all.
        result = HybridSearch.sanitize_fts_query("agent   protocol    test")
        assert result == "agent protocol test"

    def test_preserves_numbers(self):
        """Digits are kept as part of the query."""
        result = HybridSearch.sanitize_fts_query("rfc 8259")
        assert result == "rfc 8259"

    def test_preserves_underscores(self):
        """Underscores inside a term are kept."""
        result = HybridSearch.sanitize_fts_query("ai_agent_protocol")
        assert result == "ai_agent_protocol"

    def test_unicode_preserved(self):
        """Non-ASCII alphanumeric characters should be preserved."""
        result = HybridSearch.sanitize_fts_query("müller agent")
        # Both outcomes are accepted. For Python 3 str patterns, \w matches
        # Unicode letters unless re.ASCII is set, in which case "ü" would be
        # dropped. NOTE(review): which one applies depends on the sanitizer's
        # pattern, which is not visible from this file.
        assert "müller" in result or "mller" in result

    def test_mixed_injection(self):
        """Complex injection attempt with multiple vectors."""
        result = HybridSearch.sanitize_fts_query(
            '(agent* NEAR/5 "protocol") OR title:exploit NOT safe'
        )
        tokens = result.split()
        # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer
        # a standalone NEAR operator — it's just a harmless token.
        assert "OR" not in tokens
        assert "NOT" not in tokens
        assert "*" not in result
        assert '"' not in result
        assert "(" not in result
        assert ")" not in result
        assert ":" not in result
        # Core words should survive
        assert "agent" in result
        assert "protocol" in result
|