"""Tests for ietf_analyzer.search — sanitize_fts_query."""

from __future__ import annotations

from ietf_analyzer.search import HybridSearch


class TestSanitizeFtsQuery:
    """Test FTS5 query sanitization against injection and edge cases.

    Every test calls ``HybridSearch.sanitize_fts_query`` directly on the
    class (no instance is created) and inspects the returned string.
    """

    # -- plain input ------------------------------------------------------

    def test_plain_query(self):
        """A query made only of ordinary words is returned unchanged."""
        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"

    # -- FTS5 syntax characters -------------------------------------------

    def test_strips_quotes(self):
        """Double quotes are removed while the quoted word survives."""
        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
        assert '"' not in result
        assert "agent" in result

    def test_strips_parentheses(self):
        """Grouping parentheses are removed."""
        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
        assert "(" not in result
        assert ")" not in result

    def test_strips_asterisk(self):
        """The prefix-match asterisk is removed but the word is kept."""
        result = HybridSearch.sanitize_fts_query("agent*")
        assert "*" not in result
        assert "agent" in result

    # -- boolean / proximity operators ------------------------------------

    def test_removes_boolean_OR(self):
        """``OR`` is dropped and both operands remain."""
        result = HybridSearch.sanitize_fts_query("agent OR protocol")
        assert "OR" not in result
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_AND(self):
        """``AND`` is dropped."""
        result = HybridSearch.sanitize_fts_query("agent AND protocol")
        assert "AND" not in result

    def test_removes_boolean_NOT(self):
        """``NOT`` is dropped and the word after it remains."""
        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
        assert "NOT" not in result
        assert "malicious" in result

    def test_removes_NEAR(self):
        """``NEAR`` is dropped."""
        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
        assert "NEAR" not in result

    def test_case_insensitive_operators(self):
        """Lower-case operator words are removed as well."""
        result = HybridSearch.sanitize_fts_query("agent or protocol")
        assert " or " not in result
        # "or" as standalone word should be removed
        words = result.split()
        assert "or" not in [w.lower() for w in words]

    # -- injection attempts -----------------------------------------------

    def test_injection_attempt_column_filter(self):
        """FTS5 column filter syntax should be stripped."""
        result = HybridSearch.sanitize_fts_query("title:agent")
        # The colon is stripped, leaving just "titleagent" or "title agent"
        assert ":" not in result

    def test_injection_attempt_special_chars(self):
        """SQL-style punctuation (``;``, ``"``, ``--``) does not survive."""
        result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --')
        assert ";" not in result
        assert '"' not in result
        assert "--" not in result

    # -- degenerate input -------------------------------------------------

    def test_empty_query(self):
        """An empty query yields an empty string."""
        assert HybridSearch.sanitize_fts_query("") == ""

    def test_only_operators(self):
        """A query consisting solely of operators reduces to nothing."""
        result = HybridSearch.sanitize_fts_query("OR AND NOT")
        assert result.strip() == ""

    def test_only_special_chars(self):
        """A query consisting solely of punctuation reduces to nothing."""
        result = HybridSearch.sanitize_fts_query('"*(){}[]')
        assert result.strip() == ""

    # -- normalisation and preserved characters ---------------------------

    def test_collapses_whitespace(self):
        """Runs of spaces between words collapse to a single space."""
        # The input must contain redundant whitespace, otherwise this test
        # would only repeat test_plain_query.
        result = HybridSearch.sanitize_fts_query("agent   protocol    test")
        assert result == "agent protocol test"

    def test_preserves_numbers(self):
        """Digits are kept."""
        result = HybridSearch.sanitize_fts_query("rfc 8259")
        assert result == "rfc 8259"

    def test_preserves_underscores(self):
        """Underscores inside a token are kept."""
        result = HybridSearch.sanitize_fts_query("ai_agent_protocol")
        assert result == "ai_agent_protocol"

    def test_unicode_preserved(self):
        """Non-ASCII alphanumeric characters should be preserved."""
        result = HybridSearch.sanitize_fts_query("müller agent")
        # Both outcomes are accepted: whether "ü" survives depends on whether
        # the sanitizer's word-character matching is Unicode-aware.
        # NOTE(review): confirm against the implementation and tighten this
        # to the single expected form.
        assert "müller" in result or "mller" in result

    # -- combined vectors -------------------------------------------------

    def test_mixed_injection(self):
        """Complex injection attempt with multiple vectors."""
        result = HybridSearch.sanitize_fts_query(
            '(agent* NEAR/5 "protocol") OR title:exploit NOT safe'
        )
        # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer
        # a standalone NEAR operator — it's just a harmless token.
        tokens = result.split()
        assert "OR" not in tokens
        assert "NOT" not in tokens
        assert "*" not in result
        assert '"' not in result
        assert "(" not in result
        assert ")" not in result
        assert ":" not in result
        # Core words should survive
        assert "agent" in result
        assert "protocol" in result