Files
ietf-draft-analyzer/tests/test_search.py
Christian Nennemann 1ec1f69bee v0.3.0: Publication-ready release with blog site, paper update, and polish
Release prep:
- Version bump to 0.3.0 (pyproject.toml, cli.py)
- Rewrite README.md with current stats (475 drafts, 713 authors, 501 ideas)
- Add CONTRIBUTING.md with dev setup and code conventions

Blog site:
- Add scripts/build-site.py (markdown → HTML with clean CSS, dark mode, nav)
- Generate static site in docs/blog/ (10 pages)
- Ready for GitHub Pages deployment

Academic paper (paper/main.tex):
- Update all counts: 474→475 drafts, 557→710 authors, 1907→462 ideas, 11→12 gaps
- Add false-positive filtering methodology (113 excluded, 361 relevant)
- Add cross-org convergence analysis (132 ideas, 33% rate)
- Add GDPR compliance gap to gap table
- Add LLM-as-judge caveats to rating methodology and limitations
- Add FIPA, IEEE P3394, W3C WoT to related work with bibliography entries
- Fix safety ratio to show monthly variation (1.5:1 to 21:1)

Pipeline:
- Fetch 1 new draft (475 total), 3 new authors (713 total)
- Fix 16 ruff lint errors across test files
- All 106 tests pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 17:54:43 +01:00

112 lines
4.0 KiB
Python

"""Tests for ietf_analyzer.search — sanitize_fts_query."""
from __future__ import annotations
from ietf_analyzer.search import HybridSearch
class TestSanitizeFtsQuery:
    """Test FTS5 query sanitization against injection and edge cases.

    Every test calls ``HybridSearch.sanitize_fts_query`` directly on the
    class (no instance is constructed) and inspects the returned string.
    """

    def test_plain_query(self):
        """A query made only of plain words is returned unchanged."""
        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"

    def test_strips_quotes(self):
        """Double quotes are removed while the quoted word survives."""
        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
        assert '"' not in result
        assert "agent" in result

    def test_strips_parentheses(self):
        """Grouping parentheses are removed."""
        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
        assert "(" not in result
        assert ")" not in result

    def test_strips_asterisk(self):
        """The prefix-match asterisk is removed; the word itself is kept."""
        result = HybridSearch.sanitize_fts_query("agent*")
        assert "*" not in result
        assert "agent" in result

    def test_removes_boolean_OR(self):
        """The OR operator is dropped and both operands are kept."""
        result = HybridSearch.sanitize_fts_query("agent OR protocol")
        assert "OR" not in result
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_AND(self):
        """The AND operator is dropped."""
        result = HybridSearch.sanitize_fts_query("agent AND protocol")
        assert "AND" not in result

    def test_removes_boolean_NOT(self):
        """The NOT operator is dropped; the word after it is kept."""
        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
        assert "NOT" not in result
        assert "malicious" in result

    def test_removes_NEAR(self):
        """The NEAR operator is dropped."""
        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
        assert "NEAR" not in result

    def test_case_insensitive_operators(self):
        """Lower-case spellings of operators are removed as well."""
        result = HybridSearch.sanitize_fts_query("agent or protocol")
        assert " or " not in result
        # "or" as standalone word should be removed
        words = result.split()
        assert "or" not in [w.lower() for w in words]

    def test_injection_attempt_column_filter(self):
        """FTS5 column filter syntax should be stripped."""
        result = HybridSearch.sanitize_fts_query("title:agent")
        # The colon is stripped, leaving just "titleagent" or "title agent"
        assert ":" not in result

    def test_injection_attempt_special_chars(self):
        """SQL-style punctuation (quote, semicolon, comment dashes) is stripped."""
        result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --')
        assert ";" not in result
        assert '"' not in result
        assert "--" not in result

    def test_empty_query(self):
        """An empty query yields an empty string."""
        assert HybridSearch.sanitize_fts_query("") == ""

    def test_only_operators(self):
        """A query consisting solely of operators sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query("OR AND NOT")
        assert result.strip() == ""

    def test_only_special_chars(self):
        """A query consisting solely of punctuation sanitizes to nothing."""
        result = HybridSearch.sanitize_fts_query('"*(){}[]')
        assert result.strip() == ""

    def test_collapses_whitespace(self):
        """Runs of several spaces between words collapse to a single space."""
        # The input must contain redundant spaces; a single-spaced input
        # would pass even if no collapsing happened at all.
        result = HybridSearch.sanitize_fts_query("agent   protocol    test")
        assert result == "agent protocol test"

    def test_preserves_numbers(self):
        """Digits are kept as ordinary search tokens."""
        result = HybridSearch.sanitize_fts_query("rfc 8259")
        assert result == "rfc 8259"

    def test_preserves_underscores(self):
        """Underscores inside an identifier-like token are kept."""
        result = HybridSearch.sanitize_fts_query("ai_agent_protocol")
        assert result == "ai_agent_protocol"

    def test_unicode_preserved(self):
        """Non-ASCII alphanumeric characters should be preserved."""
        result = HybridSearch.sanitize_fts_query("müller agent")
        # Both outcomes are accepted: the umlaut is either kept or dropped,
        # depending on whether the sanitizer's \w treats it as a word char.
        assert "müller" in result or "mller" in result

    def test_mixed_injection(self):
        """Complex injection attempt with multiple vectors."""
        result = HybridSearch.sanitize_fts_query(
            '(agent* NEAR/5 "protocol") OR title:exploit NOT safe'
        )
        # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer
        # a standalone NEAR operator — it's just a harmless token.
        tokens = result.split()
        assert "OR" not in tokens
        assert "NOT" not in tokens
        assert "*" not in result
        assert '"' not in result
        assert "(" not in result
        assert ":" not in result
        # Core words should survive
        assert "agent" in result
        assert "protocol" in result