Release prep: - Version bump to 0.3.0 (pyproject.toml, cli.py) - Rewrite README.md with current stats (475 drafts, 713 authors, 501 ideas) - Add CONTRIBUTING.md with dev setup and code conventions Blog site: - Add scripts/build-site.py (markdown → HTML with clean CSS, dark mode, nav) - Generate static site in docs/blog/ (10 pages) - Ready for GitHub Pages deployment Academic paper (paper/main.tex): - Update all counts: 474→475 drafts, 557→710 authors, 1907→462 ideas, 11→12 gaps - Add false-positive filtering methodology (113 excluded, 361 relevant) - Add cross-org convergence analysis (132 ideas, 33% rate) - Add GDPR compliance gap to gap table - Add LLM-as-judge caveats to rating methodology and limitations - Add FIPA, IEEE P3394, W3C WoT to related work with bibliography entries - Fix safety ratio to show monthly variation (1.5:1 to 21:1) Pipeline: - Fetch 1 new draft (475 total), 3 new authors (713 total) - Fix 16 ruff lint errors across test files - All 106 tests pass Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
112 lines
4.0 KiB
Python
112 lines
4.0 KiB
Python
"""Tests for ietf_analyzer.search — sanitize_fts_query."""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
from ietf_analyzer.search import HybridSearch
|
|
|
|
|
|
class TestSanitizeFtsQuery:
    """Test FTS5 query sanitization against injection and edge cases.

    Every test calls ``HybridSearch.sanitize_fts_query`` directly on the class
    with a raw query string and inspects the string it returns.
    """

    def test_plain_query(self) -> None:
        """A query with nothing to strip comes back unchanged."""
        assert HybridSearch.sanitize_fts_query("agent protocol") == "agent protocol"

    def test_strips_quotes(self) -> None:
        """Double quotes are removed while the quoted word survives."""
        result = HybridSearch.sanitize_fts_query('"agent" OR "protocol"')
        assert '"' not in result
        assert "agent" in result

    def test_strips_parentheses(self) -> None:
        """Grouping parentheses are removed."""
        result = HybridSearch.sanitize_fts_query("(agent AND protocol)")
        assert "(" not in result
        assert ")" not in result

    def test_strips_asterisk(self) -> None:
        """The trailing ``*`` is removed while the word itself survives."""
        result = HybridSearch.sanitize_fts_query("agent*")
        assert "*" not in result
        assert "agent" in result

    def test_removes_boolean_OR(self) -> None:
        """The OR operator is dropped; both operands survive."""
        result = HybridSearch.sanitize_fts_query("agent OR protocol")
        assert "OR" not in result
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_AND(self) -> None:
        """The AND operator is dropped; both operands survive."""
        result = HybridSearch.sanitize_fts_query("agent AND protocol")
        assert "AND" not in result
        # Mirror the OR test: removing the operator must not eat the operands.
        assert "agent" in result
        assert "protocol" in result

    def test_removes_boolean_NOT(self) -> None:
        """The NOT operator is dropped; the negated word survives."""
        result = HybridSearch.sanitize_fts_query("agent NOT malicious")
        assert "NOT" not in result
        assert "malicious" in result

    def test_removes_NEAR(self) -> None:
        """The NEAR operator is dropped; both operands survive."""
        result = HybridSearch.sanitize_fts_query("agent NEAR protocol")
        assert "NEAR" not in result
        # Mirror the OR test: removing the operator must not eat the operands.
        assert "agent" in result
        assert "protocol" in result

    def test_case_insensitive_operators(self) -> None:
        """A lowercase ``or`` is treated as an operator and removed too."""
        result = HybridSearch.sanitize_fts_query("agent or protocol")
        assert " or " not in result
        # "or" as standalone word should be removed
        words = result.split()
        assert "or" not in [w.lower() for w in words]

    def test_injection_attempt_column_filter(self) -> None:
        """FTS5 column filter syntax should be stripped."""
        result = HybridSearch.sanitize_fts_query("title:agent")
        # The colon is stripped, leaving just "titleagent" or "title agent"
        assert ":" not in result

    def test_injection_attempt_special_chars(self) -> None:
        """SQL-style punctuation (``;``, ``"``, ``--``) does not survive."""
        result = HybridSearch.sanitize_fts_query('"; DROP TABLE drafts; --')
        assert ";" not in result
        assert '"' not in result
        assert "--" not in result

    def test_empty_query(self) -> None:
        """An empty query stays empty."""
        assert HybridSearch.sanitize_fts_query("") == ""

    def test_only_operators(self) -> None:
        """A query made only of operators reduces to nothing but whitespace."""
        result = HybridSearch.sanitize_fts_query("OR AND NOT")
        assert result.strip() == ""

    def test_only_special_chars(self) -> None:
        """A query made only of special characters reduces to nothing."""
        result = HybridSearch.sanitize_fts_query('"*(){}[]')
        assert result.strip() == ""

    def test_collapses_whitespace(self) -> None:
        """Runs of spaces between words collapse to a single space."""
        # The input deliberately contains multi-space runs; a single-spaced
        # input would make this test pass without exercising any collapsing.
        result = HybridSearch.sanitize_fts_query("agent   protocol    test")
        assert result == "agent protocol test"

    def test_preserves_numbers(self) -> None:
        """Digits are kept as-is."""
        result = HybridSearch.sanitize_fts_query("rfc 8259")
        assert result == "rfc 8259"

    def test_preserves_underscores(self) -> None:
        """Underscores inside a token are kept as-is."""
        result = HybridSearch.sanitize_fts_query("ai_agent_protocol")
        assert result == "ai_agent_protocol"

    def test_unicode_preserved(self) -> None:
        """Non-ASCII alphanumeric characters should be preserved."""
        result = HybridSearch.sanitize_fts_query("müller agent")
        # Both outcomes are accepted on purpose. Assuming the sanitizer is
        # regex-based (not visible from this file): for str patterns \w is
        # Unicode-aware by default, so "ü" survives; it is only dropped if the
        # pattern is compiled with re.ASCII.
        assert "müller" in result or "mller" in result

    def test_mixed_injection(self) -> None:
        """Complex injection attempt with multiple vectors."""
        result = HybridSearch.sanitize_fts_query(
            '(agent* NEAR/5 "protocol") OR title:exploit NOT safe'
        )
        # NEAR/5 becomes NEAR5 after stripping the slash, which is no longer
        # a standalone NEAR operator — it's just a harmless token.
        assert "OR" not in result.split()
        assert "NOT" not in result.split()
        assert "*" not in result
        assert '"' not in result
        assert "(" not in result
        assert ")" not in result
        assert ":" not in result
        # Core words should survive
        assert "agent" in result
        assert "protocol" in result
|