Fix security, data integrity, and accuracy issues from 4-perspective review

Security fixes:
- Fix SQL injection in db.py:update_generation_run (column name whitelist)
- Flask SECRET_KEY from env var instead of hardcoded
- Add LLM rating bounds validation (_clamp_rating, 1-10)
- Fix JSON extraction trailing whitespace handling
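A minimal sketch of the column-whitelist pattern behind the SQL injection fix (the `ALLOWED_COLUMNS` set and `generation_runs` schema here are illustrative assumptions, not the actual db.py code):

```python
import sqlite3

# Hypothetical whitelist mirroring the fix in db.py:update_generation_run.
# Column names cannot be bound as SQL parameters, so they must be validated
# against a fixed set before being interpolated into the statement.
ALLOWED_COLUMNS = {"status", "finished_at", "error"}

def update_generation_run(conn: sqlite3.Connection, run_id: int, **fields) -> None:
    bad = set(fields) - ALLOWED_COLUMNS
    if bad:
        raise ValueError(f"disallowed column(s): {sorted(bad)}")
    # Safe to interpolate: every name passed the whitelist check above.
    assignments = ", ".join(f"{col} = ?" for col in fields)
    conn.execute(
        f"UPDATE generation_runs SET {assignments} WHERE id = ?",
        (*fields.values(), run_id),
    )

# The SECRET_KEY fix follows the usual Flask pattern, e.g.:
# app.config["SECRET_KEY"] = os.environ["FLASK_SECRET_KEY"]
```

Values still go through `?` placeholders; only the column names, now whitelisted, are formatted into the statement.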

Data integrity:
- Normalize 21 legacy category names to 11 canonical short forms
- Add false_positive column, flag 73 non-AI drafts (361 relevant remain)
- Document verified counts: 434 total/361 relevant drafts, 557 authors, 419 ideas, 11 gaps
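The false_positive flag can be added with a migration along these lines (the `drafts` table and column names are assumptions based on the bullets above, not the actual schema):

```python
import sqlite3

def add_false_positive_column(conn: sqlite3.Connection) -> None:
    # Idempotent migration: only add the flag if it is not present yet.
    cols = {row[1] for row in conn.execute("PRAGMA table_info(drafts)")}
    if "false_positive" not in cols:
        conn.execute(
            "ALTER TABLE drafts ADD COLUMN false_positive INTEGER NOT NULL DEFAULT 0"
        )

def flag_false_positives(conn: sqlite3.Connection, names: list[str]) -> int:
    # Mark keyword-matched drafts that turned out not to be about AI agents.
    cur = conn.executemany(
        "UPDATE drafts SET false_positive = 1 WHERE name = ?",
        [(n,) for n in names],
    )
    conn.commit()
    return cur.rowcount  # sqlite3 sums modified rows across executemany()
```

Keeping flagged rows instead of deleting them is what lets both counts (434 total, 361 relevant) remain verifiable from the same table.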

Code quality:
- Fix version string 0.1.0 → 0.2.0
- Add close()/context manager to Embedder class
- Dynamic matrix size instead of hardcoded "260x260"
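The Embedder lifecycle change has roughly this shape (the class internals are stand-ins; only the close()/context-manager protocol mirrors the actual fix):

```python
class Embedder:
    """Stand-in illustrating the close()/context-manager addition."""

    def __init__(self) -> None:
        self._client = object()  # placeholder for the real embedding client
        self.closed = False

    def close(self) -> None:
        # Release the underlying client; safe to call more than once.
        if not self.closed:
            self._client = None
            self.closed = True

    def __enter__(self) -> "Embedder":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

# Usage (embed() is a hypothetical method name):
# with Embedder() as emb:
#     vectors = emb.embed(texts)
```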

Blog accuracy:
- Fix EU AI Act timeline (enforcement Aug 2026, not "18 months")
- Distinguish OAuth consent from GDPR Einwilligung
- Add EU AI Act Annex III context to hospital scenario
- Add FIPA, eIDAS 2.0 references where relevant

Methodology:
- Add methodology.md documenting pipeline, limitations, rating rubric
- Add LLM-as-judge caveats to analyzer.py
- Document clustering threshold rationale

Reviews performed from four perspectives: legal (German/EU law), statistics, development, and science.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:52:33 +01:00
parent a386d0bb1a
commit 439424bd04
19 changed files with 1745 additions and 126 deletions


@@ -36,6 +36,45 @@ CATEGORIES_SHORT = [
"Other AI/agent",
]
# ============================================================================
# METHODOLOGY NOTE — LLM-as-Judge Rating Approach
#
# Limitations of this rating system (see also data/reports/methodology.md):
#
# 1. ABSTRACT-ONLY: Ratings are generated from the draft's abstract (truncated
# to 2000 chars), not the full text. Maturity and overlap scores in
# particular may be unreliable when the abstract omits key details.
#
# 2. NO HUMAN CALIBRATION: No inter-rater reliability study has been performed.
# Claude is the sole judge; scores have not been validated against human
# expert ratings. Even a small calibration set (20-30 drafts) would
# substantially strengthen confidence in the ratings.
#
# 3. NO INTRA-RATER CONSISTENCY CHECK: The same draft is never re-rated to
# measure Claude's self-consistency. Prompt-hash caching means re-runs
# return cached results, so actual consistency is untested.
#
# 4. OVERLAP SCORE LIMITATION: The overlap dimension asks Claude whether a
# draft overlaps with other known work, but Claude rates each draft
# independently — it does not have access to the full corpus during rating.
# The overlap score reflects Claude's general knowledge, not corpus-specific
# similarity. Use embedding-based similarity for corpus-level overlap.
#
# 5. BATCH EFFECTS: Batch rating (BATCH_PROMPT) processes multiple drafts
# together. Position effects and comparison effects are uncontrolled.
# Abstracts are also truncated more aggressively (1500 chars vs 2000).
#
# 6. RELEVANCE INFLATION: The relevance distribution is right-skewed because
# keyword-matched drafts tend to score high on relevance by construction.
# The corpus likely contains 30-50 false positives from ambiguous keywords
# like "agent" (user agent), "autonomous" (autonomous systems), and
# "intelligent" (intelligent networking).
#
# INTERPRETATION: Scores should be treated as RELATIVE RANKINGS within this
# corpus, not as absolute quality measures. A score of 4.0 means "above
# average for this corpus," not "objectively high quality."
# ============================================================================
# Compact prompt — abstract only, saves ~10x tokens vs full-text
RATE_PROMPT_COMPACT = """\
Rate this {doc_type}. JSON only.
@@ -45,7 +84,13 @@ Abstract: {abstract}
Return JSON: {{"s":"2-3 sentence summary","n":<1-5>,"nn":"novelty note","m":<1-5>,"mn":"maturity note","o":<1-5>,"on":"overlap note","mo":<1-5>,"mon":"momentum note","r":<1-5>,"rn":"relevance note","c":["categories"]}}
-Scale: 1=very low..5=very high. Overlap: 1=unique,5=heavy overlap.
+Rating scale (use the FULL range 1-5, avoid clustering at 3-4):
+- Novelty: 1=trivial/obvious extension, 2=incremental, 3=useful contribution, 4=notable originality, 5=genuinely novel approach
+- Maturity: 1=problem statement only, 2=early sketch, 3=defined protocol/mechanism, 4=detailed spec with examples, 5=implementation-ready with test vectors
+- Overlap: 1=unique approach, 2=minor similarities, 3=shares concepts with 1-2 drafts, 4=significant overlap, 5=near-duplicate of existing work
+- Momentum: 1=inactive/abandoned, 2=single revision, 3=active development, 4=WG interest/adoption, 5=strong community momentum
+- Relevance: 1=not about AI/agents (false positive), 2=tangentially related, 3=partially relevant, 4=directly relevant, 5=core AI agent topic
Categories: {categories}
JSON only, no fences."""
@@ -89,6 +134,31 @@ Per idea: {{"title":"short name","description":"1 sentence","type":"mechanism|pr
1-4 ideas per draft. Extract only TOP-LEVEL novel contributions. Do NOT list sub-features, optimizations, variants, or extensions as separate ideas. If a draft defines one protocol with multiple features, that is ONE idea, not several. Each idea must be independently novel — could it be its own draft? If not, merge it with the parent idea. Only include CONCRETE, NOVEL technical contributions. If a draft has no substantive ideas, map it to an empty array. Do not pad with restatements of the abstract.
Return ONLY a JSON object like {{"draft-name":[...], ...}}, no fences."""
# ============================================================================
# GAP ANALYSIS METHODOLOGY NOTE
#
# This is a SINGLE-SHOT LLM analysis: Claude receives compressed statistics
# about the landscape (category counts, top ideas, overlap summary) and
# generates gaps in one pass. Limitations:
#
# 1. No systematic coverage analysis against a reference taxonomy. A rigorous
# approach would compare the corpus against an explicit reference architecture
# (e.g., NIST AI RMF, FIPA agent platform model, or a custom agent ecosystem
# reference model) to identify gaps systematically rather than relying on
# Claude's general knowledge.
#
# 2. The overlap_summary fed to the prompt is category-level only — it does not
# tell Claude which specific technical areas overlap within categories.
#
# 3. Evidence quality varies: some gaps cite specific data ("only N drafts"),
# others are based on Claude's inference about what is missing.
#
# 4. Gap severity is assigned by Claude in a single pass without defined
# thresholds (what makes "critical" vs "high" is implicit).
#
# Strengthening options: ground against a reference architecture, run multiple
# independent gap analyses and intersect results, have domain experts validate.
# ============================================================================
GAP_ANALYSIS_PROMPT = """\
You are analyzing the landscape of {total} IETF Internet-Drafts related to AI agents and autonomous systems.
@@ -158,15 +228,23 @@ class Analyzer:
)
raise SystemExit(1)
@staticmethod
def _clamp_rating(value, default: int = 3, lo: int = 1, hi: int = 10) -> int:
"""Clamp a rating value to [lo, hi] integers."""
try:
return max(lo, min(hi, int(value)))
except (ValueError, TypeError):
return default
def _parse_rating(self, draft_name: str, data: dict) -> Rating:
"""Parse a rating from compact JSON keys."""
return Rating(
draft_name=draft_name,
-            novelty=int(data.get("n", data.get("novelty", 3))),
-            maturity=int(data.get("m", data.get("maturity", 3))),
-            overlap=int(data.get("o", data.get("overlap", 3))),
-            momentum=int(data.get("mo", data.get("momentum", 3))),
-            relevance=int(data.get("r", data.get("relevance", 3))),
+            novelty=self._clamp_rating(data.get("n", data.get("novelty", 3))),
+            maturity=self._clamp_rating(data.get("m", data.get("maturity", 3))),
+            overlap=self._clamp_rating(data.get("o", data.get("overlap", 3))),
+            momentum=self._clamp_rating(data.get("mo", data.get("momentum", 3))),
+            relevance=self._clamp_rating(data.get("r", data.get("relevance", 3))),
summary=data.get("s", data.get("summary", "")),
novelty_note=data.get("nn", data.get("novelty_note", "")),
maturity_note=data.get("mn", data.get("maturity_note", "")),
@@ -194,10 +272,11 @@ class Analyzer:
def _extract_json(self, text: str) -> str:
"""Strip markdown fences if present."""
text = text.strip()
if text.startswith("```"):
text = text.split("\n", 1)[1]
-        if text.endswith("```"):
-            text = text[:-3]
+        if text.rstrip().endswith("```"):
+            text = text.rstrip()[:-3]
return text.strip()
def rate_draft(self, draft_name: str, use_cache: bool = True) -> Rating | None: