v0.3.0: Publication-ready release with blog site, paper update, and polish

Release prep: - Version bump to 0.3.0 (pyproject.toml, cli.py) - Rewrite README.md with current stats (475 drafts, 713 authors, 501 ideas) - Add CONTRIBUTING.md with dev setup and code conventions Blog site: - Add scripts/build-site.py (markdown → HTML with clean CSS, dark mode, nav) - Generate static site in docs/blog/ (10 pages) - Ready for GitHub Pages deployment Academic paper (paper/main.tex): - Update all counts: 474→475 drafts, 557→710 authors, 1907→462 ideas, 11→12 gaps - Add false-positive filtering methodology (113 excluded, 361 relevant) - Add cross-org convergence analysis (132 ideas, 33% rate) - Add GDPR compliance gap to gap table - Add LLM-as-judge caveats to rating methodology and limitations - Add FIPA, IEEE P3394, W3C WoT to related work with bibliography entries - Fix safety ratio to show monthly variation (1.5:1 to 21:1) Pipeline: - Fetch 1 new draft (475 total), 3 new authors (713 total) - Fix 16 ruff lint errors across test files - All 106 tests pass Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 17:54:43 +01:00
parent e247bfef8f
commit 1ec1f69bee
34 changed files with 4268 additions and 272 deletions
--- a/src/ietf_analyzer/sources/iso.py
+++ b/src/ietf_analyzer/sources/iso.py
@@ -0,0 +1,196 @@
+"""Fetch AI-related standards metadata from ISO/IEC JTC 1/SC 42.
+
+ISO provides open data (JSON/CSV/Parquet) for metadata but full text is paywalled.
+We use the open data portal to catalog SC 42 standards and supplement with
+publicly available scope/abstract from the ISO Online Browsing Platform (OBP).
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+import re
+import time as time_mod
+
+import httpx
+from rich.console import Console
+
+from ..config import Config
+from .base import SourceDocument
+
+# Shared rich console used for this module's progress and warning output.
+console = Console()
+
+# ISO Open Data Portal — deliverables metadata
+# (a single CSV; _fetch_open_data downloads it whole and filters rows locally).
+ISO_OPEN_DATA_CSV = "https://isopublicstorageprod.blob.core.windows.net/opendata/_latest/iso_deliverables_metadata/csv/iso_deliverables_metadata.csv"
+
+# SC 42 is the AI committee under JTC 1
+# NOTE(review): not referenced by the code visible in this file — the CSV filter
+# tests for the literal substring "SC 42" instead. Confirm before removing.
+SC42_COMMITTEE = "ISO/IEC JTC 1/SC 42"
+
+# Additional AI-relevant committees
+# NOTE(review): also unreferenced in the visible code; SC 27 rows are currently
+# only picked up when their title contains one of the AI keywords.
+AI_COMMITTEES = [
+    "ISO/IEC JTC 1/SC 42",  # Artificial Intelligence
+    "ISO/IEC JTC 1/SC 27",  # Information security (AI trust/privacy overlap)
+]
+
+# Known key SC 42 standards with abstracts (since open data lacks abstracts)
+# Each entry is a (reference, title, abstract) tuple; search() turns every entry
+# into a SourceDocument, and these entries take precedence over open-data rows
+# that map to the same slug.
+ISO_AI_CATALOG = [
+    ("ISO/IEC 42001:2023", "Information technology — Artificial intelligence — Management system",
+     "Specifies requirements for establishing, implementing, maintaining, and continually improving an AI management system within organizations."),
+    ("ISO/IEC 22989:2022", "Information technology — Artificial intelligence — AI concepts and terminology",
+     "Establishes terminology and describes concepts in the field of artificial intelligence."),
+    ("ISO/IEC 23894:2023", "Information technology — Artificial intelligence — Guidance on risk management",
+     "Provides guidance on managing risk related to development and use of AI systems."),
+    ("ISO/IEC 23053:2022", "Framework for Artificial Intelligence (AI) Systems Using Machine Learning (ML)",
+     "Establishes a framework describing a generic AI system using ML technology."),
+    ("ISO/IEC 38507:2022", "Information technology — Governance of IT — Governance implications of the use of AI",
+     "Provides guidance on the governance implications of AI for governing bodies of organizations."),
+    ("ISO/IEC 5338:2023", "Information technology — AI system life cycle processes",
+     "Defines processes and their associated activities for the development and operation of AI systems."),
+    ("ISO/IEC 5339:2024", "Information technology — AI — Guidance for AI applications",
+     "Provides guidance on how to apply AI within organizations, including use case analysis and risk assessment."),
+    ("ISO/IEC 42005:2025", "Information technology — AI — AI system impact assessment",
+     "Provides guidance for assessing the potential positive and negative impacts of AI systems."),
+    ("ISO/IEC TR 24028:2020", "Overview of trustworthiness in artificial intelligence",
+     "Provides an overview of trustworthiness in AI, including risks, challenges, and approaches."),
+    ("ISO/IEC TR 24029-1:2021", "Assessment of the robustness of neural networks — Part 1: Overview",
+     "Provides background on properties of neural network robustness and methods for assessment."),
+    ("ISO/IEC TR 24030:2021", "Artificial intelligence — Use cases",
+     "Provides a collection of representative use cases of AI applications across various domains."),
+    ("ISO/IEC TS 6254:2024", "Objectives and approaches for explainability of ML models and AI systems",
+     "Describes objectives and approaches for explainability of machine learning models and AI systems."),
+    ("ISO/IEC 12792:2024", "Transparency taxonomy of AI systems",
+     "Establishes a transparency taxonomy for AI systems to support understanding and governance."),
+]
+
+# URL prefix of the per-standard pages on iso.org ("<prefix><reference>.html").
+ISO_OBP_URL = "https://www.iso.org/standard/"
+
+
+def _iso_id_to_name(iso_id: str) -> str:
+    """Build the lowercase, hyphen-separated slug for an ISO reference.
+
+    Every slash, space and colon becomes a hyphen, e.g.
+    'ISO/IEC 42001:2023' -> 'iso-iec-42001-2023'.
+    """
+    separators = str.maketrans({"/": "-", " ": "-", ":": "-"})
+    return iso_id.lower().translate(separators)
+
+
+class ISOFetcher:
+    """Fetch AI-related standards from ISO/IEC.
+
+    Combines:
+    1. Curated catalog with known abstracts (always included)
+    2. Open data CSV for discovering further SC 42 / AI-titled standards
+    3. Best-effort scraping of the iso.org standard page for scope text
+
+    The instance owns an ``httpx.Client``; call :meth:`close` when done, or
+    use the fetcher as a context manager.
+    """
+
+    # Lowercase title substrings that mark a CSV row as AI-relevant even when
+    # its owning committee is not SC 42.
+    _AI_TITLE_KEYWORDS = (
+        "artificial intelligence", "machine learning", "neural network",
+        "ai system", "trustworth", "autonomous",
+    )
+
+    # Captures the contents of the abstract container on a standard's page.
+    # Non-greedy: the capture ends at the first closing </div>.
+    _ABSTRACT_RE = re.compile(r'<div[^>]*id="item-abstract"[^>]*>(.*?)</div>', re.DOTALL)
+    # Removes any markup left inside the captured abstract.
+    _TAG_RE = re.compile(r'<[^>]+>')
+
+    # Pause (seconds) after every page request made by download_text().
+    _REQUEST_DELAY = 0.5
+
+    def __init__(self, config: Config | None = None):
+        """Create the fetcher, loading the default Config when none is given."""
+        self.config = config or Config.load()
+        self.client = httpx.Client(timeout=30, follow_redirects=True)
+
+    def __enter__(self) -> ISOFetcher:
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    @staticmethod
+    def _standard_url(iso_id: str) -> str:
+        """Return the iso.org page URL for a reference, ignoring any ':year' part."""
+        base = iso_id.split(":")[0]  # e.g. "ISO/IEC 42001"
+        encoded = base.replace("/", "%2F").replace(" ", "%20")
+        return f"{ISO_OBP_URL}{encoded}.html"
+
+    @staticmethod
+    def _row_field(row: dict, key: str) -> str:
+        """Return a CSV cell as a stripped string; missing or None becomes ''.
+
+        csv.DictReader pads short rows with None, so ``row.get(key, "")`` alone
+        is not enough to guarantee a string.
+        """
+        return (row.get(key) or "").strip()
+
+    def search(
+        self, keywords: list[str], since: str | None = None
+    ) -> list[SourceDocument]:
+        """Return AI-relevant ISO/IEC standards.
+
+        Curated catalog entries are always returned (``keywords`` and ``since``
+        do not filter them). Open-data rows are added only for slugs the
+        catalog does not already provide; ``since`` is applied to those rows.
+        """
+        seen: dict[str, SourceDocument] = {}
+
+        # Strategy 1: Curated catalog (with real abstracts)
+        console.print("  Loading ISO/IEC SC 42 catalog...")
+        for iso_id, title, abstract in ISO_AI_CATALOG:
+            name = _iso_id_to_name(iso_id)
+            seen[name] = SourceDocument(
+                name=name,
+                title=f"{iso_id}: {title}",
+                abstract=abstract,
+                source="iso",
+                source_id=iso_id,
+                source_url=self._standard_url(iso_id),
+                time=self._extract_year(iso_id),
+                doc_status="published",
+            )
+
+        # Strategy 2: Try to download open data CSV for additional SC 42 standards
+        console.print("  Fetching ISO open data for SC 42 standards...")
+        for doc in self._fetch_open_data(keywords, since):
+            # First writer wins, so curated entries are never overwritten.
+            seen.setdefault(doc.name, doc)
+
+        console.print(f"  Found [bold green]{len(seen)}[/] ISO/IEC AI standards")
+        return list(seen.values())
+
+    def _extract_year(self, iso_id: str) -> str:
+        """Return the text after the last ':' in an ID like 'ISO/IEC 42001:2023', or ''."""
+        _, colon, year = iso_id.rpartition(":")
+        return year if colon else ""
+
+    def _fetch_open_data(self, keywords: list[str], since: str | None) -> list[SourceDocument]:
+        """Fetch ISO open data CSV and filter for AI standards.
+
+        A row is kept when its committee contains "SC 42" or its English title
+        contains one of the fixed AI keywords. ``keywords`` is accepted for
+        signature parity with :meth:`search` but is not used by the filter.
+        ``since`` is compared to ``publicationDate`` as a plain string, so both
+        are assumed to share a sortable format (e.g. ISO dates) — TODO confirm.
+
+        Failures are reported on the console and never raised; whatever rows
+        were parsed before a failure are still returned.
+        """
+        docs: list[SourceDocument] = []
+
+        try:
+            console.print("  Downloading ISO deliverables catalog (CSV)...")
+            resp = self.client.get(ISO_OPEN_DATA_CSV, timeout=60)
+            resp.raise_for_status()
+        except httpx.HTTPError as e:
+            console.print(f"[yellow]Could not fetch ISO open data: {e}[/]")
+            return docs
+
+        try:
+            for row in csv.DictReader(io.StringIO(resp.text)):
+                title = self._row_field(row, "title.en")
+                committee = self._row_field(row, "ownerCommittee")
+                ref = self._row_field(row, "reference")
+                status = self._row_field(row, "currentStage")
+                pub_date = self._row_field(row, "publicationDate")
+                # NOTE: the row's "scope.en" column was read but never used by
+                # the earlier implementation; the abstract below is synthesized.
+
+                # Without a reference there is no usable slug or URL.
+                if not ref:
+                    continue
+
+                # Filter: SC 42 committee OR AI keywords in title
+                lowered_title = title.lower()
+                is_sc42 = "SC 42" in committee
+                has_ai_keyword = any(kw in lowered_title for kw in self._AI_TITLE_KEYWORDS)
+                if not (is_sc42 or has_ai_keyword):
+                    continue
+
+                if since and pub_date and pub_date < since:
+                    continue
+
+                docs.append(SourceDocument(
+                    name=_iso_id_to_name(ref),
+                    title=f"{ref}: {title}",
+                    abstract=f"ISO/IEC standard: {title}. Committee: {committee}. Status: {status}.",
+                    source="iso",
+                    source_id=ref,
+                    source_url=self._standard_url(ref),
+                    time=pub_date or self._extract_year(ref),
+                    doc_status=status.lower() if status else "published",
+                ))
+        except Exception as e:
+            # Deliberately broad: this source is best-effort and must not abort
+            # the caller; rows collected so far are still returned.
+            console.print(f"[yellow]Error parsing ISO CSV: {e}[/]")
+
+        return docs
+
+    def download_text(self, doc: SourceDocument) -> str | None:
+        """ISO full text is paywalled. Return abstract/scope only.
+
+        Requests the standard's iso.org page and extracts the abstract block.
+        Returns the text (at most 5000 characters) when it is longer than 30
+        characters, otherwise ``None``. HTTP errors are swallowed and yield
+        ``None``. A short pause follows every request, successful or not.
+        """
+        source_id = doc.source_id or ""
+        if not source_id:
+            return None
+
+        try:
+            resp = self.client.get(self._standard_url(source_id))
+        except httpx.HTTPError:
+            return None
+        finally:
+            # Throttle on every path so consecutive calls stay polite.
+            time_mod.sleep(self._REQUEST_DELAY)
+
+        if resp.status_code != 200:
+            return None
+
+        scope_match = self._ABSTRACT_RE.search(resp.text)
+        if not scope_match:
+            return None
+
+        scope = self._TAG_RE.sub('', scope_match.group(1)).strip()
+        # Very short captures are treated as noise rather than a real abstract.
+        if len(scope) > 30:
+            return scope[:5000]
+        return None
+
+    def close(self) -> None:
+        """Close the underlying HTTP client."""
+        self.client.close()