"""Fetch AI-related standards metadata from ISO/IEC JTC 1/SC 42. ISO provides open data (JSON/CSV/Parquet) for metadata but full text is paywalled. We use the open data portal to catalog SC 42 standards and supplement with publicly available scope/abstract from the ISO Online Browsing Platform (OBP). """ from __future__ import annotations import csv import io import re import time as time_mod import httpx from rich.console import Console from ..config import Config from .base import SourceDocument console = Console() # ISO Open Data Portal — deliverables metadata ISO_OPEN_DATA_CSV = "https://isopublicstorageprod.blob.core.windows.net/opendata/_latest/iso_deliverables_metadata/csv/iso_deliverables_metadata.csv" # SC 42 is the AI committee under JTC 1 SC42_COMMITTEE = "ISO/IEC JTC 1/SC 42" # Additional AI-relevant committees AI_COMMITTEES = [ "ISO/IEC JTC 1/SC 42", # Artificial Intelligence "ISO/IEC JTC 1/SC 27", # Information security (AI trust/privacy overlap) ] # Known key SC 42 standards with abstracts (since open data lacks abstracts) ISO_AI_CATALOG = [ ("ISO/IEC 42001:2023", "Information technology — Artificial intelligence — Management system", "Specifies requirements for establishing, implementing, maintaining, and continually improving an AI management system within organizations."), ("ISO/IEC 22989:2022", "Information technology — Artificial intelligence — AI concepts and terminology", "Establishes terminology and describes concepts in the field of artificial intelligence."), ("ISO/IEC 23894:2023", "Information technology — Artificial intelligence — Guidance on risk management", "Provides guidance on managing risk related to development and use of AI systems."), ("ISO/IEC 23053:2022", "Framework for Artificial Intelligence (AI) Systems Using Machine Learning (ML)", "Establishes a framework describing a generic AI system using ML technology."), ("ISO/IEC 38507:2022", "Information technology — Governance of IT — Governance implications of the use of AI", "Provides guidance on the governance implications of AI for governing bodies of organizations."), ("ISO/IEC 5338:2023", "Information technology — AI system life cycle processes", "Defines processes and their associated activities for the development and operation of AI systems."), ("ISO/IEC 5339:2024", "Information technology — AI — Guidance for AI applications", "Provides guidance on how to apply AI within organizations, including use case analysis and risk assessment."), ("ISO/IEC 42005:2025", "Information technology — AI — AI system impact assessment", "Provides guidance for assessing the potential positive and negative impacts of AI systems."), ("ISO/IEC TR 24028:2020", "Overview of trustworthiness in artificial intelligence", "Provides an overview of trustworthiness in AI, including risks, challenges, and approaches."), ("ISO/IEC TR 24029-1:2021", "Assessment of the robustness of neural networks — Part 1: Overview", "Provides background on properties of neural network robustness and methods for assessment."), ("ISO/IEC TR 24030:2021", "Artificial intelligence — Use cases", "Provides a collection of representative use cases of AI applications across various domains."), ("ISO/IEC TS 6254:2024", "Objectives and approaches for explainability of ML models and AI systems", "Describes objectives and approaches for explainability of machine learning models and AI systems."), ("ISO/IEC 12792:2024", "Transparency taxonomy of AI systems", "Establishes a transparency taxonomy for AI systems to support understanding and governance."), ] ISO_OBP_URL = "https://www.iso.org/standard/" def _iso_id_to_name(iso_id: str) -> str: """Convert ISO ID to slug. E.g. 'ISO/IEC 42001:2023' -> 'iso-iec-42001-2023'.""" slug = iso_id.lower().replace("/", "-").replace(" ", "-").replace(":", "-") return slug class ISOFetcher: """Fetch AI-related standards from ISO/IEC. Combines: 1. Open data CSV for discovering SC 42 standards 2. Curated catalog with known abstracts 3. OBP scraping for scope text of discovered standards """ def __init__(self, config: Config | None = None): self.config = config or Config.load() self.client = httpx.Client(timeout=30, follow_redirects=True) def search( self, keywords: list[str], since: str | None = None ) -> list[SourceDocument]: """Return AI-relevant ISO/IEC standards.""" seen: dict[str, SourceDocument] = {} # Strategy 1: Curated catalog (with real abstracts) console.print(" Loading ISO/IEC SC 42 catalog...") for iso_id, title, abstract in ISO_AI_CATALOG: name = _iso_id_to_name(iso_id) seen[name] = SourceDocument( name=name, title=f"{iso_id}: {title}", abstract=abstract, source="iso", source_id=iso_id, source_url=f"https://www.iso.org/standard/{iso_id.split(':')[0].replace('/', '%2F').replace(' ', '%20')}.html", time=self._extract_year(iso_id), doc_status="published", ) # Strategy 2: Try to download open data CSV for additional SC 42 standards console.print(" Fetching ISO open data for SC 42 standards...") open_data_docs = self._fetch_open_data(keywords, since) for doc in open_data_docs: if doc.name not in seen: seen[doc.name] = doc console.print(f" Found [bold green]{len(seen)}[/] ISO/IEC AI standards") return list(seen.values()) def _extract_year(self, iso_id: str) -> str: """Extract year from ISO ID like 'ISO/IEC 42001:2023'.""" if ":" in iso_id: return iso_id.split(":")[-1] return "" def _fetch_open_data(self, keywords: list[str], since: str | None) -> list[SourceDocument]: """Fetch ISO open data CSV and filter for AI standards.""" docs = [] try: console.print(" Downloading ISO deliverables catalog (CSV)...") resp = self.client.get(ISO_OPEN_DATA_CSV, timeout=60) resp.raise_for_status() reader = csv.DictReader(io.StringIO(resp.text)) ai_keywords = ["artificial intelligence", "machine learning", "neural network", "ai system", "trustworth", "autonomous"] for row in reader: title = row.get("title.en", "") committee = row.get("ownerCommittee", "") ref = row.get("reference", "") status = row.get("currentStage", "") pub_date = row.get("publicationDate", "") scope = row.get("scope.en", "") # Filter: SC 42 committee OR AI keywords in title is_sc42 = "SC 42" in committee has_ai_keyword = any(kw in title.lower() for kw in ai_keywords) if not (is_sc42 or has_ai_keyword): continue if since and pub_date and pub_date < since: continue name = _iso_id_to_name(ref) abstract = scope[:2000] if scope else f"ISO/IEC standard: {title}. Committee: {committee}." docs.append(SourceDocument( name=name, title=f"{ref}: {title}", abstract=abstract, source="iso", source_id=ref, source_url=f"https://www.iso.org/standard/{ref.split(':')[0].replace('/', '%2F').replace(' ', '%20')}.html", time=pub_date or self._extract_year(ref), doc_status=status.lower() if status else "published", )) except httpx.HTTPError as e: console.print(f"[yellow]Could not fetch ISO open data: {e}[/]") except Exception as e: console.print(f"[yellow]Error parsing ISO CSV: {e}[/]") return docs def download_text(self, doc: SourceDocument) -> str | None: """ISO full text is paywalled. Return abstract/scope only.""" # Try to scrape scope from ISO website iso_id = doc.source_id.split(":")[0] # e.g. "ISO/IEC 42001" try: # The OBP has scope text for some standards url = f"https://www.iso.org/standard/{iso_id.replace('/', '%2F').replace(' ', '%20')}.html" resp = self.client.get(url) if resp.status_code == 200: # Extract scope/abstract from page scope_match = re.search( r'