- Add `ietf auto` command: fetches, analyzes, embeds, extracts ideas, and refreshes gaps across all sources with cost-based auto-approval - Fix SourceDocument→Draft conversion in auto fetch step - Fix gap_analysis method name in auto command - Process all 270 unrated ETSI/ISO/ITU/NIST drafts (761 total, all rated) - Update web UI templates and data layer for multi-source support Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
198 lines
9.0 KiB
Python
198 lines
9.0 KiB
Python
"""Fetch AI-related standards metadata from ISO/IEC JTC 1/SC 42.
|
|
|
|
ISO provides open data (JSON/CSV/Parquet) for metadata but full text is paywalled.
|
|
We use the open data portal to catalog SC 42 standards and supplement with
|
|
publicly available scope/abstract from the ISO Online Browsing Platform (OBP).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import io
|
|
import re
|
|
import time as time_mod
|
|
|
|
import httpx
|
|
from rich.console import Console
|
|
|
|
from ..config import Config
|
|
from .base import SourceDocument
|
|
|
|
# Module-wide Rich console; ISOFetcher prints its progress and warning messages through it.
console = Console()
|
|
|
|
# ISO Open Data portal: CSV export of the deliverables metadata.
ISO_OPEN_DATA_CSV = (
    "https://isopublicstorageprod.blob.core.windows.net/opendata/_latest/"
    "iso_deliverables_metadata/csv/iso_deliverables_metadata.csv"
)

# JTC 1 subcommittee responsible for artificial intelligence.
SC42_COMMITTEE = "ISO/IEC JTC 1/SC 42"

# Committees whose work is considered AI-relevant.
AI_COMMITTEES = [
    SC42_COMMITTEE,  # Artificial Intelligence
    "ISO/IEC JTC 1/SC 27",  # Information security (AI trust/privacy overlap)
]

# Hand-maintained list of key AI standards as (designation, title, abstract)
# tuples. The abstracts are curated here because the open data is not relied
# on to provide them.
ISO_AI_CATALOG = [
    (
        "ISO/IEC 42001:2023",
        "Information technology — Artificial intelligence — Management system",
        "Specifies requirements for establishing, implementing, maintaining, and continually improving an AI management system within organizations.",
    ),
    (
        "ISO/IEC 22989:2022",
        "Information technology — Artificial intelligence — AI concepts and terminology",
        "Establishes terminology and describes concepts in the field of artificial intelligence.",
    ),
    (
        "ISO/IEC 23894:2023",
        "Information technology — Artificial intelligence — Guidance on risk management",
        "Provides guidance on managing risk related to development and use of AI systems.",
    ),
    (
        "ISO/IEC 23053:2022",
        "Framework for Artificial Intelligence (AI) Systems Using Machine Learning (ML)",
        "Establishes a framework describing a generic AI system using ML technology.",
    ),
    (
        "ISO/IEC 38507:2022",
        "Information technology — Governance of IT — Governance implications of the use of AI",
        "Provides guidance on the governance implications of AI for governing bodies of organizations.",
    ),
    (
        "ISO/IEC 5338:2023",
        "Information technology — AI system life cycle processes",
        "Defines processes and their associated activities for the development and operation of AI systems.",
    ),
    (
        "ISO/IEC 5339:2024",
        "Information technology — AI — Guidance for AI applications",
        "Provides guidance on how to apply AI within organizations, including use case analysis and risk assessment.",
    ),
    (
        "ISO/IEC 42005:2025",
        "Information technology — AI — AI system impact assessment",
        "Provides guidance for assessing the potential positive and negative impacts of AI systems.",
    ),
    (
        "ISO/IEC TR 24028:2020",
        "Overview of trustworthiness in artificial intelligence",
        "Provides an overview of trustworthiness in AI, including risks, challenges, and approaches.",
    ),
    (
        "ISO/IEC TR 24029-1:2021",
        "Assessment of the robustness of neural networks — Part 1: Overview",
        "Provides background on properties of neural network robustness and methods for assessment.",
    ),
    (
        "ISO/IEC TR 24030:2021",
        "Artificial intelligence — Use cases",
        "Provides a collection of representative use cases of AI applications across various domains.",
    ),
    (
        "ISO/IEC TS 6254:2024",
        "Objectives and approaches for explainability of ML models and AI systems",
        "Describes objectives and approaches for explainability of machine learning models and AI systems.",
    ),
    (
        "ISO/IEC 12792:2024",
        "Transparency taxonomy of AI systems",
        "Establishes a transparency taxonomy for AI systems to support understanding and governance.",
    ),
]

# Base URL under which per-standard pages on iso.org are addressed.
ISO_OBP_URL = "https://www.iso.org/standard/"
|
|
|
|
|
|
def _iso_id_to_name(iso_id: str) -> str:
|
|
"""Convert ISO ID to slug. E.g. 'ISO/IEC 42001:2023' -> 'iso-iec-42001-2023'."""
|
|
slug = iso_id.lower().replace("/", "-").replace(" ", "-").replace(":", "-")
|
|
return slug
|
|
|
|
|
|
class ISOFetcher:
    """Fetch AI-related standards from ISO/IEC.

    Combines:
    1. Open data CSV for discovering SC 42 standards
    2. Curated catalog with known abstracts
    3. Scraping of the public standard page for scope text

    The instance owns an ``httpx.Client``. Call :meth:`close` when done, or
    use the fetcher as a context manager (``with ISOFetcher() as f: ...``).
    """

    # Lower-case substrings that mark a title as AI-relevant even when the
    # owning committee is not SC 42.
    _AI_TITLE_KEYWORDS = (
        "artificial intelligence",
        "machine learning",
        "neural network",
        "ai system",
        "trustworth",
        "autonomous",
    )

    # Abstract container on the standard's HTML page; compiled once.
    _ABSTRACT_RE = re.compile(
        r'<div[^>]*id="item-abstract"[^>]*>(.*?)</div>',
        re.DOTALL,
    )

    def __init__(self, config: Config | None = None):
        """Create the fetcher, loading the default config when none is given."""
        self.config = config or Config.load()
        self.client = httpx.Client(timeout=30, follow_redirects=True)

    def __enter__(self) -> ISOFetcher:
        """Return the fetcher itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``with`` block ends."""
        self.close()

    @staticmethod
    def _standard_url(iso_id: str) -> str:
        """Build the iso.org page URL for a designation.

        The edition suffix (everything from the first ':') is dropped, then
        '/' and ' ' are percent-encoded.

        NOTE(review): iso.org catalogue pages are usually addressed by a
        numeric id; confirm that designation-based URLs actually resolve.
        """
        designation = iso_id.split(":")[0]
        encoded = designation.replace("/", "%2F").replace(" ", "%20")
        return f"https://www.iso.org/standard/{encoded}.html"

    def search(
        self, keywords: list[str], since: str | None = None
    ) -> list[SourceDocument]:
        """Return AI-relevant ISO/IEC standards.

        Curated catalog entries are added first; open-data rows are added
        only when their slug is not already present.

        Args:
            keywords: Forwarded to the open-data step, which currently does
                not use it (filtering relies on a fixed AI keyword list).
            since: Optional lower bound on the publication date, applied to
                open-data rows only; curated entries are always returned.

        Returns:
            The de-duplicated documents, curated entries first.
        """
        seen: dict[str, SourceDocument] = {}

        # Strategy 1: Curated catalog (with real abstracts)
        console.print(" Loading ISO/IEC SC 42 catalog...")
        for iso_id, title, abstract in ISO_AI_CATALOG:
            name = _iso_id_to_name(iso_id)
            seen[name] = SourceDocument(
                name=name,
                title=f"{iso_id}: {title}",
                abstract=abstract,
                source="iso",
                source_id=iso_id,
                source_url=self._standard_url(iso_id),
                time=self._extract_year(iso_id),
                doc_status="published",
            )

        # Strategy 2: Try to download open data CSV for additional SC 42 standards
        console.print(" Fetching ISO open data for SC 42 standards...")
        for doc in self._fetch_open_data(keywords, since):
            # Curated entries win over open-data rows with the same slug.
            if doc.name not in seen:
                seen[doc.name] = doc

        console.print(f" Found [bold green]{len(seen)}[/] ISO/IEC AI standards")
        return list(seen.values())

    def _extract_year(self, iso_id: str) -> str:
        """Extract the year from an ISO ID like 'ISO/IEC 42001:2023'.

        Returns the last four-digit group that directly follows a ':' so that
        amendments ('...:2022/Amd 1:2023' -> '2023') and trailing text
        ('...:2022/Amd 1' -> '2022') both yield a plain year. Returns an
        empty string when no such group exists.
        """
        years = re.findall(r":(\d{4})(?!\d)", iso_id)
        return years[-1] if years else ""

    def _fetch_open_data(self, keywords: list[str], since: str | None) -> list[SourceDocument]:
        """Fetch ISO open data CSV and filter for AI standards.

        A row is kept when its committee contains 'SC 42' or its title
        contains one of the AI keywords, and it is not older than *since*.
        Download or parse failures are reported on the console; rows
        collected before the failure are still returned.

        Args:
            keywords: Currently unused; kept for interface compatibility.
            since: Optional lower bound compared as a string against the
                row's publication date (assumes both use the same sortable
                date format — TODO confirm). Rows without a date are kept.

        Returns:
            The matching documents, possibly empty.
        """
        docs: list[SourceDocument] = []
        try:
            console.print(" Downloading ISO deliverables catalog (CSV)...")
            resp = self.client.get(ISO_OPEN_DATA_CSV, timeout=60)
            resp.raise_for_status()

            reader = csv.DictReader(io.StringIO(resp.text))
            for row in reader:
                # DictReader fills cells missing from short rows with None,
                # so coerce every value to a string before using it.
                title = row.get("title.en") or ""
                committee = row.get("ownerCommittee") or ""
                ref = (row.get("reference") or "").strip()
                status = row.get("currentStage") or ""
                pub_date = row.get("publicationDate") or ""
                scope = row.get("scope.en") or ""

                # Without a reference there is no usable name or source id.
                if not ref:
                    continue

                # Filter: SC 42 committee OR AI keywords in title
                lowered_title = title.lower()
                is_sc42 = "SC 42" in committee
                has_ai_keyword = any(kw in lowered_title for kw in self._AI_TITLE_KEYWORDS)
                if not (is_sc42 or has_ai_keyword):
                    continue

                if since and pub_date and pub_date < since:
                    continue

                # Fall back to a synthetic abstract when the row has no scope text.
                abstract = scope[:2000] if scope else f"ISO/IEC standard: {title}. Committee: {committee}."
                docs.append(SourceDocument(
                    name=_iso_id_to_name(ref),
                    title=f"{ref}: {title}",
                    abstract=abstract,
                    source="iso",
                    source_id=ref,
                    source_url=self._standard_url(ref),
                    time=pub_date or self._extract_year(ref),
                    doc_status=status.lower() if status else "published",
                ))

        except httpx.HTTPError as e:
            console.print(f"[yellow]Could not fetch ISO open data: {e}[/]")
        except Exception as e:
            # Deliberate best-effort boundary: a malformed catalog must not
            # abort the whole search.
            console.print(f"[yellow]Error parsing ISO CSV: {e}[/]")

        return docs

    def download_text(self, doc: SourceDocument) -> str | None:
        """ISO full text is paywalled. Return abstract/scope only.

        Requests the standard's public page and extracts the text of the
        'item-abstract' element with its tags removed.

        Args:
            doc: Document whose ``source_id`` holds the ISO designation.

        Returns:
            Up to 5000 characters of scope text, or ``None`` when the page is
            unavailable, has no abstract element, the extracted text is 30
            characters or shorter, or the request fails.
        """
        iso_id = doc.source_id.split(":")[0]  # e.g. "ISO/IEC 42001"
        try:
            resp = self.client.get(self._standard_url(iso_id))
            if resp.status_code == 200:
                scope_match = self._ABSTRACT_RE.search(resp.text)
                if scope_match:
                    # Drop the markup inside the abstract element.
                    scope = re.sub(r'<[^>]+>', '', scope_match.group(1)).strip()
                    # Presumably guards against placeholder/boilerplate text.
                    if len(scope) > 30:
                        return scope[:5000]
            # NOTE(review): this pause is skipped when a scope is returned
            # above; confirm whether the throttle should cover every request.
            time_mod.sleep(0.5)
        except httpx.HTTPError as e:
            # Best-effort: report the failure and fall through to None.
            console.print(f"[yellow]Could not fetch ISO page for {doc.source_id}: {e}[/]")
        return None

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()
|