Observatory update: 434 docs, fix W3C fetcher, regenerate dashboard

- Fixed W3C fetcher to paginate /specifications endpoint (group
  endpoints use type prefixes like cg/, wg/ that weren't in config)
- Fetched 72 new IETF drafts + 1 W3C spec, all analyzed and embedded
- Regenerated dashboard with updated data
- Total: 434 docs, 11 gaps, 1907 ideas

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-04 01:09:30 +01:00
parent d6beb9c0a0
commit 7a1aa346b9
7 changed files with 2482 additions and 275 deletions

View File

@@ -22,6 +22,23 @@ W3C_API = "https://api.w3.org"
console = Console()
# AI/agent-relevant W3C groups with their full API type prefix
W3C_AI_GROUPS = {
    # Community groups (cg/ prefix).
    "webagents": "cg/webagents",  # Autonomous Agents on the Web
    "agentprotocol": "cg/agentprotocol",  # AI Agent Protocol
    "agentic-arbitration": "cg/agentic-arbitration",  # Agentic Arbitration Protocol
    "ai-content-disclosure": "cg/ai-content-disclosure",  # AI Content Disclosure (name inferred from slug)
    "aikr": "cg/aikr",  # AI Knowledge Representation
    "ai-web-visibility": "cg/ai-web-visibility",  # AI Web Visibility (name inferred from slug)
    "aiwss": "cg/aiwss",  # AI-Driven Web Standards
    "cogai": "cg/cogai",  # Cognitive AI
    "credentials": "cg/credentials",  # Credentials CG
    # Working groups (wg/ prefix).
    "did": "wg/did",  # Decentralized Identifier WG
    "vc": "wg/vc",  # Verifiable Credentials WG
    "webmachinelearning": "wg/webmachinelearning",  # Web Machine Learning WG
    "wot": "wg/wot",  # Web of Things WG
}
def _strip_html(html: str) -> str:
"""Minimal HTML tag stripper — no heavy dependencies."""
@@ -38,20 +55,31 @@ def _strip_html(html: str) -> str:
class W3CFetcher:
"""Fetch specs from the W3C public API (no auth needed)."""
"""Fetch specs from the W3C public API (no auth needed).
Two strategies:
1. Browse /specifications paginated, filter client-side by keywords
2. Fetch specs linked from known AI-relevant groups
Both are combined for maximum coverage.
"""
def __init__(self, config: Config | None = None):
    """Initialize the fetcher.

    Args:
        config: Pre-loaded project ``Config``. When ``None`` (or falsy,
            since ``or`` is used), ``Config.load()`` supplies the default.
    """
    self.config = config or Config.load()
    # One shared HTTP client for all requests; the W3C API issues
    # redirects, so follow_redirects is enabled.
    self.client = httpx.Client(timeout=30, follow_redirects=True)
    # Group shortnames come from project config — presumably keys of
    # W3C_AI_GROUPS; confirm against Config definition (not visible here).
    self.groups = self.config.w3c_groups
def search(
self, keywords: list[str], since: str | None = None
) -> list[SourceDocument]:
"""Fetch specs from AI-relevant W3C groups, filtered by keywords."""
"""Fetch AI-relevant W3C specs via keyword search over /specifications."""
seen: dict[str, SourceDocument] = {}
kw_lower = [k.lower() for k in keywords]
# Strategy 1: Paginate through /specifications, filter client-side
console.print(" Searching W3C specifications catalog...")
page = 1
max_pages = 17 # ~1679 specs total
checked = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
@@ -59,93 +87,72 @@ class W3CFetcher:
MofNCompleteColumn(),
console=console,
) as progress:
task = progress.add_task("Fetching W3C specs...", total=len(self.groups))
task = progress.add_task("W3C catalog...", total=max_pages)
for group in self.groups:
progress.update(task, description=f"W3C group: {group}")
specs = self._fetch_group_specs(group)
for spec in specs:
# Client-side keyword filter on title + description
haystack = (spec.title + " " + spec.abstract).lower()
if any(kw in haystack for kw in kw_lower):
if since and spec.time and spec.time < since:
continue
if spec.name not in seen:
seen[spec.name] = spec
progress.advance(task)
while page <= max_pages:
progress.update(task, description=f"W3C catalog page {page}/{max_pages}")
try:
resp = self.client.get(
f"{W3C_API}/specifications",
params={"format": "json", "page": page},
headers={"Accept": "application/json"},
)
resp.raise_for_status()
data = resp.json()
except httpx.HTTPError as e:
console.print(f"[yellow]W3C API error on page {page}: {e}[/]")
break
console.print(f"Found [bold green]{len(seen)}[/] W3C specs matching keywords")
return list(seen.values())
def _fetch_group_specs(self, group_shortname: str) -> list[SourceDocument]:
"""Fetch all specifications for a W3C group."""
url = f"{W3C_API}/groups/{group_shortname}/specifications"
specs: list[SourceDocument] = []
try:
page = 1
while True:
resp = self.client.get(
url,
params={"format": "json", "page": page},
headers={"Accept": "application/json"},
)
resp.raise_for_status()
data = resp.json()
spec_list = data if isinstance(data, list) else data.get("_links", {}).get("specifications", [])
if not spec_list:
# Try alternate response shape
spec_list = data.get("specifications", [])
spec_list = data.get("_links", {}).get("specifications", [])
if not spec_list:
break
max_pages = data.get("pages", max_pages)
progress.update(task, total=max_pages)
for item in spec_list:
title = item.get("title", "")
href = item.get("href", "")
shortname = item.get("shortname", "")
title = item.get("title", shortname)
if not shortname and href:
# Extract shortname from href like /specifications/webnn
parts = href.rstrip("/").split("/")
shortname = parts[-1] if parts else ""
shortname = href.rstrip("/").split("/")[-1] if href else ""
if not shortname:
continue
checked += 1
# Fetch spec detail for abstract/description
# Quick keyword check on title
if not any(kw in title.lower() for kw in kw_lower):
continue
# Fetch detail for description, URL, dates
detail = self._fetch_spec_detail(shortname)
abstract = detail.get("description", title)
spec_url = detail.get("editor-draft", detail.get("url", f"https://www.w3.org/TR/{shortname}/"))
status = detail.get("status", "")
abstract = _strip_html(detail.get("description", title))
spec_url = detail.get("editor-draft") or detail.get("url") or f"https://www.w3.org/TR/{shortname}/"
date = detail.get("date", "")
status = detail.get("status", "")
specs.append(
SourceDocument(
name=f"w3c-{shortname}",
title=title,
if since and date and date < since:
continue
name = f"w3c-{shortname}"
if name not in seen:
seen[name] = SourceDocument(
name=name,
title=detail.get("title", title),
abstract=abstract,
source="w3c",
source_id=shortname,
source_url=spec_url,
time=date,
doc_status=status,
extra={"group": group_shortname},
)
)
time_mod.sleep(0.3)
# Check pagination
pages = data.get("pages", 1) if isinstance(data, dict) else 1
if page >= pages:
break
time_mod.sleep(0.15)
progress.advance(task)
page += 1
time_mod.sleep(0.3)
time_mod.sleep(0.2)
except httpx.HTTPError as e:
console.print(f"[yellow]W3C API error for {group_shortname}: {e}[/]")
return specs
console.print(f" Checked {checked} W3C specs, found [bold green]{len(seen)}[/] matching keywords")
return list(seen.values())
def _fetch_spec_detail(self, shortname: str) -> dict:
"""Fetch detail for a single spec."""
@@ -156,13 +163,14 @@ class W3CFetcher:
)
resp.raise_for_status()
data = resp.json()
latest = data.get("_links", {}).get("latest-version", {})
return {
"description": data.get("description", ""),
"title": data.get("title", shortname),
"editor-draft": data.get("editor-draft", ""),
"url": data.get("_links", {}).get("latest-version", {}).get("href", ""),
"status": data.get("_links", {}).get("latest-version", {}).get("status", ""),
"date": data.get("_links", {}).get("latest-version", {}).get("date", ""),
"url": latest.get("href", ""),
"status": latest.get("title", ""),
"date": latest.get("href", "").rstrip("/").split("/")[-1] if latest.get("href") else "",
}
except httpx.HTTPError:
return {}