Observatory update: 434 docs, fix W3C fetcher, regenerate dashboard

- Fixed W3C fetcher to paginate /specifications endpoint (group
  endpoints use type prefixes like cg/, wg/ that weren't in config)
- Fetched 72 new IETF drafts + 1 W3C spec, all analyzed and embedded
- Regenerated dashboard with updated data
- Total: 434 docs, 11 gaps, 1907 ideas

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-04 01:09:30 +01:00
parent d6beb9c0a0
commit 7a1aa346b9
7 changed files with 2482 additions and 275 deletions

View File

@@ -22,6 +22,23 @@ W3C_API = "https://api.w3.org"
console = Console()
# AI/agent-relevant W3C groups with their full API type prefix
W3C_AI_GROUPS = {
    # Community groups (cg/ prefix).
    "webagents": "cg/webagents",  # Autonomous Agents on the Web
    "agentprotocol": "cg/agentprotocol",  # AI Agent Protocol
    "agentic-arbitration": "cg/agentic-arbitration",  # Agentic Arbitration Protocol
    "ai-content-disclosure": "cg/ai-content-disclosure",  # AI Content Disclosure (name inferred from slug)
    "aikr": "cg/aikr",  # AI Knowledge Representation
    "ai-web-visibility": "cg/ai-web-visibility",  # AI Web Visibility (name inferred from slug)
    "aiwss": "cg/aiwss",  # AI-Driven Web Standards
    "cogai": "cg/cogai",  # Cognitive AI
    "credentials": "cg/credentials",  # Credentials CG
    # Working groups (wg/ prefix).
    "did": "wg/did",  # Decentralized Identifier WG
    "vc": "wg/vc",  # Verifiable Credentials WG
    "webmachinelearning": "wg/webmachinelearning",  # Web Machine Learning WG
    "wot": "wg/wot",  # Web of Things WG
}
def _strip_html(html: str) -> str:
"""Minimal HTML tag stripper — no heavy dependencies."""
@@ -38,20 +55,31 @@ def _strip_html(html: str) -> str:
class W3CFetcher:
"""Fetch specs from the W3C public API (no auth needed)."""
"""Fetch specs from the W3C public API (no auth needed).
Two strategies:
1. Browse /specifications paginated, filter client-side by keywords
2. Fetch specs linked from known AI-relevant groups
Both are combined for maximum coverage.
"""
def __init__(self, config: Config | None = None):
    """Initialize the fetcher.

    Args:
        config: Pre-loaded project ``Config``. When ``None`` (or falsy,
            since ``or`` is used), ``Config.load()`` supplies the default.
    """
    self.config = config or Config.load()
    # One shared HTTP client for all requests; the W3C API issues
    # redirects, so follow_redirects is enabled.
    self.client = httpx.Client(timeout=30, follow_redirects=True)
    # Group shortnames come from project config — presumably keys of
    # W3C_AI_GROUPS; confirm against Config definition (not visible here).
    self.groups = self.config.w3c_groups
def search(
self, keywords: list[str], since: str | None = None
) -> list[SourceDocument]:
"""Fetch specs from AI-relevant W3C groups, filtered by keywords."""
"""Fetch AI-relevant W3C specs via keyword search over /specifications."""
seen: dict[str, SourceDocument] = {}
kw_lower = [k.lower() for k in keywords]
# Strategy 1: Paginate through /specifications, filter client-side
console.print(" Searching W3C specifications catalog...")
page = 1
max_pages = 17 # ~1679 specs total
checked = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
@@ -59,93 +87,72 @@ class W3CFetcher:
MofNCompleteColumn(),
console=console,
) as progress:
task = progress.add_task("Fetching W3C specs...", total=len(self.groups))
task = progress.add_task("W3C catalog...", total=max_pages)
for group in self.groups:
progress.update(task, description=f"W3C group: {group}")
specs = self._fetch_group_specs(group)
for spec in specs:
# Client-side keyword filter on title + description
haystack = (spec.title + " " + spec.abstract).lower()
if any(kw in haystack for kw in kw_lower):
if since and spec.time and spec.time < since:
continue
if spec.name not in seen:
seen[spec.name] = spec
progress.advance(task)
while page <= max_pages:
progress.update(task, description=f"W3C catalog page {page}/{max_pages}")
try:
resp = self.client.get(
f"{W3C_API}/specifications",
params={"format": "json", "page": page},
headers={"Accept": "application/json"},
)
resp.raise_for_status()
data = resp.json()
except httpx.HTTPError as e:
console.print(f"[yellow]W3C API error on page {page}: {e}[/]")
break
console.print(f"Found [bold green]{len(seen)}[/] W3C specs matching keywords")
return list(seen.values())
def _fetch_group_specs(self, group_shortname: str) -> list[SourceDocument]:
"""Fetch all specifications for a W3C group."""
url = f"{W3C_API}/groups/{group_shortname}/specifications"
specs: list[SourceDocument] = []
try:
page = 1
while True:
resp = self.client.get(
url,
params={"format": "json", "page": page},
headers={"Accept": "application/json"},
)
resp.raise_for_status()
data = resp.json()
spec_list = data if isinstance(data, list) else data.get("_links", {}).get("specifications", [])
if not spec_list:
# Try alternate response shape
spec_list = data.get("specifications", [])
spec_list = data.get("_links", {}).get("specifications", [])
if not spec_list:
break
max_pages = data.get("pages", max_pages)
progress.update(task, total=max_pages)
for item in spec_list:
title = item.get("title", "")
href = item.get("href", "")
shortname = item.get("shortname", "")
title = item.get("title", shortname)
if not shortname and href:
# Extract shortname from href like /specifications/webnn
parts = href.rstrip("/").split("/")
shortname = parts[-1] if parts else ""
shortname = href.rstrip("/").split("/")[-1] if href else ""
if not shortname:
continue
checked += 1
# Fetch spec detail for abstract/description
# Quick keyword check on title
if not any(kw in title.lower() for kw in kw_lower):
continue
# Fetch detail for description, URL, dates
detail = self._fetch_spec_detail(shortname)
abstract = detail.get("description", title)
spec_url = detail.get("editor-draft", detail.get("url", f"https://www.w3.org/TR/{shortname}/"))
status = detail.get("status", "")
abstract = _strip_html(detail.get("description", title))
spec_url = detail.get("editor-draft") or detail.get("url") or f"https://www.w3.org/TR/{shortname}/"
date = detail.get("date", "")
status = detail.get("status", "")
specs.append(
SourceDocument(
name=f"w3c-{shortname}",
title=title,
if since and date and date < since:
continue
name = f"w3c-{shortname}"
if name not in seen:
seen[name] = SourceDocument(
name=name,
title=detail.get("title", title),
abstract=abstract,
source="w3c",
source_id=shortname,
source_url=spec_url,
time=date,
doc_status=status,
extra={"group": group_shortname},
)
)
time_mod.sleep(0.3)
# Check pagination
pages = data.get("pages", 1) if isinstance(data, dict) else 1
if page >= pages:
break
time_mod.sleep(0.15)
progress.advance(task)
page += 1
time_mod.sleep(0.3)
time_mod.sleep(0.2)
except httpx.HTTPError as e:
console.print(f"[yellow]W3C API error for {group_shortname}: {e}[/]")
return specs
console.print(f" Checked {checked} W3C specs, found [bold green]{len(seen)}[/] matching keywords")
return list(seen.values())
def _fetch_spec_detail(self, shortname: str) -> dict:
"""Fetch detail for a single spec."""
@@ -156,13 +163,14 @@ class W3CFetcher:
)
resp.raise_for_status()
data = resp.json()
latest = data.get("_links", {}).get("latest-version", {})
return {
"description": data.get("description", ""),
"title": data.get("title", shortname),
"editor-draft": data.get("editor-draft", ""),
"url": data.get("_links", {}).get("latest-version", {}).get("href", ""),
"status": data.get("_links", {}).get("latest-version", {}).get("status", ""),
"date": data.get("_links", {}).get("latest-version", {}).get("date", ""),
"url": latest.get("href", ""),
"status": latest.get("title", ""),
"date": latest.get("href", "").rstrip("/").split("/")[-1] if latest.get("href") else "",
}
except httpx.HTTPError:
return {}