Observatory update: 434 docs, fix W3C fetcher, regenerate dashboard
- Fixed the W3C fetcher to paginate the /specifications endpoint (the group endpoints require type prefixes such as cg/ and wg/, which were not in the config)
- Fetched 72 new IETF drafts + 1 W3C spec, all analyzed and embedded
- Regenerated dashboard with updated data
- Total: 434 docs, 11 gaps, 1907 ideas

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,23 @@ W3C_API = "https://api.w3.org"
|
||||
|
||||
console = Console()
|
||||
|
||||
# AI/agent-relevant W3C groups with their full API type prefix.
#
# Maps a group's bare shortname to its "<type>/<shortname>" form.  The type
# prefix ("cg" or "wg" below) must be part of the path when a group is
# addressed through the W3C API; a bare shortname alone is not sufficient.
W3C_AI_GROUPS = {
    "webagents": "cg/webagents",  # Autonomous Agents on the Web
    "agentprotocol": "cg/agentprotocol",  # AI Agent Protocol
    "agentic-arbitration": "cg/agentic-arbitration",  # Agentic Arbitration Protocol
    "ai-content-disclosure": "cg/ai-content-disclosure",
    "aikr": "cg/aikr",  # AI Knowledge Representation
    "ai-web-visibility": "cg/ai-web-visibility",
    "aiwss": "cg/aiwss",  # AI-Driven Web Standards
    "cogai": "cg/cogai",  # Cognitive AI
    "credentials": "cg/credentials",  # Credentials CG
    "did": "wg/did",  # Decentralized Identifier WG
    "vc": "wg/vc",  # Verifiable Credentials WG
    "webmachinelearning": "wg/webmachinelearning",  # Web Machine Learning WG
    "wot": "wg/wot",  # Web of Things WG
}
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
"""Minimal HTML tag stripper — no heavy dependencies."""
|
||||
@@ -38,20 +55,31 @@ def _strip_html(html: str) -> str:
|
||||
|
||||
|
||||
class W3CFetcher:
|
||||
"""Fetch specs from the W3C public API (no auth needed)."""
|
||||
"""Fetch specs from the W3C public API (no auth needed).
|
||||
|
||||
Two strategies:
|
||||
1. Browse /specifications paginated, filter client-side by keywords
|
||||
2. Fetch specs linked from known AI-relevant groups
|
||||
Both are combined for maximum coverage.
|
||||
"""
|
||||
|
||||
def __init__(self, config: Config | None = None):
|
||||
self.config = config or Config.load()
|
||||
self.client = httpx.Client(timeout=30, follow_redirects=True)
|
||||
self.groups = self.config.w3c_groups
|
||||
|
||||
def search(
|
||||
self, keywords: list[str], since: str | None = None
|
||||
) -> list[SourceDocument]:
|
||||
"""Fetch specs from AI-relevant W3C groups, filtered by keywords."""
|
||||
"""Fetch AI-relevant W3C specs via keyword search over /specifications."""
|
||||
seen: dict[str, SourceDocument] = {}
|
||||
kw_lower = [k.lower() for k in keywords]
|
||||
|
||||
# Strategy 1: Paginate through /specifications, filter client-side
|
||||
console.print(" Searching W3C specifications catalog...")
|
||||
page = 1
|
||||
max_pages = 17 # ~1679 specs total
|
||||
checked = 0
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
@@ -59,93 +87,72 @@ class W3CFetcher:
|
||||
MofNCompleteColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Fetching W3C specs...", total=len(self.groups))
|
||||
task = progress.add_task("W3C catalog...", total=max_pages)
|
||||
|
||||
for group in self.groups:
|
||||
progress.update(task, description=f"W3C group: {group}")
|
||||
specs = self._fetch_group_specs(group)
|
||||
for spec in specs:
|
||||
# Client-side keyword filter on title + description
|
||||
haystack = (spec.title + " " + spec.abstract).lower()
|
||||
if any(kw in haystack for kw in kw_lower):
|
||||
if since and spec.time and spec.time < since:
|
||||
continue
|
||||
if spec.name not in seen:
|
||||
seen[spec.name] = spec
|
||||
progress.advance(task)
|
||||
while page <= max_pages:
|
||||
progress.update(task, description=f"W3C catalog page {page}/{max_pages}")
|
||||
try:
|
||||
resp = self.client.get(
|
||||
f"{W3C_API}/specifications",
|
||||
params={"format": "json", "page": page},
|
||||
headers={"Accept": "application/json"},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
except httpx.HTTPError as e:
|
||||
console.print(f"[yellow]W3C API error on page {page}: {e}[/]")
|
||||
break
|
||||
|
||||
console.print(f"Found [bold green]{len(seen)}[/] W3C specs matching keywords")
|
||||
return list(seen.values())
|
||||
|
||||
def _fetch_group_specs(self, group_shortname: str) -> list[SourceDocument]:
|
||||
"""Fetch all specifications for a W3C group."""
|
||||
url = f"{W3C_API}/groups/{group_shortname}/specifications"
|
||||
specs: list[SourceDocument] = []
|
||||
|
||||
try:
|
||||
page = 1
|
||||
while True:
|
||||
resp = self.client.get(
|
||||
url,
|
||||
params={"format": "json", "page": page},
|
||||
headers={"Accept": "application/json"},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
spec_list = data if isinstance(data, list) else data.get("_links", {}).get("specifications", [])
|
||||
if not spec_list:
|
||||
# Try alternate response shape
|
||||
spec_list = data.get("specifications", [])
|
||||
spec_list = data.get("_links", {}).get("specifications", [])
|
||||
if not spec_list:
|
||||
break
|
||||
|
||||
max_pages = data.get("pages", max_pages)
|
||||
progress.update(task, total=max_pages)
|
||||
|
||||
for item in spec_list:
|
||||
title = item.get("title", "")
|
||||
href = item.get("href", "")
|
||||
shortname = item.get("shortname", "")
|
||||
title = item.get("title", shortname)
|
||||
|
||||
if not shortname and href:
|
||||
# Extract shortname from href like /specifications/webnn
|
||||
parts = href.rstrip("/").split("/")
|
||||
shortname = parts[-1] if parts else ""
|
||||
|
||||
shortname = href.rstrip("/").split("/")[-1] if href else ""
|
||||
if not shortname:
|
||||
continue
|
||||
checked += 1
|
||||
|
||||
# Fetch spec detail for abstract/description
|
||||
# Quick keyword check on title
|
||||
if not any(kw in title.lower() for kw in kw_lower):
|
||||
continue
|
||||
|
||||
# Fetch detail for description, URL, dates
|
||||
detail = self._fetch_spec_detail(shortname)
|
||||
abstract = detail.get("description", title)
|
||||
spec_url = detail.get("editor-draft", detail.get("url", f"https://www.w3.org/TR/{shortname}/"))
|
||||
status = detail.get("status", "")
|
||||
abstract = _strip_html(detail.get("description", title))
|
||||
spec_url = detail.get("editor-draft") or detail.get("url") or f"https://www.w3.org/TR/{shortname}/"
|
||||
date = detail.get("date", "")
|
||||
status = detail.get("status", "")
|
||||
|
||||
specs.append(
|
||||
SourceDocument(
|
||||
name=f"w3c-{shortname}",
|
||||
title=title,
|
||||
if since and date and date < since:
|
||||
continue
|
||||
|
||||
name = f"w3c-{shortname}"
|
||||
if name not in seen:
|
||||
seen[name] = SourceDocument(
|
||||
name=name,
|
||||
title=detail.get("title", title),
|
||||
abstract=abstract,
|
||||
source="w3c",
|
||||
source_id=shortname,
|
||||
source_url=spec_url,
|
||||
time=date,
|
||||
doc_status=status,
|
||||
extra={"group": group_shortname},
|
||||
)
|
||||
)
|
||||
time_mod.sleep(0.3)
|
||||
|
||||
# Check pagination
|
||||
pages = data.get("pages", 1) if isinstance(data, dict) else 1
|
||||
if page >= pages:
|
||||
break
|
||||
time_mod.sleep(0.15)
|
||||
|
||||
progress.advance(task)
|
||||
page += 1
|
||||
time_mod.sleep(0.3)
|
||||
time_mod.sleep(0.2)
|
||||
|
||||
except httpx.HTTPError as e:
|
||||
console.print(f"[yellow]W3C API error for {group_shortname}: {e}[/]")
|
||||
|
||||
return specs
|
||||
console.print(f" Checked {checked} W3C specs, found [bold green]{len(seen)}[/] matching keywords")
|
||||
return list(seen.values())
|
||||
|
||||
def _fetch_spec_detail(self, shortname: str) -> dict:
|
||||
"""Fetch detail for a single spec."""
|
||||
@@ -156,13 +163,14 @@ class W3CFetcher:
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
latest = data.get("_links", {}).get("latest-version", {})
|
||||
return {
|
||||
"description": data.get("description", ""),
|
||||
"title": data.get("title", shortname),
|
||||
"editor-draft": data.get("editor-draft", ""),
|
||||
"url": data.get("_links", {}).get("latest-version", {}).get("href", ""),
|
||||
"status": data.get("_links", {}).get("latest-version", {}).get("status", ""),
|
||||
"date": data.get("_links", {}).get("latest-version", {}).get("date", ""),
|
||||
"url": latest.get("href", ""),
|
||||
"status": latest.get("title", ""),
|
||||
"date": latest.get("href", "").rstrip("/").split("/")[-1] if latest.get("href") else "",
|
||||
}
|
||||
except httpx.HTTPError:
|
||||
return {}
|
||||
|
||||
Reference in New Issue
Block a user