"""Datatracker API client — search, fetch metadata, download full text.""" from __future__ import annotations import time as time_mod from datetime import datetime, timezone import httpx from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn from .config import Config from .models import Draft API_BASE = "https://datatracker.ietf.org/api/v1" TEXT_BASE = "https://www.ietf.org/archive/id" SEARCH_FIELDS = ("name__contains", "abstract__contains") console = Console() class Fetcher: def __init__(self, config: Config | None = None): self.config = config or Config.load() self.client = httpx.Client(timeout=30, follow_redirects=True) self._group_cache: dict[str, str] = {} def close(self) -> None: self.client.close() # --- Search & fetch metadata --- def search_drafts( self, keywords: list[str] | None = None, since: str | None = None, limit_per_keyword: int = 200, ) -> list[Draft]: """Search for drafts matching keywords. Deduplicates by name.""" keywords = keywords or self.config.search_keywords since = since or self.config.fetch_since seen: dict[str, Draft] = {} with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), console=console, ) as progress: # Search both name and abstract for each keyword searches = [] for kw in keywords: for field in SEARCH_FIELDS: searches.append((kw, field)) task = progress.add_task("Searching Datatracker...", total=len(searches)) for kw, search_field in searches: progress.update(task, description=f"Searching {search_field.split('__')[0]}: {kw}") drafts = self._paginated_search(search_field, kw, since, limit_per_keyword) for d in drafts: if d.name not in seen: seen[d.name] = d progress.advance(task) console.print(f"Found [bold green]{len(seen)}[/] unique drafts") return list(seen.values()) def _paginated_search( self, search_field: str, keyword: str, since: str, max_results: int, ) -> list[Draft]: results: list[Draft] = [] offset = 0 page_size = 100 while offset < max_results: params = { "format": "json", search_field: keyword, "time__gte": since, "type__slug": "draft", "limit": min(page_size, max_results - offset), "offset": offset, } try: resp = self.client.get(f"{API_BASE}/doc/document/", params=params) resp.raise_for_status() except httpx.HTTPError as e: console.print(f"[red]API error: {e}[/]") break data = resp.json() objects = data.get("objects", []) if not objects: break for obj in objects: results.append(self._api_obj_to_draft(obj)) offset += len(objects) if not data.get("meta", {}).get("next"): break time_mod.sleep(self.config.fetch_delay) return results def fetch_draft(self, name: str) -> Draft | None: """Fetch a single draft by name.""" try: resp = self.client.get( f"{API_BASE}/doc/document/{name}/", params={"format": "json"} ) resp.raise_for_status() return self._api_obj_to_draft(resp.json()) except httpx.HTTPError as e: console.print(f"[red]Error fetching {name}: {e}[/]") return None # --- Full text --- def download_full_text(self, draft: Draft) -> str | None: """Download the plain text of a draft.""" url = draft.text_url try: resp = self.client.get(url) resp.raise_for_status() return resp.text except httpx.HTTPError: # Try without revision if it fails try: alt_url = f"{TEXT_BASE}/{draft.name}.txt" resp = self.client.get(alt_url) resp.raise_for_status() return resp.text except httpx.HTTPError as e: console.print(f"[dim]Could not download text for {draft.name}: {e}[/]") return None def download_texts(self, drafts: list[Draft]) -> dict[str, str]: """Download full text for multiple drafts. Returns {name: text}.""" results: dict[str, str] = {} with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), console=console, ) as progress: task = progress.add_task("Downloading draft texts...", total=len(drafts)) for draft in drafts: text = self.download_full_text(draft) if text: results[draft.name] = text progress.advance(task) time_mod.sleep(self.config.fetch_delay) console.print(f"Downloaded [bold green]{len(results)}[/] / {len(drafts)} texts") return results # --- Group resolution --- def resolve_group(self, group_uri: str) -> str: """Resolve a group API URI to a group acronym/name.""" if not group_uri: return "" if group_uri in self._group_cache: return self._group_cache[group_uri] try: resp = self.client.get( f"https://datatracker.ietf.org{group_uri}", params={"format": "json"} ) resp.raise_for_status() name = resp.json().get("acronym", resp.json().get("name", "")) self._group_cache[group_uri] = name time_mod.sleep(self.config.fetch_delay) return name except httpx.HTTPError: return "" # --- Helpers --- def _api_obj_to_draft(self, obj: dict) -> Draft: return Draft( name=obj.get("name", ""), rev=obj.get("rev", "00"), title=obj.get("title", ""), abstract=obj.get("abstract", "").strip(), time=obj.get("time", ""), dt_id=obj.get("id"), pages=obj.get("pages"), words=obj.get("words"), group=None, # Resolved lazily group_uri=obj.get("group", ""), expires=obj.get("expires"), ad=obj.get("ad"), shepherd=obj.get("shepherd"), states=[s for s in (obj.get("states") or []) if isinstance(s, str)], fetched_at=datetime.now(timezone.utc).isoformat(), )