Python CLI tool that fetches AI/agent-related Internet-Drafts from the IETF Datatracker, rates them using Claude, generates embeddings via Ollama for similarity/clustering, and produces markdown reports. Features: - Fetch drafts by keyword from Datatracker API with full text download - Batch analysis with Claude (token-optimized, responses cached in SQLite) - Embedding-based similarity search and overlap cluster detection - Reports: overview, landscape by category, overlap clusters, weekly digest - SQLite with FTS5 for full-text search across 260 tracked drafts Initial analysis of 260 drafts reveals OAuth agent auth (13 drafts) and agent gateway/collaboration (10 drafts) as the most crowded clusters, while AI safety/alignment is underserved with the highest quality scores. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
205 lines · 6.9 KiB · Python
"""Datatracker API client — search, fetch metadata, download full text."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time as time_mod
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from rich.console import Console
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
|
|
|
|
from .config import Config
|
|
from .models import Draft
|
|
|
|
# Root of the Datatracker REST API (JSON responses, offset-paginated).
API_BASE = "https://datatracker.ietf.org/api/v1"

# Archive location serving plain-text copies of Internet-Drafts.
TEXT_BASE = "https://www.ietf.org/archive/id"

# Document fields searched per keyword (substring match on each).
SEARCH_FIELDS = ("name__contains", "abstract__contains")

# Shared Rich console for all fetcher progress/status output.
console = Console()
|
|
|
|
|
|
class Fetcher:
    """Client for the IETF Datatracker HTTP API.

    Wraps keyword search (with pagination and deduplication), single-draft
    metadata fetch, plain-text download, and working-group resolution.
    A single ``httpx.Client`` is reused for connection pooling; call
    :meth:`close` when finished.
    """

    def __init__(self, config: Config | None = None):
        """Create a fetcher.

        Args:
            config: Optional configuration; defaults to ``Config.load()``.
        """
        self.config = config or Config.load()
        # follow_redirects: draft text URLs may redirect between archive
        # locations; 30 s timeout covers slow full-text downloads.
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        # group API URI -> acronym/name. "" is cached for failed lookups
        # too, so each URI costs at most one request per Fetcher lifetime.
        self._group_cache: dict[str, str] = {}

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self.client.close()

    # --- Search & fetch metadata ---

    def search_drafts(
        self,
        keywords: list[str] | None = None,
        since: str | None = None,
        limit_per_keyword: int = 200,
    ) -> list[Draft]:
        """Search for drafts matching keywords. Deduplicates by name.

        Each keyword is searched once per field in ``SEARCH_FIELDS`` (draft
        name and abstract); the first hit for a given draft name wins.

        Args:
            keywords: Search terms; defaults to ``config.search_keywords``.
            since: Lower bound for document time; defaults to
                ``config.fetch_since``.
            limit_per_keyword: Cap on results per (keyword, field) query.

        Returns:
            Unique drafts in first-seen order.
        """
        keywords = keywords or self.config.search_keywords
        since = since or self.config.fetch_since
        seen: dict[str, Draft] = {}

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            # Search both name and abstract for each keyword.
            searches = [(kw, field) for kw in keywords for field in SEARCH_FIELDS]
            task = progress.add_task("Searching Datatracker...", total=len(searches))

            for kw, search_field in searches:
                progress.update(
                    task, description=f"Searching {search_field.split('__')[0]}: {kw}"
                )
                for d in self._paginated_search(search_field, kw, since, limit_per_keyword):
                    # Keep the first Draft seen for each name.
                    seen.setdefault(d.name, d)
                progress.advance(task)

        console.print(f"Found [bold green]{len(seen)}[/] unique drafts")
        return list(seen.values())

    def _paginated_search(
        self,
        search_field: str,
        keyword: str,
        since: str,
        max_results: int,
    ) -> list[Draft]:
        """Collect up to *max_results* drafts for one (field, keyword) query.

        Follows the API's offset pagination, sleeping ``config.fetch_delay``
        between pages. HTTP errors end the search early with whatever was
        collected so far (best-effort; the error is printed, not raised).
        """
        results: list[Draft] = []
        offset = 0
        page_size = 100

        while offset < max_results:
            params = {
                "format": "json",
                search_field: keyword,
                "time__gte": since,
                "type__slug": "draft",
                # Never request more than the remaining budget.
                "limit": min(page_size, max_results - offset),
                "offset": offset,
            }
            try:
                resp = self.client.get(f"{API_BASE}/doc/document/", params=params)
                resp.raise_for_status()
            except httpx.HTTPError as e:
                console.print(f"[red]API error: {e}[/]")
                break

            data = resp.json()
            objects = data.get("objects", [])
            if not objects:
                break

            results.extend(self._api_obj_to_draft(obj) for obj in objects)

            offset += len(objects)
            # The API reports a "next" URL in meta while more pages remain.
            if not data.get("meta", {}).get("next"):
                break
            time_mod.sleep(self.config.fetch_delay)

        return results

    def fetch_draft(self, name: str) -> Draft | None:
        """Fetch a single draft's metadata by name; None on HTTP error."""
        try:
            resp = self.client.get(
                f"{API_BASE}/doc/document/{name}/", params={"format": "json"}
            )
            resp.raise_for_status()
            return self._api_obj_to_draft(resp.json())
        except httpx.HTTPError as e:
            console.print(f"[red]Error fetching {name}: {e}[/]")
            return None

    # --- Full text ---

    def download_full_text(self, draft: Draft) -> str | None:
        """Download the plain text of a draft.

        Tries ``draft.text_url`` first, then falls back to the archive URL
        without a revision suffix. Returns None if both attempts fail.
        """
        try:
            resp = self.client.get(draft.text_url)
            resp.raise_for_status()
            return resp.text
        except httpx.HTTPError:
            # Try without revision if it fails
            try:
                alt_url = f"{TEXT_BASE}/{draft.name}.txt"
                resp = self.client.get(alt_url)
                resp.raise_for_status()
                return resp.text
            except httpx.HTTPError as e:
                console.print(f"[dim]Could not download text for {draft.name}: {e}[/]")
                return None

    def download_texts(self, drafts: list[Draft]) -> dict[str, str]:
        """Download full text for multiple drafts. Returns {name: text}.

        Drafts whose text cannot be fetched are omitted from the result.
        A ``fetch_delay`` sleep separates requests.
        """
        results: dict[str, str] = {}
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Downloading draft texts...", total=len(drafts))
            for draft in drafts:
                text = self.download_full_text(draft)
                if text:
                    results[draft.name] = text
                progress.advance(task)
                time_mod.sleep(self.config.fetch_delay)
        console.print(f"Downloaded [bold green]{len(results)}[/] / {len(drafts)} texts")
        return results

    # --- Group resolution ---

    def resolve_group(self, group_uri: str) -> str:
        """Resolve a group API URI to a group acronym/name.

        Results are memoized -- including failures, cached as "" -- so a
        given URI triggers at most one network request per Fetcher.
        """
        if not group_uri:
            return ""
        if group_uri in self._group_cache:
            return self._group_cache[group_uri]
        name = ""
        try:
            resp = self.client.get(
                f"https://datatracker.ietf.org{group_uri}", params={"format": "json"}
            )
            resp.raise_for_status()
            data = resp.json()  # parse the body once (was parsed twice)
            name = data.get("acronym", data.get("name", ""))
            time_mod.sleep(self.config.fetch_delay)
        except httpx.HTTPError:
            pass  # fall through: cache "" so we don't retry this URI forever
        self._group_cache[group_uri] = name
        return name

    # --- Helpers ---

    def _api_obj_to_draft(self, obj: dict) -> Draft:
        """Map a raw Datatracker document JSON object to a Draft."""
        return Draft(
            name=obj.get("name", ""),
            rev=obj.get("rev", "00"),
            title=obj.get("title", ""),
            # The API may return an explicit null abstract; guard before
            # .strip() to avoid AttributeError on None.
            abstract=(obj.get("abstract") or "").strip(),
            time=obj.get("time", ""),
            dt_id=obj.get("id"),
            pages=obj.get("pages"),
            words=obj.get("words"),
            group=None,  # Resolved lazily via resolve_group()
            group_uri=obj.get("group", ""),
            expires=obj.get("expires"),
            ad=obj.get("ad"),
            shepherd=obj.get("shepherd"),
            states=[s for s in (obj.get("states") or []) if isinstance(s, str)],
            fetched_at=datetime.now(timezone.utc).isoformat(),
        )
|