New features: - 12 interactive visualizations (ietf viz): t-SNE landscape, similarity heatmap, score distributions, timeline, bubble explorer, radar charts, author network graph, category treemap, quality vs overlap, org bar chart, ideas chart, and interactive draft browser - Interactive draft browser (browser.html): filterable by category, keyword, score sliders with sortable table and expandable detail rows - arXiv paper (paper/main.tex): 13-page manuscript with all findings - Gap analysis: 12 identified under-addressed areas - Author network: collaboration graph, org contributions, cross-org analysis - Draft generation from gaps (ietf draft-gen) - Auto-load .env for API keys (python-dotenv) New modules: visualize.py, authors.py, draftgen.py New reports: timeline, overlap-matrix, authors, gaps New deps: plotly, matplotlib, seaborn, scipy, scikit-learn, networkx, python-dotenv Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
138 lines · 5.1 KiB · Python
"""Author network — fetch authors from Datatracker, build collaboration graph."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time as time_mod
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from rich.console import Console
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
|
|
|
|
from .config import Config
|
|
from .db import Database
|
|
from .models import Author
|
|
|
|
# Base URL of the public IETF Datatracker REST API (v1, read-only JSON).
API_BASE = "https://datatracker.ietf.org/api/v1"

# Shared Rich console for all status/progress output in this module.
console = Console()
|
|
|
|
|
|
class AuthorNetwork:
    """Fetch draft authors from the IETF Datatracker and persist them.

    Wraps an ``httpx.Client`` plus the project database. Person lookups are
    cached per-instance so each Datatracker person costs at most one HTTP
    request per session. May be used as a context manager to guarantee the
    HTTP connection pool is closed.
    """

    def __init__(self, config: Config | None = None, db: Database | None = None):
        """Create a fetcher.

        Args:
            config: Project configuration; loaded from disk when omitted.
            db: Database handle; constructed from ``config`` when omitted.
        """
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        # follow_redirects: Datatracker resource URIs redirect to canonical paths.
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        # person_id -> Author; repeated co-authors cost a single request total.
        self._person_cache: dict[int, Author] = {}

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self.client.close()

    # Context-manager support so callers can write
    # `with AuthorNetwork() as net:` and get deterministic cleanup.
    def __enter__(self) -> "AuthorNetwork":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _extract_person_id(self, person_uri: str) -> int | None:
        """Extract person_id from a URI like /api/v1/person/person/12345/.

        Returns None for empty or malformed URIs instead of raising.
        """
        if not person_uri:
            return None
        try:
            # Last path segment of the trimmed URI is the numeric id.
            return int(person_uri.strip("/").split("/")[-1])
        except (ValueError, IndexError):
            return None

    def fetch_person(self, person_id: int) -> Author | None:
        """Fetch a person's details from Datatracker.

        Results are cached per instance; a cache hit performs no I/O and no
        rate-limit sleep. Returns None on HTTP errors or an undecodable
        JSON body (logged dimly rather than raised, so one bad record does
        not abort a bulk fetch).
        """
        if person_id in self._person_cache:
            return self._person_cache[person_id]

        try:
            resp = self.client.get(
                f"{API_BASE}/person/person/{person_id}/",
                params={"format": "json"},
            )
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError from a malformed body,
            # which the original HTTPError-only handler let escape.
            console.print(f"[dim]Could not fetch person {person_id}: {e}[/]")
            return None

        author = Author(
            person_id=person_id,
            name=data.get("name", ""),
            ascii_name=data.get("ascii", ""),
            affiliation="",  # Will be set from documentauthor
            resource_uri=data.get("resource_uri", ""),
            fetched_at=datetime.now(timezone.utc).isoformat(),
        )
        self._person_cache[person_id] = author
        # Be polite to the public API: one configurable pause per real fetch.
        time_mod.sleep(self.config.fetch_delay)
        return author

    def fetch_authors_for_draft(self, draft_name: str) -> list[tuple[Author, int, str]]:
        """Fetch authors for a single draft. Returns [(Author, order, affiliation)].

        The affiliation comes from the documentauthor record (per-draft),
        overriding the person record's (empty) affiliation. Returns [] on
        HTTP errors. NOTE(review): limit=50 assumes no draft has more than
        50 listed authors — confirm against Datatracker data if that matters.
        """
        try:
            resp = self.client.get(
                f"{API_BASE}/doc/documentauthor/",
                params={"document__name": draft_name, "format": "json", "limit": 50},
            )
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers a malformed JSON body (json.JSONDecodeError).
            console.print(f"[dim]Could not fetch authors for {draft_name}: {e}[/]")
            return []

        results: list[tuple[Author, int, str]] = []
        for obj in data.get("objects", []):
            person_id = self._extract_person_id(obj.get("person", ""))
            if person_id is None:
                continue

            affiliation = obj.get("affiliation", "")
            order = obj.get("order", 1)

            author = self.fetch_person(person_id)
            if author is None:
                continue

            # Rebuild the Author with the per-draft affiliation so the cached
            # copy (affiliation="") is never mutated or shared across drafts.
            author_with_aff = Author(
                person_id=author.person_id,
                name=author.name,
                ascii_name=author.ascii_name,
                affiliation=affiliation or author.affiliation,
                resource_uri=author.resource_uri,
                fetched_at=author.fetched_at,
            )
            results.append((author_with_aff, order, affiliation))

        # One extra pause per draft on top of fetch_person's per-person pause.
        time_mod.sleep(self.config.fetch_delay)
        return results

    def fetch_all_authors(self, limit: int = 500) -> int:
        """Fetch authors for all drafts missing author data.

        Args:
            limit: Maximum number of drafts to process in this run.

        Returns:
            Number of drafts for which at least one author was stored.
        """
        missing = self.db.drafts_without_authors(limit=limit)
        if not missing:
            console.print("All drafts already have author data.")
            return 0

        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Fetching authors...", total=len(missing))
            for draft_name in missing:
                # Show only the short tail of the draft name to keep the bar tidy.
                progress.update(task, description=f"Authors: {draft_name.split('-')[-1][:15]}")
                authors = self.fetch_authors_for_draft(draft_name)
                for author, order, affiliation in authors:
                    self.db.upsert_author(author)
                    self.db.upsert_draft_author(draft_name, author.person_id, order, affiliation)
                if authors:
                    count += 1
                progress.advance(task)

        console.print(f"Fetched authors for [bold green]{count}[/] drafts "
                      f"({self.db.author_count()} unique authors)")
        return count