New features: - 12 interactive visualizations (ietf viz): t-SNE landscape, similarity heatmap, score distributions, timeline, bubble explorer, radar charts, author network graph, category treemap, quality vs overlap, org bar chart, ideas chart, and interactive draft browser - Interactive draft browser (browser.html): filterable by category, keyword, score sliders with sortable table and expandable detail rows - arXiv paper (paper/main.tex): 13-page manuscript with all findings - Gap analysis: 12 identified under-addressed areas - Author network: collaboration graph, org contributions, cross-org analysis - Draft generation from gaps (ietf draft-gen) - Auto-load .env for API keys (python-dotenv) New modules: visualize.py, authors.py, draftgen.py New reports: timeline, overlap-matrix, authors, gaps New deps: plotly, matplotlib, seaborn, scipy, scikit-learn, networkx, python-dotenv Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
138 lines · 5.1 KiB · Python
"""Author network — fetch authors from Datatracker, build collaboration graph."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time as time_mod
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from rich.console import Console
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
|
|
|
|
from .config import Config
|
|
from .db import Database
|
|
from .models import Author
|
|
|
|
# Base URL of the public IETF Datatracker REST API (v1, read-only JSON).
API_BASE = "https://datatracker.ietf.org/api/v1"

# Shared Rich console for all status/progress output in this module.
console = Console()
|
|
|
|
|
|
class AuthorNetwork:
    """Fetch draft authors from the IETF Datatracker and persist them.

    Wraps an ``httpx.Client`` plus the project database. Person lookups are
    cached per-instance so each Datatracker person costs at most one HTTP
    request per session. May be used as a context manager to guarantee the
    HTTP connection pool is closed.
    """

    def __init__(self, config: Config | None = None, db: Database | None = None):
        """Create a fetcher.

        Args:
            config: Project configuration; loaded from disk when omitted.
            db: Database handle; constructed from ``config`` when omitted.
        """
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        # follow_redirects: Datatracker resource URIs redirect to canonical paths.
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        # person_id -> Author; repeated co-authors cost a single request total.
        self._person_cache: dict[int, Author] = {}

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self.client.close()

    # Context-manager support so callers can write
    # `with AuthorNetwork() as net:` and get deterministic cleanup.
    def __enter__(self) -> "AuthorNetwork":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _extract_person_id(self, person_uri: str) -> int | None:
        """Extract person_id from a URI like /api/v1/person/person/12345/.

        Returns None for empty or malformed URIs instead of raising.
        """
        if not person_uri:
            return None
        try:
            # Last path segment of the trimmed URI is the numeric id.
            return int(person_uri.strip("/").split("/")[-1])
        except (ValueError, IndexError):
            return None

    def fetch_person(self, person_id: int) -> Author | None:
        """Fetch a person's details from Datatracker.

        Results are cached per instance; a cache hit performs no I/O and no
        rate-limit sleep. Returns None on HTTP errors or an undecodable
        JSON body (logged dimly rather than raised, so one bad record does
        not abort a bulk fetch).
        """
        if person_id in self._person_cache:
            return self._person_cache[person_id]

        try:
            resp = self.client.get(
                f"{API_BASE}/person/person/{person_id}/",
                params={"format": "json"},
            )
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers json.JSONDecodeError from a malformed body,
            # which the original HTTPError-only handler let escape.
            console.print(f"[dim]Could not fetch person {person_id}: {e}[/]")
            return None

        author = Author(
            person_id=person_id,
            name=data.get("name", ""),
            ascii_name=data.get("ascii", ""),
            affiliation="",  # Will be set from documentauthor
            resource_uri=data.get("resource_uri", ""),
            fetched_at=datetime.now(timezone.utc).isoformat(),
        )
        self._person_cache[person_id] = author
        # Be polite to the public API: one configurable pause per real fetch.
        time_mod.sleep(self.config.fetch_delay)
        return author

    def fetch_authors_for_draft(self, draft_name: str) -> list[tuple[Author, int, str]]:
        """Fetch authors for a single draft. Returns [(Author, order, affiliation)].

        The affiliation comes from the documentauthor record (per-draft),
        overriding the person record's (empty) affiliation. Returns [] on
        HTTP errors. NOTE(review): limit=50 assumes no draft has more than
        50 listed authors — confirm against Datatracker data if that matters.
        """
        try:
            resp = self.client.get(
                f"{API_BASE}/doc/documentauthor/",
                params={"document__name": draft_name, "format": "json", "limit": 50},
            )
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers a malformed JSON body (json.JSONDecodeError).
            console.print(f"[dim]Could not fetch authors for {draft_name}: {e}[/]")
            return []

        results: list[tuple[Author, int, str]] = []
        for obj in data.get("objects", []):
            person_id = self._extract_person_id(obj.get("person", ""))
            if person_id is None:
                continue

            affiliation = obj.get("affiliation", "")
            order = obj.get("order", 1)

            author = self.fetch_person(person_id)
            if author is None:
                continue

            # Rebuild the Author with the per-draft affiliation so the cached
            # copy (affiliation="") is never mutated or shared across drafts.
            author_with_aff = Author(
                person_id=author.person_id,
                name=author.name,
                ascii_name=author.ascii_name,
                affiliation=affiliation or author.affiliation,
                resource_uri=author.resource_uri,
                fetched_at=author.fetched_at,
            )
            results.append((author_with_aff, order, affiliation))

        # One extra pause per draft on top of fetch_person's per-person pause.
        time_mod.sleep(self.config.fetch_delay)
        return results

    def fetch_all_authors(self, limit: int = 500) -> int:
        """Fetch authors for all drafts missing author data.

        Args:
            limit: Maximum number of drafts to process in this run.

        Returns:
            Number of drafts for which at least one author was stored.
        """
        missing = self.db.drafts_without_authors(limit=limit)
        if not missing:
            console.print("All drafts already have author data.")
            return 0

        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Fetching authors...", total=len(missing))
            for draft_name in missing:
                # Show only the short tail of the draft name to keep the bar tidy.
                progress.update(task, description=f"Authors: {draft_name.split('-')[-1][:15]}")
                authors = self.fetch_authors_for_draft(draft_name)
                for author, order, affiliation in authors:
                    self.db.upsert_author(author)
                    self.db.upsert_draft_author(draft_name, author.person_id, order, affiliation)
                if authors:
                    count += 1
                progress.advance(task)

        console.print(f"Fetched authors for [bold green]{count}[/] drafts "
                      f"({self.db.author_count()} unique authors)")
        return count