v0.2.0: visualizations, interactive browser, arXiv paper, gap analysis
New features:
- 12 interactive visualizations (ietf viz): t-SNE landscape, similarity heatmap, score distributions, timeline, bubble explorer, radar charts, author network graph, category treemap, quality vs overlap, org bar chart, ideas chart, and interactive draft browser
- Interactive draft browser (browser.html): filterable by category, keyword, and score sliders, with a sortable table and expandable detail rows
- arXiv paper (paper/main.tex): 13-page manuscript with all findings
- Gap analysis: 12 identified under-addressed areas
- Author network: collaboration graph, org contributions, cross-org analysis
- Draft generation from gaps (ietf draft-gen)
- Auto-load .env for API keys (python-dotenv)

New modules: visualize.py, authors.py, draftgen.py
New reports: timeline, overlap-matrix, authors, gaps
New deps: plotly, matplotlib, seaborn, scipy, scikit-learn, networkx, python-dotenv

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
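The .env auto-load called out above normally amounts to one call at startup via python-dotenv's standard API. A minimal sketch, assuming a call site like the package entry point (the exact location and key name are not shown in this diff):

# Minimal sketch of the .env auto-load (assumed call site; standard python-dotenv API).
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory if present; does not override existing env vars
api_key = os.environ.get("LLM_API_KEY")  # placeholder variable name; the real key name isn't shown here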
src/ietf_analyzer/authors.py (new file, 137 lines)
@@ -0,0 +1,137 @@
"""Author network — fetch authors from Datatracker, build collaboration graph."""

from __future__ import annotations

import time as time_mod
from datetime import datetime, timezone

import httpx
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn

from .config import Config
from .db import Database
from .models import Author

API_BASE = "https://datatracker.ietf.org/api/v1"

console = Console()


class AuthorNetwork:
    def __init__(self, config: Config | None = None, db: Database | None = None):
        self.config = config or Config.load()
        self.db = db or Database(self.config)
        self.client = httpx.Client(timeout=30, follow_redirects=True)
        self._person_cache: dict[int, Author] = {}

    def close(self) -> None:
        self.client.close()

    def _extract_person_id(self, person_uri: str) -> int | None:
        """Extract person_id from a URI like /api/v1/person/person/12345/."""
        if not person_uri:
            return None
        parts = person_uri.strip("/").split("/")
        try:
            return int(parts[-1])
        except (ValueError, IndexError):
            return None

    def fetch_person(self, person_id: int) -> Author | None:
        """Fetch a person's details from Datatracker."""
        if person_id in self._person_cache:
            return self._person_cache[person_id]

        try:
            resp = self.client.get(
                f"{API_BASE}/person/person/{person_id}/",
                params={"format": "json"},
            )
            resp.raise_for_status()
            data = resp.json()
            author = Author(
                person_id=person_id,
                name=data.get("name", ""),
                ascii_name=data.get("ascii", ""),
                affiliation="",  # Will be set from documentauthor
                resource_uri=data.get("resource_uri", ""),
                fetched_at=datetime.now(timezone.utc).isoformat(),
            )
            self._person_cache[person_id] = author
            time_mod.sleep(self.config.fetch_delay)
            return author
        except httpx.HTTPError as e:
            console.print(f"[dim]Could not fetch person {person_id}: {e}[/]")
            return None

    def fetch_authors_for_draft(self, draft_name: str) -> list[tuple[Author, int, str]]:
        """Fetch authors for a single draft. Returns [(Author, order, affiliation)]."""
        try:
            resp = self.client.get(
                f"{API_BASE}/doc/documentauthor/",
                params={"document__name": draft_name, "format": "json", "limit": 50},
            )
            resp.raise_for_status()
            data = resp.json()
        except httpx.HTTPError as e:
            console.print(f"[dim]Could not fetch authors for {draft_name}: {e}[/]")
            return []

        results: list[tuple[Author, int, str]] = []
        for obj in data.get("objects", []):
            person_uri = obj.get("person", "")
            person_id = self._extract_person_id(person_uri)
            if person_id is None:
                continue

            affiliation = obj.get("affiliation", "")
            order = obj.get("order", 1)

            author = self.fetch_person(person_id)
            if author is None:
                continue

            # Use the affiliation from the document author record
            author_with_aff = Author(
                person_id=author.person_id,
                name=author.name,
                ascii_name=author.ascii_name,
                affiliation=affiliation or author.affiliation,
                resource_uri=author.resource_uri,
                fetched_at=author.fetched_at,
            )
            results.append((author_with_aff, order, affiliation))

        time_mod.sleep(self.config.fetch_delay)
        return results

    def fetch_all_authors(self, limit: int = 500) -> int:
        """Fetch authors for all drafts missing author data."""
        missing = self.db.drafts_without_authors(limit=limit)
        if not missing:
            console.print("All drafts already have author data.")
            return 0

        count = 0
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("Fetching authors...", total=len(missing))
            for draft_name in missing:
                progress.update(task, description=f"Authors: {draft_name.split('-')[-1][:15]}")
                authors = self.fetch_authors_for_draft(draft_name)
                for author, order, affiliation in authors:
                    self.db.upsert_author(author)
                    self.db.upsert_draft_author(draft_name, author.person_id, order, affiliation)
                if authors:
                    count += 1
                progress.advance(task)

        console.print(f"Fetched authors for [bold green]{count}[/] drafts "
                      f"({self.db.author_count()} unique authors)")
        return count
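A minimal usage sketch for the new module, assuming the `authors` report wires it up roughly like this (the CLI glue lives outside this file; only the calls shown in the diff above are used):

# Minimal usage sketch (assumed wiring; not the actual CLI entry point).
from ietf_analyzer.authors import AuthorNetwork

net = AuthorNetwork()  # loads Config and opens the project database
try:
    updated = net.fetch_all_authors(limit=100)  # fill in author data for up to 100 drafts
    print(f"Updated {updated} drafts")
finally:
    net.close()  # always release the shared httpx client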