Files
ietf-draft-analyzer/scripts/team-blocs-report.py
Christian Nennemann d6beb9c0a0 v0.3.0: Gap-to-Draft pipeline, Living Standards Observatory, blog series
Gap-to-Draft Pipeline (ietf pipeline):
- Context builder assembles ideas, RFC foundations, similar drafts, ecosystem vision
- Generator produces outlines + sections using rich context with Claude
- Quality gates: novelty (embedding similarity), references, format, self-rating
- Family coordinator generates 5-draft ecosystem (AEM/ATD/HITL/AEPB/APAE)
- I-D formatter with proper headers, references, 72-char wrapping

Living Standards Observatory (ietf observatory):
- Source abstraction with IETF + W3C fetchers
- 7-step update pipeline: snapshot, fetch, analyze, embed, ideas, gaps, record
- Static GitHub Pages dashboard (explorer, gap tracker, timeline)
- Weekly CI/CD automation via GitHub Actions

Also includes:
- 361 drafts (expanded from 260 with 6 new keywords), 403 authors, 1,262 ideas, 12 gaps
- Blog series (8 posts planned), reports, arXiv paper figures
- Agent team infrastructure (CLAUDE.md, scripts, dev journal)
- 5 new DB tables, schema migration, ~15 new query methods

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 00:48:57 +01:00

229 lines
8.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""Generate a detailed team blocs report.
Usage:
PYTHONPATH=src python scripts/team-blocs-report.py
PYTHONPATH=src python scripts/team-blocs-report.py --min-shared 3 --threshold 0.80
"""
import argparse
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from ietf_analyzer.config import Config
from ietf_analyzer.db import Database
from ietf_analyzer.orgs import detect_blocs, normalize_org, top_orgs_normalized
def main() -> None:
    """Generate the team-blocs markdown report and write it to disk.

    Reads author/draft co-authorship data from the project database,
    detects cohesive author blocs via ``detect_blocs``, and renders a
    markdown report containing summary statistics, a size distribution,
    an organization breakdown, per-bloc member/draft profiles, and a
    table of "bridge" authors connecting blocs. Prints the output path
    on success. Exits via argparse on invalid CLI arguments.
    """
    parser = argparse.ArgumentParser(description="Team blocs report")
    parser.add_argument("--min-shared", type=int, default=2, help="Min shared drafts to form a bloc edge (default: 2)")
    parser.add_argument("--threshold", type=float, default=0.70, help="Cohesion threshold 0-1 (default: 0.70)")
    parser.add_argument("--min-size", type=int, default=2, help="Min members per bloc (default: 2)")
    parser.add_argument("-o", "--output", default=None, help="Output path (default: data/reports/team-blocs.md)")
    args = parser.parse_args()

    cfg = Config()
    db = Database(cfg)
    out = Path(args.output) if args.output else Path(cfg.data_dir) / "reports" / "team-blocs.md"
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    draft_sets = db.author_draft_sets()
    draft_counts = db.author_draft_counts()
    total_authors = db.author_count()
    total_drafts = db.count_drafts()

    # Build rating lookup for categories (keyed by draft name).
    pairs_data = db.drafts_with_ratings(limit=500)
    rating_map = {draft.name: rating for draft, rating in pairs_data}

    blocs = detect_blocs(
        db,
        cohesion_threshold=args.threshold,
        min_size=args.min_size,
        min_shared_drafts=args.min_shared,
    )

    # Stats: union of all bloc member person-ids.
    bloc_authors = set()
    for b in blocs:
        bloc_authors |= b.member_pids
    pct_in_blocs = len(bloc_authors) / total_authors * 100 if total_authors else 0

    lines = [
        "# Team Bloc Analysis",
        # Fix: a separator was missing between the timestamp and the author
        # count ("{now}{total_authors}"), mashing them together in output.
        f"*Generated {now} — {total_authors} authors, {total_drafts} drafts*",
        f"*Parameters: cohesion >= {args.threshold:.0%}, min shared drafts >= {args.min_shared}, min size >= {args.min_size}*\n",
        "## Summary\n",
        "| Metric | Value |",
        "|--------|------:|",
        f"| Total blocs detected | {len(blocs)} |",
        f"| Authors in blocs | {len(bloc_authors)} / {total_authors} ({pct_in_blocs:.0f}%) |",
        # blocs is sorted with the largest/most-connected bloc first (index 0).
        f"| Largest bloc | {len(blocs[0].members)} members |" if blocs else "| Largest bloc | — |",
        f"| Most shared drafts | {blocs[0].shared_drafts} |" if blocs else "| Most shared drafts | — |",
        "",
    ]

    # Bloc size distribution, bucketed into fixed ranges.
    size_dist: dict[str, int] = defaultdict(int)
    for b in blocs:
        n = len(b.members)
        if n >= 10:
            size_dist["10+"] += 1
        elif n >= 5:
            size_dist["5-9"] += 1
        elif n >= 3:
            size_dist["3-4"] += 1
        else:
            size_dist["2"] += 1
    lines.extend([
        "### Bloc Size Distribution\n",
        "| Size | Count |",
        "|------|------:|",
    ])
    for label in ["10+", "5-9", "3-4", "2"]:
        if label in size_dist:
            lines.append(f"| {label} members | {size_dist[label]} |")
    lines.append("")

    # Org breakdown: group blocs by their primary organization.
    org_blocs: dict[str, list] = defaultdict(list)
    for b in blocs:
        org_blocs[b.primary_org].append(b)
    lines.extend([
        "### Blocs by Organization\n",
        "| Organization | Blocs | Total Members | Total Shared Drafts |",
        "|-------------|------:|--------------:|--------------------:|",
    ])
    # Sort organizations by total shared drafts across their blocs, descending.
    org_summary = sorted(org_blocs.items(), key=lambda x: -sum(b.shared_drafts for b in x[1]))
    for org, obs in org_summary:
        total_m = sum(len(b.members) for b in obs)
        total_s = sum(b.shared_drafts for b in obs)
        lines.append(f"| {org} | {len(obs)} | {total_m} | {total_s} |")
    lines.append("")

    # Detailed per-bloc profiles.
    lines.extend([
        "---\n",
        "## Detailed Bloc Profiles\n",
    ])
    for i, bloc in enumerate(blocs):
        lines.append(f"### {i + 1}. {bloc.label}")
        lines.append("")
        lines.append("| | |")
        lines.append("|---|---|")
        lines.append(f"| **Members** | {len(bloc.members)} |")
        lines.append(f"| **Shared Drafts** | {bloc.shared_drafts} |")
        lines.append(f"| **Cohesion** | {bloc.cohesion:.0%} |")
        lines.append(f"| **Primary Org** | {bloc.primary_org} |")
        # List all orgs only when the bloc spans more than one.
        orgs = {org for _, _, org in bloc.members if org}
        if len(orgs) > 1:
            lines.append(f"| **All Orgs** | {', '.join(sorted(orgs))} |")
        lines.append("")

        # Member table: per-author totals and in-bloc overlap.
        lines.append("**Members:**\n")
        lines.append("| Author | Organization | Drafts | In-Bloc Drafts |")
        lines.append("|--------|-------------|-------:|--------------:|")
        for pid, name, org in bloc.members:
            total_d = draft_counts.get(pid, 0)
            my_drafts = draft_sets.get(pid, set())
            # Count how many of this person's drafts have another bloc member.
            bloc_other_pids = bloc.member_pids - {pid}
            in_bloc = sum(
                1 for d in my_drafts
                if any(d in draft_sets.get(other, set()) for other in bloc_other_pids)
            )
            lines.append(f"| {name} | {org} | {total_d} | {in_bloc} |")
        lines.append("")

        # Shared drafts list: drafts with >= 2 bloc members, most-shared first.
        all_drafts: dict[str, int] = defaultdict(int)
        for pid in bloc.member_pids:
            for d in draft_sets.get(pid, set()):
                all_drafts[d] += 1
        shared_list = sorted(
            [(d, cnt) for d, cnt in all_drafts.items() if cnt >= 2],
            key=lambda x: -x[1],
        )
        if shared_list:
            lines.append("**Shared Drafts:**\n")
            lines.append("| Draft | Co-authors | Score | Categories |")
            lines.append("|-------|----------:|------:|------------|")
            for d, cnt in shared_list:
                r = rating_map.get(d)
                score = f"{r.composite_score:.1f}" if r else ""
                cats = ", ".join(r.categories[:2]) if r else ""
                lines.append(
                    f"| [{d}](https://datatracker.ietf.org/doc/{d}/) "
                    f"| {cnt}/{len(bloc.members)} | {score} | {cats} |"
                )
            lines.append("")
        lines.append("---\n")

    # Cross-bloc connections.
    lines.extend([
        "## Cross-Bloc Connections\n",
        "*Authors who bridge between different blocs or connect blocs to the wider community.*\n",
    ])
    # Find co-author pairs with >= 2 shared drafts (each pair counted once
    # via the person_id ordering in the self-join).
    coauthor_rows = db.conn.execute(
        """SELECT a1.name, da1.person_id, a2.name, da2.person_id, COUNT(*) as shared
        FROM draft_authors da1
        JOIN draft_authors da2 ON da1.draft_name = da2.draft_name
        AND da1.person_id < da2.person_id
        JOIN authors a1 ON da1.person_id = a1.person_id
        JOIN authors a2 ON da2.person_id = a2.person_id
        GROUP BY da1.person_id, da2.person_id
        HAVING shared >= 2
        ORDER BY shared DESC"""
    ).fetchall()

    # Map pid -> bloc index for fast membership lookup.
    pid_bloc: dict[int, int] = {}
    for bi, b in enumerate(blocs):
        for pid in b.member_pids:
            pid_bloc[pid] = bi

    bridges = []
    for row in coauthor_rows:
        name_a, pid_a, name_b, pid_b, shared = row
        bloc_a = pid_bloc.get(pid_a)
        bloc_b = pid_bloc.get(pid_b)
        # One in a bloc, other not — or in different blocs.
        if bloc_a is not None and bloc_b is not None and bloc_a != bloc_b:
            bridges.append((name_a, blocs[bloc_a].label, name_b, blocs[bloc_b].label, shared))
        elif bloc_a is not None and bloc_b is None:
            bridges.append((name_a, blocs[bloc_a].label, name_b, "(independent)", shared))
        elif bloc_b is not None and bloc_a is None:
            # Swap so the bloc member is always the first column.
            bridges.append((name_b, blocs[bloc_b].label, name_a, "(independent)", shared))

    if bridges:
        lines.append("| Bloc Author | Bloc | External Author | Their Affiliation | Shared |")
        lines.append("|-------------|------|-----------------|-------------------|-------:|")
        for a, bloc_label, b, other_label, shared in bridges[:30]:
            lines.append(f"| {a} | {bloc_label} | {b} | {other_label} | {shared} |")
    else:
        lines.append("No cross-bloc connections found with >= 2 shared drafts.")
    lines.append("")

    report = "\n".join(lines)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report)
    db.close()
    print(f"Report written to {out} ({len(blocs)} blocs)")
# Script entry point: run the report generator when executed directly.
if __name__ == "__main__":
    main()