#!/usr/bin/env python3
"""Generate a detailed team blocs report.

Usage:
    PYTHONPATH=src python scripts/team-blocs-report.py
    PYTHONPATH=src python scripts/team-blocs-report.py --min-shared 3 --threshold 0.80
"""
import argparse
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

from ietf_analyzer.config import Config
from ietf_analyzer.db import Database
from ietf_analyzer.orgs import detect_blocs, normalize_org, top_orgs_normalized


def _summary_section(args, now, blocs, bloc_authors, total_authors, total_drafts):
    """Return the report title, parameter echo, and headline metrics table.

    NOTE(review): the "Largest bloc" / "Most shared drafts" rows read
    ``blocs[0]`` — this assumes ``detect_blocs`` returns blocs ordered
    best-first; confirm that ordering contract.
    """
    pct_in_blocs = len(bloc_authors) / total_authors * 100 if total_authors else 0
    return [
        "# Team Bloc Analysis",
        f"*Generated {now} — {total_authors} authors, {total_drafts} drafts*",
        f"*Parameters: cohesion >= {args.threshold:.0%}, min shared drafts >= {args.min_shared}, min size >= {args.min_size}*\n",
        "## Summary\n",
        "| Metric | Value |",
        "|--------|------:|",
        f"| Total blocs detected | {len(blocs)} |",
        f"| Authors in blocs | {len(bloc_authors)} / {total_authors} ({pct_in_blocs:.0f}%) |",
        f"| Largest bloc | {len(blocs[0].members)} members |" if blocs else "| Largest bloc | — |",
        f"| Most shared drafts | {blocs[0].shared_drafts} |" if blocs else "| Most shared drafts | — |",
        "",
    ]


def _size_section(blocs):
    """Return the bloc-size histogram, bucketed into 2 / 3-4 / 5-9 / 10+ members."""
    size_dist: dict[str, int] = defaultdict(int)
    for b in blocs:
        n = len(b.members)
        if n >= 10:
            size_dist["10+"] += 1
        elif n >= 5:
            size_dist["5-9"] += 1
        elif n >= 3:
            size_dist["3-4"] += 1
        else:
            size_dist["2"] += 1

    lines = [
        "### Bloc Size Distribution\n",
        "| Size | Count |",
        "|------|------:|",
    ]
    # Emit buckets largest-first, skipping empty ones (matches original output).
    for label in ["10+", "5-9", "3-4", "2"]:
        if label in size_dist:
            lines.append(f"| {label} members | {size_dist[label]} |")
    lines.append("")
    return lines


def _org_section(blocs):
    """Return the per-organization rollup, ordered by total shared drafts desc."""
    org_blocs: dict[str, list] = defaultdict(list)
    for b in blocs:
        org_blocs[b.primary_org].append(b)

    lines = [
        "### Blocs by Organization\n",
        "| Organization | Blocs | Total Members | Total Shared Drafts |",
        "|-------------|------:|--------------:|--------------------:|",
    ]
    org_summary = sorted(org_blocs.items(), key=lambda x: -sum(b.shared_drafts for b in x[1]))
    for org, obs in org_summary:
        total_m = sum(len(b.members) for b in obs)
        total_s = sum(b.shared_drafts for b in obs)
        lines.append(f"| {org} | {len(obs)} | {total_m} | {total_s} |")
    lines.append("")
    return lines


def _profile_sections(blocs, draft_sets, draft_counts, rating_map):
    """Return the detailed per-bloc profiles: stats, member table, shared drafts.

    draft_sets:   pid -> set of draft names authored.
    draft_counts: pid -> total draft count.
    rating_map:   draft name -> rating object (may be missing; renders "—").
    """
    lines = [
        "---\n",
        "## Detailed Bloc Profiles\n",
    ]
    for i, bloc in enumerate(blocs):
        lines.append(f"### {i + 1}. {bloc.label}")
        lines.append("")
        lines.append("| | |")
        lines.append("|---|---|")
        lines.append(f"| **Members** | {len(bloc.members)} |")
        lines.append(f"| **Shared Drafts** | {bloc.shared_drafts} |")
        lines.append(f"| **Cohesion** | {bloc.cohesion:.0%} |")
        lines.append(f"| **Primary Org** | {bloc.primary_org} |")

        # Only show the org list when the bloc spans more than one organization.
        orgs = {org for _, _, org in bloc.members if org}
        if len(orgs) > 1:
            lines.append(f"| **All Orgs** | {', '.join(sorted(orgs))} |")
        lines.append("")

        # Member table: per author, total drafts vs. drafts co-authored with
        # at least one other bloc member.
        lines.append("**Members:**\n")
        lines.append("| Author | Organization | Drafts | In-Bloc Drafts |")
        lines.append("|--------|-------------|-------:|--------------:|")
        for pid, name, org in bloc.members:
            total_d = draft_counts.get(pid, 0)
            my_drafts = draft_sets.get(pid, set())
            bloc_other_pids = bloc.member_pids - {pid}
            in_bloc = sum(
                1 for d in my_drafts
                if any(d in draft_sets.get(other, set()) for other in bloc_other_pids)
            )
            lines.append(f"| {name} | {org} | {total_d} | {in_bloc} |")
        lines.append("")

        # Shared-drafts table: drafts touched by >= 2 bloc members, most-shared first.
        all_drafts: dict[str, int] = defaultdict(int)
        for pid in bloc.member_pids:
            for d in draft_sets.get(pid, set()):
                all_drafts[d] += 1
        shared_list = sorted(
            [(d, cnt) for d, cnt in all_drafts.items() if cnt >= 2],
            key=lambda x: -x[1],
        )
        if shared_list:
            lines.append("**Shared Drafts:**\n")
            lines.append("| Draft | Co-authors | Score | Categories |")
            lines.append("|-------|----------:|------:|------------|")
            for d, cnt in shared_list:
                r = rating_map.get(d)
                score = f"{r.composite_score:.1f}" if r else "—"
                cats = ", ".join(r.categories[:2]) if r else ""
                lines.append(
                    f"| [{d}](https://datatracker.ietf.org/doc/{d}/) "
                    f"| {cnt}/{len(bloc.members)} | {score} | {cats} |"
                )
            lines.append("")
        lines.append("---\n")
    return lines


def _bridge_section(db, blocs):
    """Return the cross-bloc connections table.

    A "bridge" is a coauthor pair (>= 2 shared drafts) whose members sit in
    two different blocs, or where exactly one member is in a bloc and the
    other is independent.
    """
    lines = [
        "## Cross-Bloc Connections\n",
        "*Authors who bridge between different blocs or connect blocs to the wider community.*\n",
    ]
    coauthor_rows = db.conn.execute(
        """SELECT a1.name, da1.person_id, a2.name, da2.person_id, COUNT(*) as shared
        FROM draft_authors da1
        JOIN draft_authors da2
          ON da1.draft_name = da2.draft_name AND da1.person_id < da2.person_id
        JOIN authors a1 ON da1.person_id = a1.person_id
        JOIN authors a2 ON da2.person_id = a2.person_id
        GROUP BY da1.person_id, da2.person_id
        HAVING shared >= 2
        ORDER BY shared DESC"""
    ).fetchall()

    # Map pid -> bloc index for O(1) membership checks.
    pid_bloc: dict[int, int] = {}
    for bi, b in enumerate(blocs):
        for pid in b.member_pids:
            pid_bloc[pid] = bi

    bridges = []
    for name_a, pid_a, name_b, pid_b, shared in coauthor_rows:
        bloc_a = pid_bloc.get(pid_a)
        bloc_b = pid_bloc.get(pid_b)
        if bloc_a is not None and bloc_b is not None and bloc_a != bloc_b:
            # Both in blocs, but different ones.
            bridges.append((name_a, blocs[bloc_a].label, name_b, blocs[bloc_b].label, shared))
        elif bloc_a is not None and bloc_b is None:
            bridges.append((name_a, blocs[bloc_a].label, name_b, "(independent)", shared))
        elif bloc_b is not None and bloc_a is None:
            # Normalize so the bloc member is always in the first column.
            bridges.append((name_b, blocs[bloc_b].label, name_a, "(independent)", shared))

    if bridges:
        # FIX: column holds the partner's bloc label (or "(independent)"),
        # not an affiliation — the old header "Their Affiliation" was wrong.
        lines.append("| Bloc Author | Bloc | External Author | Their Bloc | Shared |")
        lines.append("|-------------|------|-----------------|-------------------|-------:|")
        for a, bloc_label, b, other_label, shared in bridges[:30]:
            lines.append(f"| {a} | {bloc_label} | {b} | {other_label} | {shared} |")
    else:
        lines.append("No cross-bloc connections found with >= 2 shared drafts.")
    lines.append("")
    return lines


def main():
    """Parse CLI args, detect blocs, and write the markdown report."""
    parser = argparse.ArgumentParser(description="Team blocs report")
    parser.add_argument("--min-shared", type=int, default=2,
                        help="Min shared drafts to form a bloc edge (default: 2)")
    parser.add_argument("--threshold", type=float, default=0.70,
                        help="Cohesion threshold 0-1 (default: 0.70)")
    parser.add_argument("--min-size", type=int, default=2,
                        help="Min members per bloc (default: 2)")
    parser.add_argument("-o", "--output", default=None,
                        help="Output path (default: data/reports/team-blocs.md)")
    args = parser.parse_args()

    cfg = Config()
    db = Database(cfg)
    try:
        out = Path(args.output) if args.output else Path(cfg.data_dir) / "reports" / "team-blocs.md"
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

        draft_sets = db.author_draft_sets()
        draft_counts = db.author_draft_counts()
        total_authors = db.author_count()
        total_drafts = db.count_drafts()

        # Rating lookup for the per-draft Score/Categories columns.
        # NOTE(review): only 500 rated drafts are fetched; drafts beyond the
        # limit render as "—" — confirm the limit is intentional.
        pairs_data = db.drafts_with_ratings(limit=500)
        rating_map = {draft.name: rating for draft, rating in pairs_data}

        blocs = detect_blocs(
            db,
            cohesion_threshold=args.threshold,
            min_size=args.min_size,
            min_shared_drafts=args.min_shared,
        )

        # Union of every bloc's member pids, for the "authors in blocs" stat.
        bloc_authors = set()
        for b in blocs:
            bloc_authors |= b.member_pids

        lines = _summary_section(args, now, blocs, bloc_authors, total_authors, total_drafts)
        lines += _size_section(blocs)
        lines += _org_section(blocs)
        lines += _profile_sections(blocs, draft_sets, draft_counts, rating_map)
        lines += _bridge_section(db, blocs)

        report = "\n".join(lines)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(report)
    finally:
        # Close the DB even when report generation raises (the original
        # leaked the handle on any exception).
        db.close()
    print(f"Report written to {out} ({len(blocs)} blocs)")


if __name__ == "__main__":
    main()