diff --git a/src/ietf_analyzer/cli.py b/src/ietf_analyzer/cli.py index 9a94532..301bb33 100644 --- a/src/ietf_analyzer/cli.py +++ b/src/ietf_analyzer/cli.py @@ -2000,21 +2000,17 @@ def revisions(org: str | None, top: int): console.print(f" Highly iterated (rev >= 05): [bold]{sum(1 for r in all_revs if r['rev_int'] >= 5)}[/]\n") # Get per-org stats using normalized org names - aff_rows = db.conn.execute( - "SELECT da.draft_name, a.affiliation FROM draft_authors da " - "JOIN authors a ON da.person_id = a.person_id " - "WHERE a.affiliation != ''" - ).fetchall() + aff_rows = db.draft_affiliation_pairs() # Map draft -> rev draft_rev = {r["name"]: r["rev_int"] for r in all_revs} # Group drafts by normalized org (deduped) org_drafts: dict[str, set[str]] = defaultdict(set) - for row in aff_rows: - norm = normalize_org(row["affiliation"]) + for draft_name, affiliation in aff_rows: + norm = normalize_org(affiliation) if norm: - org_drafts[norm].add(row["draft_name"]) + org_drafts[norm].add(draft_name) if org: # Show drafts for a specific org @@ -2275,10 +2271,8 @@ def centrality(top: int): for r in rows: G.add_edge(r[0], r[1], weight=r[2]) - persons = db.conn.execute( - "SELECT person_id, name, affiliation FROM authors" - ).fetchall() - person_info = {r[0]: (r[1], normalize_org(r[2])) for r in persons} + persons = db.all_persons_info() + person_info = {pid: (name, normalize_org(aff)) for pid, name, aff in persons} console.print(f"\n[bold]Author Network Analysis[/]: {G.number_of_nodes()} authors, {G.number_of_edges()} co-authorship edges\n") @@ -3425,14 +3419,11 @@ def _auto_heal(cfg, db, cost_limit: float, yes: bool, dry_run: bool, source_filt # Show final counts total = db.count_drafts() - rated = db.conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0] + rated = db.rated_count() embedded = db.conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0] - idea_count = db.conn.execute("SELECT COUNT(*) FROM ideas").fetchone()[0] - gap_count = db.conn.execute("SELECT COUNT(*) FROM gaps").fetchone()[0] + idea_count = db.idea_count(include_false_positives=True) + gap_count = db.gap_count() console.print(f" Drafts: {total} | Rated: {rated} | Embedded: {embedded} | Ideas: {idea_count} | Gaps: {gap_count}") - by_source = db.conn.execute( - "SELECT source, COUNT(*) FROM drafts GROUP BY source ORDER BY COUNT(*) DESC" - ).fetchall() - source_str = " | ".join(f"{s}: {c}" for s, c in by_source) + source_str = " | ".join(f"{s}: {c}" for s, c in db.source_counts()) console.print(f" Sources: {source_str}") diff --git a/src/ietf_analyzer/db.py b/src/ietf_analyzer/db.py index 3d04800..2a51111 100644 --- a/src/ietf_analyzer/db.py +++ b/src/ietf_analyzer/db.py @@ -135,6 +135,7 @@ CREATE TABLE IF NOT EXISTS draft_refs ( ); CREATE INDEX IF NOT EXISTS idx_draft_refs_ref ON draft_refs(ref_type, ref_id); +CREATE INDEX IF NOT EXISTS idx_draft_authors_person ON draft_authors(person_id); -- Generated drafts from gap-to-draft pipeline CREATE TABLE IF NOT EXISTS generated_drafts ( @@ -303,6 +304,10 @@ class Database: if "novelty_score" not in idea_cols: self._conn.execute("ALTER TABLE ideas ADD COLUMN novelty_score INTEGER") + # Create indexes on columns that may have been added via migration + self._conn.execute("CREATE INDEX IF NOT EXISTS idx_ratings_false_positive ON ratings(false_positive)") + self._conn.execute("CREATE INDEX IF NOT EXISTS idx_drafts_source ON drafts(source)") + self._conn.commit() def close(self) -> None: @@ -927,6 +932,107 @@ class Database: "category": r["category"], "evidence": r["evidence"], "severity": r["severity"]} for r in rows] + def gap_count(self) -> int: + return self.conn.execute("SELECT COUNT(*) FROM gaps").fetchone()[0] + + def search_gaps(self, query: str, limit: int = 50) -> list[dict]: + """Search gaps by topic or description (LIKE match).""" + like = f"%{query}%" + rows = self.conn.execute( + """SELECT id, topic, description, category, severity FROM gaps + WHERE topic LIKE ? OR description LIKE ? + ORDER BY id LIMIT ?""", + (like, like, limit), + ).fetchall() + return [{"id": r["id"], "topic": r["topic"], + "description": (r["description"] or "")[:200], + "category": r["category"], "severity": r["severity"]} + for r in rows] + + # --- Shared query helpers --- + + def rated_count(self) -> int: + """Return total number of rated drafts (including false positives).""" + return self.conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0] + + def false_positive_drafts_raw(self) -> list[sqlite3.Row]: + """Return raw rows of drafts flagged as false positives, joined with ratings.""" + return self.conn.execute( + """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, + r.summary, r.categories as r_categories, r.false_positive + FROM drafts d + JOIN ratings r ON d.name = r.draft_name + WHERE r.false_positive = 1 + ORDER BY d.name""" + ).fetchall() + + def non_false_positive_ratings_raw(self) -> list[sqlite3.Row]: + """Return raw rating rows for non-false-positive drafts.""" + return self.conn.execute( + """SELECT r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, + r.categories as r_categories + FROM ratings r + WHERE COALESCE(r.false_positive, 0) = 0""" + ).fetchall() + + def false_positive_names(self) -> set[str]: + """Return set of draft names flagged as false positives.""" + return {r[0] for r in self.conn.execute( + "SELECT draft_name FROM ratings WHERE false_positive = 1").fetchall()} + + def draft_affiliation_pairs(self) -> list[tuple[str, str]]: + """Return (draft_name, affiliation) for all draft_authors with affiliation.""" + rows = self.conn.execute( + "SELECT da.draft_name, a.affiliation FROM draft_authors da " + "JOIN authors a ON da.person_id = a.person_id " + "WHERE a.affiliation != ''" + ).fetchall() + return [(r[0], r[1]) for r in rows] + + def all_persons_info(self) -> list[tuple[int, str, str]]: + """Return (person_id, name, affiliation) for all authors.""" + rows = self.conn.execute( + "SELECT person_id, name, affiliation FROM authors" + ).fetchall() + return [(r[0], r[1], r[2]) for r in rows] + + def search_authors(self, query: str, limit: int = 50) -> list[dict]: + """Search authors by name or affiliation (LIKE match).""" + like = f"%{query}%" + rows = self.conn.execute( + """SELECT person_id, name, affiliation FROM authors + WHERE name LIKE ? OR affiliation LIKE ? + ORDER BY name LIMIT ?""", + (like, like, limit), + ).fetchall() + return [{"person_id": r["person_id"], "name": r["name"], + "affiliation": r["affiliation"] or ""} + for r in rows] + + def category_counts(self) -> dict[str, int]: + """Return {category: draft_count} from rated non-FP drafts.""" + from collections import Counter + pairs = self.drafts_with_ratings(limit=2000) + counts: Counter = Counter() + for _, rating in pairs: + for cat in rating.categories: + counts[cat] += 1 + return dict(counts.most_common()) + + def draft_author_count_map(self) -> dict[str, int]: + """Return {draft_name: author_count} for all drafts.""" + rows = self.conn.execute( + "SELECT draft_name, COUNT(*) as cnt FROM draft_authors GROUP BY draft_name" + ).fetchall() + return {r[0]: r[1] for r in rows} + + def source_counts(self) -> list[tuple[str, int]]: + """Return [(source, count)] ordered by count desc.""" + rows = self.conn.execute( + "SELECT source, COUNT(*) as cnt FROM drafts GROUP BY source ORDER BY cnt DESC" + ).fetchall() + return [(r[0], r[1]) for r in rows] + # --- Proposals --- def all_proposals(self) -> list[dict]: diff --git a/src/ietf_analyzer/reports.py b/src/ietf_analyzer/reports.py index 2a4e833..c8cc6ac 100644 --- a/src/ietf_analyzer/reports.py +++ b/src/ietf_analyzer/reports.py @@ -1403,18 +1403,14 @@ class Reporter: lines.append(f"| {label} | {cnt} | {cnt/total*100:.1f}% |") # Per-org analysis - aff_rows = self.db.conn.execute( - "SELECT da.draft_name, a.affiliation FROM draft_authors da " - "JOIN authors a ON da.person_id = a.person_id " - "WHERE a.affiliation != ''" - ).fetchall() + aff_rows = self.db.draft_affiliation_pairs() draft_rev = {r["name"]: r["rev_int"] for r in all_revs} org_drafts: dict[str, set[str]] = defaultdict(set) - for row in aff_rows: - norm = normalize_org(row["affiliation"]) + for draft_name, affiliation in aff_rows: + norm = normalize_org(affiliation) if norm: - org_drafts[norm].add(row["draft_name"]) + org_drafts[norm].add(draft_name) org_stats = [] for org_name, drafts in org_drafts.items(): @@ -1516,10 +1512,8 @@ class Reporter: for r in rows: G.add_edge(r[0], r[1], weight=r[2]) - persons = self.db.conn.execute( - "SELECT person_id, name, affiliation FROM authors" - ).fetchall() - person_info = {r[0]: (r[1], normalize_org(r[2])) for r in persons} + persons = self.db.all_persons_info() + person_info = {pid: (name, normalize_org(aff)) for pid, name, aff in persons} chinese_orgs = { "Huawei", "China Mobile", "China Telecom", "China Unicom", @@ -2239,22 +2233,12 @@ class Reporter: now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") # Get false positives - fp_rows = self.db.conn.execute( - """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.summary, r.categories as r_categories, r.false_positive - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE r.false_positive = 1 - ORDER BY d.name""" - ).fetchall() + fp_rows = self.db.false_positive_drafts_raw() # Get non-FP rated drafts for comparison - nonfp_rows = self.db.conn.execute( - """SELECT r.novelty, r.maturity, r.overlap, r.momentum, r.relevance - FROM ratings r WHERE COALESCE(r.false_positive, 0) = 0""" - ).fetchall() + nonfp_rows = self.db.non_false_positive_ratings_raw() - total_rated = self.db.conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0] + total_rated = self.db.rated_count() total_drafts = self.db.count_drafts(include_false_positives=True) fp_count = len(fp_rows) @@ -2598,9 +2582,7 @@ class Reporter: WHERE r.false_positive = 0 """).fetchall() - author_counts = dict(conn.execute( - "SELECT draft_name, COUNT(*) FROM draft_authors GROUP BY draft_name" - ).fetchall()) + author_counts = self.db.draft_author_count_map() citation_counts = dict(conn.execute( "SELECT draft_name, COUNT(*) FROM draft_refs GROUP BY draft_name" ).fetchall()) diff --git a/src/webui/data.py b/src/webui/data.py index 70c2e01..6a55eea 100644 --- a/src/webui/data.py +++ b/src/webui/data.py @@ -253,12 +253,7 @@ def get_overview_stats(db: Database) -> OverviewStats: def get_category_counts(db: Database) -> dict[str, int]: """Return {category: draft_count} for all categories.""" - pairs = db.drafts_with_ratings(limit=1000) - counts: dict[str, int] = Counter() - for _, rating in pairs: - for cat in rating.categories: - counts[cat] += 1 - return dict(counts.most_common()) + return db.category_counts() def get_category_summary(db: Database, category: str) -> dict | None: @@ -1002,8 +997,7 @@ def _compute_idea_clusters(db: Database) -> dict: return {"clusters": [], "scatter": [], "stats": {"total": 0, "clustered": 0, "num_clusters": 0}, "empty": True} # Exclude ideas from false-positive drafts - fp_names = {r[0] for r in db.conn.execute( - "SELECT draft_name FROM ratings WHERE false_positive = 1").fetchall()} + fp_names = db.false_positive_names() # Fetch ideas with IDs for metadata lookup rows = db.conn.execute("SELECT id, title, description, idea_type, draft_name FROM ideas").fetchall() @@ -1512,34 +1506,10 @@ def global_search(db: Database, query: str) -> SearchResults: }) # 3. Authors via LIKE - rows = db.conn.execute( - """SELECT person_id, name, affiliation FROM authors - WHERE name LIKE ? OR affiliation LIKE ? - ORDER BY name LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["authors"].append({ - "person_id": r["person_id"], - "name": r["name"], - "affiliation": r["affiliation"] or "", - }) + results["authors"] = db.search_authors(q, limit=50) # 4. Gaps via LIKE - rows = db.conn.execute( - """SELECT id, topic, description, category, severity FROM gaps - WHERE topic LIKE ? OR description LIKE ? - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["gaps"].append({ - "id": r["id"], - "topic": r["topic"], - "description": (r["description"] or "")[:200], - "category": r["category"], - "severity": r["severity"], - }) + results["gaps"] = db.search_gaps(q, limit=50) return results @@ -2258,24 +2228,12 @@ def get_source_comparison(db: Database) -> dict: def get_false_positive_profile(db: Database) -> dict: """Profile drafts flagged as false positives.""" # Get false positives - fp_rows = db.conn.execute( - """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.summary, r.categories as r_categories, r.false_positive - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE r.false_positive = 1 - ORDER BY d.name""" - ).fetchall() + fp_rows = db.false_positive_drafts_raw() # Get non-FP rated drafts for comparison - nonfp_rows = db.conn.execute( - """SELECT r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.categories as r_categories - FROM ratings r - WHERE COALESCE(r.false_positive, 0) = 0""" - ).fetchall() + nonfp_rows = db.non_false_positive_ratings_raw() - total_rated = db.conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0] + total_rated = db.rated_count() total_drafts = db.count_drafts(include_false_positives=True) # Build FP list @@ -2720,34 +2678,10 @@ def global_search(db: Database, query: str) -> SearchResults: }) # 3. Authors via LIKE - rows = db.conn.execute( - """SELECT person_id, name, affiliation FROM authors - WHERE name LIKE ? OR affiliation LIKE ? - ORDER BY name LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["authors"].append({ - "person_id": r["person_id"], - "name": r["name"], - "affiliation": r["affiliation"] or "", - }) + results["authors"] = db.search_authors(q, limit=50) # 4. Gaps via LIKE - rows = db.conn.execute( - """SELECT id, topic, description, category, severity FROM gaps - WHERE topic LIKE ? OR description LIKE ? - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["gaps"].append({ - "id": r["id"], - "topic": r["topic"], - "description": (r["description"] or "")[:200], - "category": r["category"], - "severity": r["severity"], - }) + results["gaps"] = db.search_gaps(q, limit=50) return results @@ -3148,11 +3082,7 @@ def get_complexity_data(db: Database) -> dict: """).fetchall() # Author counts - author_counts = {} - for row in conn.execute(""" - SELECT draft_name, COUNT(*) AS cnt FROM draft_authors GROUP BY draft_name - """).fetchall(): - author_counts[row["draft_name"]] = row["cnt"] + author_counts = db.draft_author_count_map() # Citation counts (outgoing refs) citation_counts = {} @@ -3681,24 +3611,12 @@ def get_source_comparison(db: Database) -> dict: def get_false_positive_profile(db: Database) -> dict: """Profile drafts flagged as false positives.""" # Get false positives - fp_rows = db.conn.execute( - """SELECT d.*, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.summary, r.categories as r_categories, r.false_positive - FROM drafts d - JOIN ratings r ON d.name = r.draft_name - WHERE r.false_positive = 1 - ORDER BY d.name""" - ).fetchall() + fp_rows = db.false_positive_drafts_raw() # Get non-FP rated drafts for comparison - nonfp_rows = db.conn.execute( - """SELECT r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, - r.categories as r_categories - FROM ratings r - WHERE COALESCE(r.false_positive, 0) = 0""" - ).fetchall() + nonfp_rows = db.non_false_positive_ratings_raw() - total_rated = db.conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0] + total_rated = db.rated_count() total_drafts = db.count_drafts(include_false_positives=True) # Build FP list @@ -4142,34 +4060,10 @@ def global_search(db: Database, query: str) -> SearchResults: }) # 3. Authors via LIKE - rows = db.conn.execute( - """SELECT person_id, name, affiliation FROM authors - WHERE name LIKE ? OR affiliation LIKE ? - ORDER BY name LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["authors"].append({ - "person_id": r["person_id"], - "name": r["name"], - "affiliation": r["affiliation"] or "", - }) + results["authors"] = db.search_authors(q, limit=50) # 4. Gaps via LIKE - rows = db.conn.execute( - """SELECT id, topic, description, category, severity FROM gaps - WHERE topic LIKE ? OR description LIKE ? - ORDER BY id LIMIT 50""", - (like, like), - ).fetchall() - for r in rows: - results["gaps"].append({ - "id": r["id"], - "topic": r["topic"], - "description": (r["description"] or "")[:200], - "category": r["category"], - "severity": r["severity"], - }) + results["gaps"] = db.search_gaps(q, limit=50) return results