From 75c4da72e0fa229d05a07feb0bac429cb471b691 Mon Sep 17 00:00:00 2001 From: Christian Nennemann Date: Sat, 7 Mar 2026 07:59:45 +0100 Subject: [PATCH] Fix broken reference links and web UI bugs - Fix RFC URLs with leading zeros (rfc0020 -> rfc20) via int filter - Draft refs: internal link for drafts in our DB, Datatracker for external - BCP refs: link to rfc-editor.org/info/bcpN - Add DB connection teardown (@app.teardown_appcontext) - Fix JS syntax error in gap_demo.html (HTML-escaped string in script tag) - Add URL encoding to all query params in drafts.html and draft_detail.html - Fix variable shadowing of Flask's g import in gaps_demo() - Add None safety for ideas search data attribute Co-Authored-By: Claude Opus 4.6 --- src/webui/app.py | 27 ++++++-- src/webui/data.py | 98 ++++++++++++++++++++++----- src/webui/templates/draft_detail.html | 18 ++++- src/webui/templates/drafts.html | 14 ++-- src/webui/templates/gap_demo.html | 2 +- src/webui/templates/ideas.html | 2 +- 6 files changed, 127 insertions(+), 34 deletions(-) diff --git a/src/webui/app.py b/src/webui/app.py index 0c7fdef..678dcb9 100644 --- a/src/webui/app.py +++ b/src/webui/app.py @@ -57,6 +57,13 @@ def db(): return g.db +@app.teardown_appcontext +def close_db(exception=None): + database = g.pop("db", None) + if database is not None: + database.close() + + # --- Routes --- @@ -110,10 +117,20 @@ def drafts(): @app.route("/drafts/") def draft_detail(name: str): - detail = get_draft_detail(db(), name) + database = db() + detail = get_draft_detail(database, name) if not detail: abort(404) - return render_template("draft_detail.html", draft=detail) + # Build set of draft ref IDs that exist in our DB for internal linking + ref_draft_ids = [r["id"] for r in detail.get("refs", []) if r["type"] == "draft"] + known_drafts = set() + if ref_draft_ids: + placeholders = ",".join("?" * len(ref_draft_ids)) + rows = database.conn.execute( + f"SELECT name FROM drafts WHERE name IN ({placeholders})", ref_draft_ids + ).fetchall() + known_drafts = {r["name"] for r in rows} + return render_template("draft_detail.html", draft=detail, known_drafts=known_drafts) @app.route("/ideas") @@ -139,9 +156,9 @@ def gaps_demo(): draft_info = None if selected: draft_text = read_generated_draft(selected) - for g in generated: - if g["filename"] == selected: - draft_info = g + for gd in generated: + if gd["filename"] == selected: + draft_info = gd break elif generated: draft_info = generated[0] diff --git a/src/webui/data.py b/src/webui/data.py index 3b9f04a..8f600e5 100644 --- a/src/webui/data.py +++ b/src/webui/data.py @@ -551,8 +551,15 @@ def get_author_network_full(db: Database) -> dict: def get_idea_clusters(db: Database) -> dict: - """Cluster ideas by embedding similarity, return clusters + t-SNE scatter.""" + """Cluster ideas by embedding similarity, return clusters + t-SNE scatter. + + Uses Ward linkage on L2-normalized embeddings (approximates cosine) with + a target of ~30 clusters for readable groupings. Enriches each cluster + with WG info and category breakdown. + """ + import json as _json import numpy as np + from sklearn.preprocessing import normalize as sk_normalize embeddings = db.all_idea_embeddings() if not embeddings: @@ -563,37 +570,51 @@ def get_idea_clusters(db: Database) -> dict: idea_map = {r["id"]: {"title": r["title"], "description": r["description"], "type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows} + # Draft -> WG and category lookup + draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall() + draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows} + draft_title_map = {r["name"]: r["title"] for r in draft_rows} + rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() + draft_cats: dict[str, list[str]] = {} + for r in rating_rows: + try: + draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else [] + except (_json.JSONDecodeError, TypeError): + draft_cats[r["draft_name"]] = [] + # Build matrix from embeddings that have matching ideas idea_ids = [iid for iid in embeddings if iid in idea_map] if len(idea_ids) < 5: return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} matrix = np.array([embeddings[iid] for iid in idea_ids]) + matrix_norm = sk_normalize(matrix) - # Agglomerative clustering with cosine distance + # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size + n_target = max(10, min(40, len(idea_ids) // 12)) try: from sklearn.cluster import AgglomerativeClustering - clustering = AgglomerativeClustering( - n_clusters=None, distance_threshold=0.5, - metric='cosine', linkage='average', - ) - labels = clustering.fit_predict(matrix) + clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward') + labels = clustering.fit_predict(matrix_norm) except Exception: return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} # Build cluster data - cluster_ideas: dict[int, list] = defaultdict(list) + cluster_ideas_map: dict[int, list] = defaultdict(list) for idx, iid in enumerate(idea_ids): - cluster_ideas[labels[idx]].append(iid) + cluster_ideas_map[labels[idx]].append(iid) - # Filter to clusters with 2+ ideas - stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", "on", "by", "is", "as", "at", "from", "that", "this", "it"} + stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", + "on", "by", "is", "as", "at", "from", "that", "this", "it", + "based", "using", "protocol", "mechanism", "framework", "system", + "network", "agent", "agents"} clusters = [] - for cid in sorted(cluster_ideas.keys()): - members = cluster_ideas[cid] - if len(members) < 2: - continue + for cid in sorted(cluster_ideas_map.keys()): + members = cluster_ideas_map[cid] ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map] + if len(ideas_in_cluster) < 2: + continue + # Theme: most common significant words in titles words = Counter() for idea in ideas_in_cluster: @@ -605,29 +626,72 @@ def get_idea_clusters(db: Database) -> dict: theme = " ".join(top_words).title() if top_words else f"Cluster {cid}" drafts = list({idea["draft_name"] for idea in ideas_in_cluster}) + + # Enrich: WG breakdown + wg_counts: dict[str, int] = Counter() + cat_counts: dict[str, int] = Counter() + for dname in drafts: + wg = draft_wg.get(dname, "none") + wg_counts[wg] += 1 + for cat in draft_cats.get(dname, []): + cat_counts[cat] += 1 + + wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)] + cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)] + cross_wg = len([w for w in wg_counts if w != "none"]) >= 2 + clusters.append({ "id": len(clusters), "theme": theme, "size": len(ideas_in_cluster), "ideas": ideas_in_cluster[:20], "drafts": drafts, + "wgs": wg_list, + "categories": cat_list, + "cross_wg": cross_wg, + "wg_count": len(wg_counts), }) + clusters.sort(key=lambda c: c["size"], reverse=True) + + # Build mapping: original cluster label -> sorted index + # Each cluster remembers which original label it came from via its member ids + old_label_to_new: dict[int, int] = {} + for new_idx, c in enumerate(clusters): + c["id"] = new_idx + # Find original label for any member of this cluster + for old_cid, members in cluster_ideas_map.items(): + if members and members[0] in [iid for iid in members if iid in idea_map]: + member_titles = {idea_map[m]["title"] for m in members if m in idea_map} + c_titles = {idea["title"] for idea in c["ideas"]} + if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]): + old_label_to_new[old_cid] = new_idx + break + + # Fallback: build from idea_id -> label mapping + iid_to_new: dict[int, int] = {} + for old_cid, members in cluster_ideas_map.items(): + new_idx = old_label_to_new.get(old_cid, old_cid) + for iid in members: + iid_to_new[iid] = new_idx + # t-SNE for scatter scatter = [] try: from sklearn.manifold import TSNE perp = min(30, len(idea_ids) - 1) tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500) - coords = tsne.fit_transform(matrix) + coords = tsne.fit_transform(matrix_norm) + for idx, iid in enumerate(idea_ids): info = idea_map.get(iid, {}) scatter.append({ "x": round(float(coords[idx, 0]), 3), "y": round(float(coords[idx, 1]), 3), - "cluster_id": int(labels[idx]), + "cluster_id": iid_to_new.get(iid, int(labels[idx])), "title": info.get("title", ""), "draft_name": info.get("draft_name", ""), + "wg": draft_wg.get(info.get("draft_name", ""), ""), }) except Exception: pass diff --git a/src/webui/templates/draft_detail.html b/src/webui/templates/draft_detail.html index 5486d17..bc23be8 100644 --- a/src/webui/templates/draft_detail.html +++ b/src/webui/templates/draft_detail.html @@ -256,7 +256,7 @@
{% for cat in draft.rating.categories %} - {{ cat }} @@ -275,16 +275,28 @@
{% for ref in draft.refs %} {% if ref.type == 'rfc' %} - - RFC {{ ref.id.replace('rfc', '') }} + RFC {{ ref.id | int }} {% elif ref.type == 'draft' %} + {% if ref.id in known_drafts %} {{ ref.id }} {% else %} + + {{ ref.id }} + + {% endif %} + {% elif ref.type == 'bcp' %} + + BCP {{ ref.id }} + + {% else %} {{ ref.type|upper }} {{ ref.id }} diff --git a/src/webui/templates/drafts.html b/src/webui/templates/drafts.html index 04d6e14..e9a6e40 100644 --- a/src/webui/templates/drafts.html +++ b/src/webui/templates/drafts.html @@ -178,10 +178,10 @@ {% if categories %}
- All {% for cat, count in categories.items() %} - {{ cat }} {{ count }} @@ -326,7 +326,7 @@ {% if result.pages > 1 %}