From 75c4da72e0fa229d05a07feb0bac429cb471b691 Mon Sep 17 00:00:00 2001
From: Christian Nennemann <christian@nennemann.de>
Date: Sat, 7 Mar 2026 07:59:45 +0100
Subject: [PATCH] Fix broken reference links and web UI bugs

- Fix RFC URLs with leading zeros (rfc0020 -> rfc20) via int filter
- Draft refs: internal link for drafts in our DB, Datatracker for external
- BCP refs: link to rfc-editor.org/info/bcpN
- Add DB connection teardown (@app.teardown_appcontext)
- Fix JS syntax error in gap_demo.html (HTML-escaped string in script tag)
- Add URL encoding to the q and cat query params in drafts.html and
  draft_detail.html (the category pills' own cat= value in drafts.html
  is still unencoded)
- Fix variable shadowing of Flask's g import in gaps_demo()
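- Add None safety for ideas search data attribute
- Rework idea clustering in data.py: Ward linkage on L2-normalized
  embeddings, per-cluster WG/category breakdown, clusters sorted by size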

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/webui/app.py                      | 27 ++++++--
 src/webui/data.py                     | 98 ++++++++++++++++++++++-----
 src/webui/templates/draft_detail.html | 18 ++++-
 src/webui/templates/drafts.html       | 14 ++--
 src/webui/templates/gap_demo.html     |  2 +-
 src/webui/templates/ideas.html        |  2 +-
 6 files changed, 127 insertions(+), 34 deletions(-)

diff --git a/src/webui/app.py b/src/webui/app.py
index 0c7fdef..678dcb9 100644
--- a/src/webui/app.py
+++ b/src/webui/app.py
@@ -57,6 +57,13 @@ def db():
     return g.db
 
 
+@app.teardown_appcontext
+def close_db(exception=None):
+    database = g.pop("db", None)
+    if database is not None:
+        database.close()
+
+
 # --- Routes ---
 
 
@@ -110,10 +117,20 @@ def drafts():
 
 @app.route("/drafts/<path:name>")
 def draft_detail(name: str):
-    detail = get_draft_detail(db(), name)
+    database = db()
+    detail = get_draft_detail(database, name)
     if not detail:
         abort(404)
-    return render_template("draft_detail.html", draft=detail)
+    # Build set of draft ref IDs that exist in our DB for internal linking
+    ref_draft_ids = [r["id"] for r in detail.get("refs", []) if r["type"] == "draft"]
+    known_drafts = set()
+    if ref_draft_ids:
+        placeholders = ",".join("?" * len(ref_draft_ids))
+        rows = database.conn.execute(
+            f"SELECT name FROM drafts WHERE name IN ({placeholders})", ref_draft_ids
+        ).fetchall()
+        known_drafts = {r["name"] for r in rows}
+    return render_template("draft_detail.html", draft=detail, known_drafts=known_drafts)
 
 
 @app.route("/ideas")
@@ -139,9 +156,9 @@ def gaps_demo():
     draft_info = None
     if selected:
         draft_text = read_generated_draft(selected)
-        for g in generated:
-            if g["filename"] == selected:
-                draft_info = g
+        for gd in generated:
+            if gd["filename"] == selected:
+                draft_info = gd
                 break
     elif generated:
         draft_info = generated[0]
diff --git a/src/webui/data.py b/src/webui/data.py
index 3b9f04a..8f600e5 100644
--- a/src/webui/data.py
+++ b/src/webui/data.py
@@ -551,8 +551,15 @@ def get_author_network_full(db: Database) -> dict:
 
 
 def get_idea_clusters(db: Database) -> dict:
-    """Cluster ideas by embedding similarity, return clusters + t-SNE scatter."""
+    """Cluster ideas by embedding similarity, return clusters + t-SNE scatter.
+
+    Uses Ward linkage on L2-normalized embeddings (approximates cosine) with
+    a target of ~30 clusters for readable groupings.  Enriches each cluster
+    with WG info and category breakdown.
+    """
+    import json as _json
     import numpy as np
+    from sklearn.preprocessing import normalize as sk_normalize
 
     embeddings = db.all_idea_embeddings()
     if not embeddings:
@@ -563,37 +570,51 @@ def get_idea_clusters(db: Database) -> dict:
     idea_map = {r["id"]: {"title": r["title"], "description": r["description"],
                            "type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows}
 
+    # Draft -> WG and category lookup
+    draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall()
+    draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows}
+    draft_title_map = {r["name"]: r["title"] for r in draft_rows}
+    rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
+    draft_cats: dict[str, list[str]] = {}
+    for r in rating_rows:
+        try:
+            draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
+        except (_json.JSONDecodeError, TypeError):
+            draft_cats[r["draft_name"]] = []
+
     # Build matrix from embeddings that have matching ideas
     idea_ids = [iid for iid in embeddings if iid in idea_map]
     if len(idea_ids) < 5:
         return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}
 
     matrix = np.array([embeddings[iid] for iid in idea_ids])
+    matrix_norm = sk_normalize(matrix)
 
-    # Agglomerative clustering with cosine distance
+    # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
+    n_target = max(10, min(40, len(idea_ids) // 12))
     try:
         from sklearn.cluster import AgglomerativeClustering
-        clustering = AgglomerativeClustering(
-            n_clusters=None, distance_threshold=0.5,
-            metric='cosine', linkage='average',
-        )
-        labels = clustering.fit_predict(matrix)
+        clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
+        labels = clustering.fit_predict(matrix_norm)
     except Exception:
         return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}
 
     # Build cluster data
-    cluster_ideas: dict[int, list] = defaultdict(list)
+    cluster_ideas_map: dict[int, list] = defaultdict(list)
     for idx, iid in enumerate(idea_ids):
-        cluster_ideas[labels[idx]].append(iid)
+        cluster_ideas_map[labels[idx]].append(iid)
 
-    # Filter to clusters with 2+ ideas
-    stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", "on", "by", "is", "as", "at", "from", "that", "this", "it"}
+    stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with",
+            "on", "by", "is", "as", "at", "from", "that", "this", "it",
+            "based", "using", "protocol", "mechanism", "framework", "system",
+            "network", "agent", "agents"}
     clusters = []
-    for cid in sorted(cluster_ideas.keys()):
-        members = cluster_ideas[cid]
-        if len(members) < 2:
-            continue
+    for cid in sorted(cluster_ideas_map.keys()):
+        members = cluster_ideas_map[cid]
         ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map]
+        if len(ideas_in_cluster) < 2:
+            continue
+
         # Theme: most common significant words in titles
         words = Counter()
         for idea in ideas_in_cluster:
@@ -605,29 +626,72 @@ def get_idea_clusters(db: Database) -> dict:
         theme = " ".join(top_words).title() if top_words else f"Cluster {cid}"
 
         drafts = list({idea["draft_name"] for idea in ideas_in_cluster})
+
+        # Enrich: WG breakdown
+        wg_counts: dict[str, int] = Counter()
+        cat_counts: dict[str, int] = Counter()
+        for dname in drafts:
+            wg = draft_wg.get(dname, "none")
+            wg_counts[wg] += 1
+            for cat in draft_cats.get(dname, []):
+                cat_counts[cat] += 1
+
+        wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)]
+        cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)]
+        cross_wg = len([w for w in wg_counts if w != "none"]) >= 2
+
         clusters.append({
             "id": len(clusters),
             "theme": theme,
             "size": len(ideas_in_cluster),
             "ideas": ideas_in_cluster[:20],
             "drafts": drafts,
+            "wgs": wg_list,
+            "categories": cat_list,
+            "cross_wg": cross_wg,
+            "wg_count": len(wg_counts),
         })
 
+    clusters.sort(key=lambda c: c["size"], reverse=True)
+
+    # Build mapping: original cluster label -> sorted index
+    # Each cluster remembers which original label it came from via its member ids
+    old_label_to_new: dict[int, int] = {}
+    for new_idx, c in enumerate(clusters):
+        c["id"] = new_idx
+        # Find original label for any member of this cluster
+        for old_cid, members in cluster_ideas_map.items():
+            if members and members[0] in [iid for iid in members if iid in idea_map]:
+                member_titles = {idea_map[m]["title"] for m in members if m in idea_map}
+                c_titles = {idea["title"] for idea in c["ideas"]}
+                if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]):
+                    old_label_to_new[old_cid] = new_idx
+                    break
+
+    # Fallback: build from idea_id -> label mapping
+    iid_to_new: dict[int, int] = {}
+    for old_cid, members in cluster_ideas_map.items():
+        new_idx = old_label_to_new.get(old_cid, old_cid)
+        for iid in members:
+            iid_to_new[iid] = new_idx
+
     # t-SNE for scatter
     scatter = []
     try:
         from sklearn.manifold import TSNE
         perp = min(30, len(idea_ids) - 1)
         tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
-        coords = tsne.fit_transform(matrix)
+        coords = tsne.fit_transform(matrix_norm)
+
         for idx, iid in enumerate(idea_ids):
             info = idea_map.get(iid, {})
             scatter.append({
                 "x": round(float(coords[idx, 0]), 3),
                 "y": round(float(coords[idx, 1]), 3),
-                "cluster_id": int(labels[idx]),
+                "cluster_id": iid_to_new.get(iid, int(labels[idx])),
                 "title": info.get("title", ""),
                 "draft_name": info.get("draft_name", ""),
+                "wg": draft_wg.get(info.get("draft_name", ""), ""),
             })
     except Exception:
         pass
diff --git a/src/webui/templates/draft_detail.html b/src/webui/templates/draft_detail.html
index 5486d17..bc23be8 100644
--- a/src/webui/templates/draft_detail.html
+++ b/src/webui/templates/draft_detail.html
@@ -256,7 +256,7 @@
             </h2>
             <div class="flex flex-wrap gap-1.5">
                 {% for cat in draft.rating.categories %}
-                <a href="/drafts?cat={{ cat }}"
+                <a href="/drafts?cat={{ cat | urlencode }}"
                    class="px-2.5 py-1 rounded-full text-xs bg-slate-800/60 text-slate-400 border border-slate-700 hover:border-blue-500 hover:text-blue-400 transition">
                     {{ cat }}
                 </a>
@@ -275,16 +275,28 @@
             <div class="flex flex-wrap gap-1.5 max-h-48 overflow-y-auto">
                 {% for ref in draft.refs %}
                 {% if ref.type == 'rfc' %}
-                <a href="https://www.rfc-editor.org/rfc/{{ ref.id }}" target="_blank" rel="noopener"
+                <a href="https://www.rfc-editor.org/rfc/rfc{{ ref.id | int }}" target="_blank" rel="noopener"
                    class="px-2 py-0.5 rounded text-[10px] font-medium ref-rfc hover:opacity-80 transition">
-                    RFC {{ ref.id.replace('rfc', '') }}
+                    RFC {{ ref.id | int }}
                 </a>
                 {% elif ref.type == 'draft' %}
+                {% if ref.id in known_drafts %}
                 <a href="/drafts/{{ ref.id }}"
                    class="px-2 py-0.5 rounded text-[10px] font-medium ref-draft hover:opacity-80 transition">
                     {{ ref.id }}
                 </a>
                 {% else %}
+                <a href="https://datatracker.ietf.org/doc/{{ ref.id }}/" target="_blank" rel="noopener"
+                   class="px-2 py-0.5 rounded text-[10px] font-medium ref-draft hover:opacity-80 transition">
+                    {{ ref.id }}
+                </a>
+                {% endif %}
+                {% elif ref.type == 'bcp' %}
+                <a href="https://www.rfc-editor.org/info/bcp{{ ref.id }}" target="_blank" rel="noopener"
+                   class="px-2 py-0.5 rounded text-[10px] font-medium ref-other hover:opacity-80 transition">
+                    BCP {{ ref.id }}
+                </a>
+                {% else %}
                 <span class="px-2 py-0.5 rounded text-[10px] font-medium ref-other">
                     {{ ref.type|upper }} {{ ref.id }}
                 </span>
diff --git a/src/webui/templates/drafts.html b/src/webui/templates/drafts.html
index 04d6e14..e9a6e40 100644
--- a/src/webui/templates/drafts.html
+++ b/src/webui/templates/drafts.html
@@ -178,10 +178,10 @@
         {% if categories %}
         <div class="mt-4 pt-3 border-t border-slate-800/50">
             <div class="flex flex-wrap gap-1.5">
-                <a href="/drafts?q={{ search }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+                <a href="/drafts?q={{ search | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
                    class="cat-pill {% if not current_cat %}cat-pill-active{% endif %}">All</a>
                 {% for cat, count in categories.items() %}
-                <a href="/drafts?cat={{ cat }}&q={{ search }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+                <a href="/drafts?cat={{ cat }}&q={{ search | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
                    class="cat-pill {% if current_cat == cat %}cat-pill-active{% endif %}">
                     {{ cat }} <span class="opacity-50">{{ count }}</span>
                 </a>
@@ -326,7 +326,7 @@
 {% if result.pages > 1 %}
 <nav class="flex items-center justify-center gap-1.5 mt-6">
     {% if result.page > 1 %}
-    <a href="/drafts?page={{ result.page - 1 }}&q={{ search }}&cat={{ current_cat }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+    <a href="/drafts?page={{ result.page - 1 }}&q={{ search | urlencode }}&cat={{ current_cat | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
        class="page-btn page-btn-inactive">
         <svg class="w-4 h-4 inline" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M15 19l-7-7 7-7"/></svg>
         Prev
@@ -337,7 +337,7 @@
     {% set end_page = [result.pages, result.page + 2]|min %}
 
     {% if start_page > 1 %}
-    <a href="/drafts?page=1&q={{ search }}&cat={{ current_cat }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+    <a href="/drafts?page=1&q={{ search | urlencode }}&cat={{ current_cat | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
        class="page-btn page-btn-inactive">1</a>
     {% if start_page > 2 %}<span class="text-slate-600 px-1">...</span>{% endif %}
     {% endif %}
@@ -346,19 +346,19 @@
     {% if p == result.page %}
     <span class="page-btn page-btn-active">{{ p }}</span>
     {% else %}
-    <a href="/drafts?page={{ p }}&q={{ search }}&cat={{ current_cat }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+    <a href="/drafts?page={{ p }}&q={{ search | urlencode }}&cat={{ current_cat | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
        class="page-btn page-btn-inactive">{{ p }}</a>
     {% endif %}
     {% endfor %}
 
     {% if end_page < result.pages %}
     {% if end_page < result.pages - 1 %}<span class="text-slate-600 px-1">...</span>{% endif %}
-    <a href="/drafts?page={{ result.pages }}&q={{ search }}&cat={{ current_cat }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+    <a href="/drafts?page={{ result.pages }}&q={{ search | urlencode }}&cat={{ current_cat | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
        class="page-btn page-btn-inactive">{{ result.pages }}</a>
     {% endif %}
 
     {% if result.page < result.pages %}
-    <a href="/drafts?page={{ result.page + 1 }}&q={{ search }}&cat={{ current_cat }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
+    <a href="/drafts?page={{ result.page + 1 }}&q={{ search | urlencode }}&cat={{ current_cat | urlencode }}&min_score={{ min_score }}&sort={{ sort }}&dir={{ sort_dir }}"
        class="page-btn page-btn-inactive">
         Next
         <svg class="w-4 h-4 inline" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5l7 7-7 7"/></svg>
diff --git a/src/webui/templates/gap_demo.html b/src/webui/templates/gap_demo.html
index 05581ad..07ea3ad 100644
--- a/src/webui/templates/gap_demo.html
+++ b/src/webui/templates/gap_demo.html
@@ -101,7 +101,7 @@
 {% block extra_scripts %}
 <script>
 function downloadCurrentDraft() {
-    const text = {{ draft_text | tojson if draft_text else '""' }};
+    const text = {{ (draft_text or '') | tojson }};
     const filename = {{ (draft_info.filename if draft_info else 'draft.txt') | tojson }};
     if (!text) return;
     const blob = new Blob([text], { type: 'text/plain' });
diff --git a/src/webui/templates/ideas.html b/src/webui/templates/ideas.html
index 8c651cf..a9a921d 100644
--- a/src/webui/templates/ideas.html
+++ b/src/webui/templates/ideas.html
@@ -57,7 +57,7 @@
     <div class="divide-y divide-slate-800/50 max-h-[600px] overflow-y-auto" id="ideaList">
         {% for idea in data.ideas %}
         <div class="idea-item px-4 py-3 hover:bg-slate-800/50 transition"
-             data-search="{{ idea.title|lower }} {{ idea.description|lower }} {{ idea.draft_name|lower }}"
+             data-search="{{ (idea.title or '')|lower }} {{ (idea.description or '')|lower }} {{ (idea.draft_name or '')|lower }}"
              data-type="{{ idea.type|default('other', true)|lower }}">
             <div class="flex items-center gap-2 mb-1 flex-wrap">
                 <span class="text-sm font-medium text-slate-200">{{ idea.title }}</span>