Fix broken reference links and web UI bugs
- Fix RFC URLs with leading zeros (rfc0020 -> rfc20) via int filter
- Draft refs: internal link for drafts in our DB, Datatracker for external
- BCP refs: link to rfc-editor.org/info/bcpN
- Add DB connection teardown (@app.teardown_appcontext)
- Fix JS syntax error in gap_demo.html (HTML-escaped string in script tag)
- Add URL encoding to all query params in drafts.html and draft_detail.html
- Fix variable shadowing of Flask's g import in gaps_demo()
- Add None safety for ideas search data attribute

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -551,8 +551,15 @@ def get_author_network_full(db: Database) -> dict:
|
||||
|
||||
|
||||
def get_idea_clusters(db: Database) -> dict:
|
||||
"""Cluster ideas by embedding similarity, return clusters + t-SNE scatter."""
|
||||
"""Cluster ideas by embedding similarity, return clusters + t-SNE scatter.
|
||||
|
||||
Uses Ward linkage on L2-normalized embeddings (approximates cosine) with
|
||||
a target of ~30 clusters for readable groupings. Enriches each cluster
|
||||
with WG info and category breakdown.
|
||||
"""
|
||||
import json as _json
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import normalize as sk_normalize
|
||||
|
||||
embeddings = db.all_idea_embeddings()
|
||||
if not embeddings:
|
||||
@@ -563,37 +570,51 @@ def get_idea_clusters(db: Database) -> dict:
|
||||
idea_map = {r["id"]: {"title": r["title"], "description": r["description"],
|
||||
"type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows}
|
||||
|
||||
# Draft -> WG and category lookup
|
||||
draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall()
|
||||
draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows}
|
||||
draft_title_map = {r["name"]: r["title"] for r in draft_rows}
|
||||
rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall()
|
||||
draft_cats: dict[str, list[str]] = {}
|
||||
for r in rating_rows:
|
||||
try:
|
||||
draft_cats[r["draft_name"]] = _json.loads(r["categories"]) if r["categories"] else []
|
||||
except (_json.JSONDecodeError, TypeError):
|
||||
draft_cats[r["draft_name"]] = []
|
||||
|
||||
# Build matrix from embeddings that have matching ideas
|
||||
idea_ids = [iid for iid in embeddings if iid in idea_map]
|
||||
if len(idea_ids) < 5:
|
||||
return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}
|
||||
|
||||
matrix = np.array([embeddings[iid] for iid in idea_ids])
|
||||
matrix_norm = sk_normalize(matrix)
|
||||
|
||||
# Agglomerative clustering with cosine distance
|
||||
# Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size
|
||||
n_target = max(10, min(40, len(idea_ids) // 12))
|
||||
try:
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=None, distance_threshold=0.5,
|
||||
metric='cosine', linkage='average',
|
||||
)
|
||||
labels = clustering.fit_predict(matrix)
|
||||
clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward')
|
||||
labels = clustering.fit_predict(matrix_norm)
|
||||
except Exception:
|
||||
return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True}
|
||||
|
||||
# Build cluster data
|
||||
cluster_ideas: dict[int, list] = defaultdict(list)
|
||||
cluster_ideas_map: dict[int, list] = defaultdict(list)
|
||||
for idx, iid in enumerate(idea_ids):
|
||||
cluster_ideas[labels[idx]].append(iid)
|
||||
cluster_ideas_map[labels[idx]].append(iid)
|
||||
|
||||
# Filter to clusters with 2+ ideas
|
||||
stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", "on", "by", "is", "as", "at", "from", "that", "this", "it"}
|
||||
stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with",
|
||||
"on", "by", "is", "as", "at", "from", "that", "this", "it",
|
||||
"based", "using", "protocol", "mechanism", "framework", "system",
|
||||
"network", "agent", "agents"}
|
||||
clusters = []
|
||||
for cid in sorted(cluster_ideas.keys()):
|
||||
members = cluster_ideas[cid]
|
||||
if len(members) < 2:
|
||||
continue
|
||||
for cid in sorted(cluster_ideas_map.keys()):
|
||||
members = cluster_ideas_map[cid]
|
||||
ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map]
|
||||
if len(ideas_in_cluster) < 2:
|
||||
continue
|
||||
|
||||
# Theme: most common significant words in titles
|
||||
words = Counter()
|
||||
for idea in ideas_in_cluster:
|
||||
@@ -605,29 +626,72 @@ def get_idea_clusters(db: Database) -> dict:
|
||||
theme = " ".join(top_words).title() if top_words else f"Cluster {cid}"
|
||||
|
||||
drafts = list({idea["draft_name"] for idea in ideas_in_cluster})
|
||||
|
||||
# Enrich: WG breakdown
|
||||
wg_counts: dict[str, int] = Counter()
|
||||
cat_counts: dict[str, int] = Counter()
|
||||
for dname in drafts:
|
||||
wg = draft_wg.get(dname, "none")
|
||||
wg_counts[wg] += 1
|
||||
for cat in draft_cats.get(dname, []):
|
||||
cat_counts[cat] += 1
|
||||
|
||||
wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)]
|
||||
cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)]
|
||||
cross_wg = len([w for w in wg_counts if w != "none"]) >= 2
|
||||
|
||||
clusters.append({
|
||||
"id": len(clusters),
|
||||
"theme": theme,
|
||||
"size": len(ideas_in_cluster),
|
||||
"ideas": ideas_in_cluster[:20],
|
||||
"drafts": drafts,
|
||||
"wgs": wg_list,
|
||||
"categories": cat_list,
|
||||
"cross_wg": cross_wg,
|
||||
"wg_count": len(wg_counts),
|
||||
})
|
||||
|
||||
clusters.sort(key=lambda c: c["size"], reverse=True)
|
||||
|
||||
# Build mapping: original cluster label -> sorted index
|
||||
# Each cluster remembers which original label it came from via its member ids
|
||||
old_label_to_new: dict[int, int] = {}
|
||||
for new_idx, c in enumerate(clusters):
|
||||
c["id"] = new_idx
|
||||
# Find original label for any member of this cluster
|
||||
for old_cid, members in cluster_ideas_map.items():
|
||||
if members and members[0] in [iid for iid in members if iid in idea_map]:
|
||||
member_titles = {idea_map[m]["title"] for m in members if m in idea_map}
|
||||
c_titles = {idea["title"] for idea in c["ideas"]}
|
||||
if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]):
|
||||
old_label_to_new[old_cid] = new_idx
|
||||
break
|
||||
|
||||
# Fallback: build from idea_id -> label mapping
|
||||
iid_to_new: dict[int, int] = {}
|
||||
for old_cid, members in cluster_ideas_map.items():
|
||||
new_idx = old_label_to_new.get(old_cid, old_cid)
|
||||
for iid in members:
|
||||
iid_to_new[iid] = new_idx
|
||||
|
||||
# t-SNE for scatter
|
||||
scatter = []
|
||||
try:
|
||||
from sklearn.manifold import TSNE
|
||||
perp = min(30, len(idea_ids) - 1)
|
||||
tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500)
|
||||
coords = tsne.fit_transform(matrix)
|
||||
coords = tsne.fit_transform(matrix_norm)
|
||||
|
||||
for idx, iid in enumerate(idea_ids):
|
||||
info = idea_map.get(iid, {})
|
||||
scatter.append({
|
||||
"x": round(float(coords[idx, 0]), 3),
|
||||
"y": round(float(coords[idx, 1]), 3),
|
||||
"cluster_id": int(labels[idx]),
|
||||
"cluster_id": iid_to_new.get(iid, int(labels[idx])),
|
||||
"title": info.get("title", ""),
|
||||
"draft_name": info.get("draft_name", ""),
|
||||
"wg": draft_wg.get(info.get("draft_name", ""), ""),
|
||||
})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user