From 2229e70c737a87d12b5ac8f6417da936865e8a99 Mon Sep 17 00:00:00 2001
From: Christian Nennemann
Date: Mon, 9 Mar 2026 04:07:38 +0100
Subject: [PATCH] fix: deduplicate draft revisions in search results

Different revisions of the same draft (e.g. draft-foo-bar-00, -01, -02)
were showing up as separate results. Now keeps only the highest-scoring
revision per base draft name.

Co-Authored-By: Claude Opus 4.6
---
 src/ietf_analyzer/search.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/ietf_analyzer/search.py b/src/ietf_analyzer/search.py
index 88faf53..4c3407b 100644
--- a/src/ietf_analyzer/search.py
+++ b/src/ietf_analyzer/search.py
@@ -93,21 +93,40 @@ class HybridSearch:
         self._embeddings_cache_time = now
         return self._embeddings_cache
 
+    @staticmethod
+    def _base_draft_name(name: str) -> str:
+        """Strip revision suffix to get the base draft name for dedup.
+
+        e.g. 'draft-wang-cats-odsi-02' → 'draft-wang-cats-odsi'
+        """
+        return re.sub(r'-\d{2,3}$', '', name)
+
     def search(self, query: str, top_k: int = 10) -> list[dict]:
         """Combine FTS5 keyword search + embedding similarity search.
 
         Returns ranked list of {name, title, score, excerpt, match_type}.
         Falls back to FTS5-only if Ollama is unavailable.
+        Deduplicates draft revisions, keeping the highest-scoring version.
         """
-        fts_results = self._fts_search(query, limit=top_k * 2)
-        embed_results = self._embedding_search(query, limit=top_k * 2)
+        fts_results = self._fts_search(query, limit=top_k * 3)
+        embed_results = self._embedding_search(query, limit=top_k * 3)
 
         if embed_results:
             merged = self._reciprocal_rank_fusion(fts_results, embed_results)
         else:
             merged = fts_results
 
-        return merged[:top_k]
+        # Deduplicate revisions of the same draft, keep best score
+        seen_bases: dict[str, int] = {}
+        deduped = []
+        for r in merged:
+            base = self._base_draft_name(r["name"])
+            if base not in seen_bases:
+                seen_bases[base] = len(deduped)
+                deduped.append(r)
+            # else: skip lower-ranked revision
+
+        return deduped[:top_k]
 
     @staticmethod
     def sanitize_fts_query(query: str) -> str: