fix: deduplicate draft revisions in search results
Different revisions of the same draft (e.g. draft-foo-bar-00, -01, -02) were showing up as separate results. Search now keeps only the highest-scoring revision per base draft name.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -93,21 +93,40 @@ class HybridSearch:
|
||||
self._embeddings_cache_time = now
|
||||
return self._embeddings_cache
|
||||
|
||||
@staticmethod
def _base_draft_name(name: str) -> str:
    """Return *name* with a trailing revision suffix removed.

    The suffix is a hyphen followed by two or three digits at the end of
    the name; the result is used as the key for de-duplicating revisions.
    Names without such a suffix are returned unchanged.

    e.g. 'draft-wang-cats-odsi-02' → 'draft-wang-cats-odsi'
    """
    revision = re.search(r'-\d{2,3}$', name)
    if revision is None:
        return name
    # Splice around the match rather than truncating: `$` may match just
    # before a final newline, and that newline must be kept.
    return name[:revision.start()] + name[revision.end():]
|
||||
|
||||
def search(self, query: str, top_k: int = 10) -> list[dict]:
    """Combine FTS5 keyword search + embedding similarity search.

    Runs the keyword search and the embedding search, fuses the two
    result lists with reciprocal rank fusion, then collapses multiple
    revisions of the same draft into a single entry.

    Args:
        query: The search text passed to both search backends.
        top_k: Maximum number of results to return.

    Returns:
        At most ``top_k`` result dicts, in the order produced by the
        merge step, with one entry per base draft name. Per the original
        documentation each dict has the keys name, title, score, excerpt
        and match_type; only ``name`` is read here.

    When the embedding search returns nothing (the original note gives
    Ollama being unavailable as the cause), the keyword results are used
    on their own.
    """
    # Fetch three times the requested count from each backend so that
    # top_k entries can still be returned after revisions are collapsed.
    candidate_limit = top_k * 3
    fts_results = self._fts_search(query, limit=candidate_limit)
    embed_results = self._embedding_search(query, limit=candidate_limit)

    if embed_results:
        merged = self._reciprocal_rank_fusion(fts_results, embed_results)
    else:
        merged = fts_results

    # Keep only the first entry seen for each base draft name. This keeps
    # the highest-scoring revision provided `merged` is ordered best-first
    # (NOTE(review): the ordering comes from helpers not shown here).
    seen_bases: set[str] = set()
    deduped = []
    for result in merged:
        base = self._base_draft_name(result["name"])
        if base in seen_bases:
            continue  # lower-ranked revision of a draft already kept
        seen_bases.add(base)
        deduped.append(result)

    return deduped[:top_k]
|
||||
|
||||
@staticmethod
|
||||
def sanitize_fts_query(query: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user