feat: add timing logs for dense and sparse retrieval processes and adjust top K results in sparse retriever

This commit is contained in:
Soulter
2025-10-24 17:51:30 +08:00
parent 9b004f3d2f
commit 4e9cce76da
2 changed files with 20 additions and 1 deletions

View File

@@ -3,6 +3,8 @@
协调稠密检索、稀疏检索和 Rerank,提供统一的检索接口
"""
import time
from dataclasses import dataclass
from typing import List
@@ -104,25 +106,40 @@ class RetrievalManager:
kb_ids = new_kb_ids
# 1. 稠密检索
time_start = time.time()
dense_results = await self._dense_retrieve(
query=query,
kb_ids=kb_ids,
kb_options=kb_options,
)
time_end = time.time()
logger.debug(
f"Dense retrieval across {len(kb_ids)} bases took {time_end - time_start:.2f}s and returned {len(dense_results)} results."
)
# 2. 稀疏检索
time_start = time.time()
sparse_results = await self.sparse_retriever.retrieve(
query=query,
kb_ids=kb_ids,
kb_options=kb_options,
)
time_end = time.time()
logger.debug(
f"Sparse retrieval across {len(kb_ids)} bases took {time_end - time_start:.2f}s and returned {len(sparse_results)} results."
)
# 3. 结果融合
time_start = time.time()
fused_results = await self.rank_fusion.fuse(
dense_results=dense_results,
sparse_results=sparse_results,
top_k=top_k_fusion,
)
time_end = time.time()
logger.debug(
f"Rank fusion took {time_end - time_start:.2f}s and returned {len(fused_results)} results."
)
# 4. 转换为 RetrievalResult (获取元数据)
retrieval_results = []

View File

@@ -68,6 +68,7 @@ class SparseRetriever:
List[SparseResult]: 检索结果列表
"""
# 1. 获取所有相关块
top_k_sparse = 0
chunks = []
for kb_id in kb_ids:
vec_db: FaissVecDB = kb_options.get(kb_id, {}).get("vec_db")
@@ -88,6 +89,7 @@ class SparseRetriever:
for doc, chunk_md in zip(result, chunk_mds)
]
chunks.extend(result)
top_k_sparse += kb_options.get(kb_id, {}).get("top_k_sparse", 50)
if not chunks:
return []
@@ -127,4 +129,4 @@ class SparseRetriever:
results.sort(key=lambda x: x.score, reverse=True)
# return results[: len(results) // len(kb_ids)]
return results
return results[:top_k_sparse]