fix(ebook-search): code cleanup to improve reliability and readability

This commit is contained in:
2026-06-18 12:45:56 -04:00
parent 6ae1ff1f5c
commit 6bc30115d9
2 changed files with 11 additions and 14 deletions
+1 -1
View File
@@ -42,7 +42,7 @@ class RerankConfig(BaseSettings):
model_config = SettingsConfigDict(env_prefix="EBOOK_SEARCH_RERANK_", frozen=True, protected_namespaces=())
enabled: bool = False
enabled: bool = True
base_url: str = "http://192.168.90.25:8001"
model: str = "qwen3-reranker-06b"
candidates: int = 24
+10 -13
View File
@@ -4,6 +4,7 @@ from __future__ import annotations
import logging
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, replace
from typing import TYPE_CHECKING
@@ -93,14 +94,11 @@ def search_ebooks(
logger.info("ebook_search_start query_length=%s rerank=%s", len(query), rerank)
timings: list[RuntimeStep] = []
bm25_query, timing = timed_result("BM25 query preparation", retrieval_query_from_text, query)
timings.append(timing)
retrieval, timing = timed_result(
"Hybrid retrieval",
parallel_retrieval,
engine,
query,
bm25_query,
config,
)
timings.extend(retrieval.timings)
@@ -134,8 +132,7 @@ def search_ebooks(
def parallel_retrieval(
engine: Engine,
vector_query: str,
bm25_query: str,
query: str,
config: EbookSearchConfig,
) -> RetrievalResponse:
"""Run vector and BM25 candidate retrieval concurrently with separate database sessions."""
@@ -145,14 +142,14 @@ def parallel_retrieval(
"Embedding + vector search",
vector_candidates,
engine,
vector_query,
query,
config,
)
bm25_future = executor.submit(
timed_result,
"BM25 search",
bm25_candidates,
bm25_query,
query,
config,
)
vector_results, vector_timing = vector_future.result()
@@ -263,7 +260,8 @@ def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult]
logger.info("ebook_bm25_search_complete corpus=0 candidates=0")
return []
scored_records = score_bm25_corpus(query, corpus, limit=config.bm25_candidate_limit)
bm25_query = retrieval_query_from_text(query)
scored_records = score_bm25_corpus(bm25_query, corpus, limit=config.bm25_candidate_limit)
results = [
replace(search_result_from_row(record), score=score, vector_score=None, bm25_score=score)
for record, score in scored_records
@@ -282,24 +280,23 @@ def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult]
def reciprocal_rank_fusion(
vector_results: list[SearchResult],
lexical_results: list[SearchResult],
*,
rank_constant: int = 60,
rank_constant: int,
) -> list[SearchResult]:
"""Fuse vector and lexical rankings with Reciprocal Rank Fusion."""
by_chunk: dict[int, SearchResult] = {}
scores: dict[int, float] = {}
scores: defaultdict[int, float] = defaultdict(float)
vector_scores: dict[int, float] = {}
bm25_scores: dict[int, float] = {}
for rank, result in enumerate(vector_results, start=1):
by_chunk.setdefault(result.chunk_id, result)
vector_scores[result.chunk_id] = result.vector_score if result.vector_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank))
scores[result.chunk_id] += 1 / (rank_constant + rank)
for rank, result in enumerate(lexical_results, start=1):
by_chunk.setdefault(result.chunk_id, result)
bm25_scores[result.chunk_id] = result.bm25_score if result.bm25_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank))
scores[result.chunk_id] += 1 / (rank_constant + rank)
return sorted(
(