fix(ebook-search): code cleanup to improve reliability and readability

This commit is contained in:
2026-06-18 12:45:56 -04:00
parent 6ae1ff1f5c
commit 6bc30115d9
2 changed files with 11 additions and 14 deletions
+1 -1
View File
@@ -42,7 +42,7 @@ class RerankConfig(BaseSettings):
model_config = SettingsConfigDict(env_prefix="EBOOK_SEARCH_RERANK_", frozen=True, protected_namespaces=()) model_config = SettingsConfigDict(env_prefix="EBOOK_SEARCH_RERANK_", frozen=True, protected_namespaces=())
enabled: bool = False enabled: bool = True
base_url: str = "http://192.168.90.25:8001" base_url: str = "http://192.168.90.25:8001"
model: str = "qwen3-reranker-06b" model: str = "qwen3-reranker-06b"
candidates: int = 24 candidates: int = 24
+10 -13
View File
@@ -4,6 +4,7 @@ from __future__ import annotations
import logging import logging
import re import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, replace from dataclasses import dataclass, replace
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@@ -93,14 +94,11 @@ def search_ebooks(
logger.info("ebook_search_start query_length=%s rerank=%s", len(query), rerank) logger.info("ebook_search_start query_length=%s rerank=%s", len(query), rerank)
timings: list[RuntimeStep] = [] timings: list[RuntimeStep] = []
bm25_query, timing = timed_result("BM25 query preparation", retrieval_query_from_text, query)
timings.append(timing)
retrieval, timing = timed_result( retrieval, timing = timed_result(
"Hybrid retrieval", "Hybrid retrieval",
parallel_retrieval, parallel_retrieval,
engine, engine,
query, query,
bm25_query,
config, config,
) )
timings.extend(retrieval.timings) timings.extend(retrieval.timings)
@@ -134,8 +132,7 @@ def search_ebooks(
def parallel_retrieval( def parallel_retrieval(
engine: Engine, engine: Engine,
vector_query: str, query: str,
bm25_query: str,
config: EbookSearchConfig, config: EbookSearchConfig,
) -> RetrievalResponse: ) -> RetrievalResponse:
"""Run vector and BM25 candidate retrieval concurrently with separate database sessions.""" """Run vector and BM25 candidate retrieval concurrently with separate database sessions."""
@@ -145,14 +142,14 @@ def parallel_retrieval(
"Embedding + vector search", "Embedding + vector search",
vector_candidates, vector_candidates,
engine, engine,
vector_query, query,
config, config,
) )
bm25_future = executor.submit( bm25_future = executor.submit(
timed_result, timed_result,
"BM25 search", "BM25 search",
bm25_candidates, bm25_candidates,
bm25_query, query,
config, config,
) )
vector_results, vector_timing = vector_future.result() vector_results, vector_timing = vector_future.result()
@@ -263,7 +260,8 @@ def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult]
logger.info("ebook_bm25_search_complete corpus=0 candidates=0") logger.info("ebook_bm25_search_complete corpus=0 candidates=0")
return [] return []
scored_records = score_bm25_corpus(query, corpus, limit=config.bm25_candidate_limit) bm25_query = retrieval_query_from_text(query)
scored_records = score_bm25_corpus(bm25_query, corpus, limit=config.bm25_candidate_limit)
results = [ results = [
replace(search_result_from_row(record), score=score, vector_score=None, bm25_score=score) replace(search_result_from_row(record), score=score, vector_score=None, bm25_score=score)
for record, score in scored_records for record, score in scored_records
@@ -282,24 +280,23 @@ def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult]
def reciprocal_rank_fusion( def reciprocal_rank_fusion(
vector_results: list[SearchResult], vector_results: list[SearchResult],
lexical_results: list[SearchResult], lexical_results: list[SearchResult],
*, rank_constant: int,
rank_constant: int = 60,
) -> list[SearchResult]: ) -> list[SearchResult]:
"""Fuse vector and lexical rankings with Reciprocal Rank Fusion.""" """Fuse vector and lexical rankings with Reciprocal Rank Fusion."""
by_chunk: dict[int, SearchResult] = {} by_chunk: dict[int, SearchResult] = {}
scores: dict[int, float] = {} scores: defaultdict[int, float] = defaultdict(float)
vector_scores: dict[int, float] = {} vector_scores: dict[int, float] = {}
bm25_scores: dict[int, float] = {} bm25_scores: dict[int, float] = {}
for rank, result in enumerate(vector_results, start=1): for rank, result in enumerate(vector_results, start=1):
by_chunk.setdefault(result.chunk_id, result) by_chunk.setdefault(result.chunk_id, result)
vector_scores[result.chunk_id] = result.vector_score if result.vector_score is not None else result.score vector_scores[result.chunk_id] = result.vector_score if result.vector_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank)) scores[result.chunk_id] += 1 / (rank_constant + rank)
for rank, result in enumerate(lexical_results, start=1): for rank, result in enumerate(lexical_results, start=1):
by_chunk.setdefault(result.chunk_id, result) by_chunk.setdefault(result.chunk_id, result)
bm25_scores[result.chunk_id] = result.bm25_score if result.bm25_score is not None else result.score bm25_scores[result.chunk_id] = result.bm25_score if result.bm25_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank)) scores[result.chunk_id] += 1 / (rank_constant + rank)
return sorted( return sorted(
( (