From 94493647a60822829502ea5459c35fe22a3c3894 Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Fri, 12 Jun 2026 13:36:45 -0400 Subject: [PATCH] updated tests --- tests/test_ebook_search_core.py | 49 +++++++++++++++++++++++++++++-- tests/test_ebook_search_rerank.py | 10 +++---- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/tests/test_ebook_search_core.py b/tests/test_ebook_search_core.py index 75ba762..c4b05cc 100644 --- a/tests/test_ebook_search_core.py +++ b/tests/test_ebook_search_core.py @@ -19,6 +19,7 @@ from python.ebook_search.bm25_corpus import ( BM25CorpusUnavailableError, BM25Manifest, ensure_bm25_corpus, + fetch_bm25_corpus_records, load_bm25_corpus, ) from python.ebook_search.config import EbookSearchConfig, RerankConfig, load_config, normalize_embedding_model @@ -33,7 +34,7 @@ from python.ebook_search.search import ( search_ebooks, ) from python.ebook_search.timing import RuntimeStep -from python.orm.richie import EbookEmbeddingModel, EbookSource, RichieBase +from python.orm.richie import EbookChapter, EbookChunk, EbookEmbeddingModel, EbookSource, RichieBase def test_chunk_text_uses_overlap() -> None: @@ -86,6 +87,47 @@ def test_find_existing_source_matches_path_or_hash() -> None: assert find_existing_source(session, Path("/new/book.epub"), "a" * 64) == source +def test_bm25_corpus_uses_existing_search_text_without_duplicate_metadata() -> None: + engine = create_engine("sqlite+pysqlite:///:memory:", future=True) + RichieBase.metadata.create_all(engine) + with sessionmaker(bind=engine, expire_on_commit=False, future=True)() as session: + source = EbookSource( + title="Book", + author="Author", + language=None, + publisher=None, + identifier=None, + file_path="/book.epub", + file_sha256="a" * 64, + file_mtime=datetime.now(tz=UTC), + file_size=10, + ) + session.add(source) + session.flush() + chapter = EbookChapter(source_id=source.id, spine_index=0, title="Chapter", href=None) + session.add(chapter) + session.flush() + session.add( + EbookChunk( + id=1, + source_id=source.id, + chapter_id=chapter.id, + chunk_index=0, + text="content", + token_start=0, + token_count=1, + page_label=None, + content_sha256="b" * 64, + search_text="Book Author Chapter content", + ) + ) + session.commit() + + records = fetch_bm25_corpus_records(session) + + assert records[0]["bm25_text"] == "Book Author Chapter content" + + def test_reciprocal_rank_fusion_marks_hybrid_source() -> None: vector_results = [SearchResult(chunk_id=1, text="a", source_title="A")] lexical_results = [SearchResult(chunk_id=2, text="b", source_title="B")] @@ -119,7 +161,7 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None: def fake_vector_candidates(received_engine, query, _config): """Return vector candidates after confirming BM25 has started.""" received_engines.append(received_engine) - assert query == "parallel" + assert query == "what is parallel" vector_started.set() assert bm25_started.wait(timeout=2) return [SearchResult(chunk_id=1, text="vector", source_title="Vector", vector_score=0.9)] @@ -135,13 +177,14 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None: monkeypatch.setattr("python.ebook_search.search.bm25_candidates", fake_bm25_candidates) config = EbookSearchConfig(rerank=RerankConfig(enabled=False)) - response = search_ebooks(engine, "parallel", config) + response = search_ebooks(engine, "what is parallel", config) timings = {step.name: step for step in response.timings} assert [result.chunk_id for result in response.results] == [1, 2] assert timings["Embedding + vector search"].counts_toward_total is False assert timings["BM25 search"].counts_toward_total is False assert timings["Hybrid retrieval"].counts_toward_total is True + assert timings["BM25 query preparation"].counts_toward_total is True assert received_engines == [engine] diff --git a/tests/test_ebook_search_rerank.py b/tests/test_ebook_search_rerank.py index 7ccae46..db53333 100644 --- a/tests/test_ebook_search_rerank.py +++ b/tests/test_ebook_search_rerank.py @@ -75,7 +75,7 @@ def test_reranking_enabled_reorders_candidates(monkeypatch: pytest.MonkeyPatch) results = rerank_chunks("query", candidates(), RerankConfig()) assert [result.chunk_id for result in results] == [2, 1, 3] - assert [round(result.score, 3) for result in results] == [0.45, 0.1, 0.0] + assert [round(result.score, 3) for result in results] == [0.78, 0.37, 0.28] assert [result.rerank_score for result in results] == [0.9, 0.1, 0.4] @@ -100,8 +100,8 @@ def test_reranking_cannot_ignore_hybrid_score(monkeypatch: pytest.MonkeyPatch) - results = rerank_chunks("query", candidates, RerankConfig()) assert [result.chunk_id for result in results] == [1, 2] - assert results[0].score == 0.7 - assert results[1].score == 0.0 + assert results[0].score == pytest.approx(0.79) + assert results[1].score == 0.7 assert results[1].rerank_score == 1.0 @@ -129,7 +129,7 @@ def test_malformed_vllm_rerank_json_does_not_crash_search(monkeypatch: pytest.Mo results = rerank_chunks("query", candidates()[:1], RerankConfig()) - assert results[0].score == 0.0 + assert results[0].score == 0.3 def test_vllm_rerank_scores_are_clamped(monkeypatch: pytest.MonkeyPatch) -> None: @@ -147,4 +147,4 @@ def test_vllm_rerank_scores_are_clamped(monkeypatch: pytest.MonkeyPatch) -> None results = rerank_chunks("query", candidates()[:2], RerankConfig()) - assert [result.rerank_score for result in results] == [0.0, 1.0] + assert {result.chunk_id: result.rerank_score for result in results} == {1: 0.0, 2: 1.0}