Update ebook search tests: cover BM25 corpus search_text reuse and BM25 query-preparation timing

This commit is contained in:
2026-06-12 13:36:45 -04:00
parent d51ed42919
commit c4e8a395d2
2 changed files with 51 additions and 8 deletions
+46 -3
View File
@@ -19,6 +19,7 @@ from python.ebook_search.bm25_corpus import (
BM25CorpusUnavailableError,
BM25Manifest,
ensure_bm25_corpus,
fetch_bm25_corpus_records,
load_bm25_corpus,
)
from python.ebook_search.config import EbookSearchConfig, RerankConfig, load_config, normalize_embedding_model
@@ -33,7 +34,7 @@ from python.ebook_search.search import (
search_ebooks,
)
from python.ebook_search.timing import RuntimeStep
from python.orm.richie import EbookEmbeddingModel, EbookSource, RichieBase
from python.orm.richie import EbookChapter, EbookChunk, EbookEmbeddingModel, EbookSource, RichieBase
def test_chunk_text_uses_overlap() -> None:
@@ -86,6 +87,47 @@ def test_find_existing_source_matches_path_or_hash() -> None:
assert find_existing_source(session, Path("/new/book.epub"), "a" * 64) == source
def test_bm25_corpus_uses_existing_search_text_without_duplicate_metadata() -> None:
    """Check that a BM25 corpus record reuses the chunk's stored ``search_text`` verbatim.

    The chunk is stored with a ``search_text`` that already contains the book
    title, author and chapter title. Requiring ``bm25_text`` to equal it exactly
    means none of that metadata may be added a second time.
    """
    memory_engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
    RichieBase.metadata.create_all(memory_engine)
    session_factory = sessionmaker(bind=memory_engine, expire_on_commit=False, future=True)

    with session_factory() as session:
        # Parent rows are flushed before their ids are read by the child rows below.
        book = EbookSource(
            title="Book",
            author="Author",
            language=None,
            publisher=None,
            identifier=None,
            file_path="/book.epub",
            file_sha256="a" * 64,
            file_mtime=datetime.now(tz=UTC),
            file_size=10,
        )
        session.add(book)
        session.flush()

        opening_chapter = EbookChapter(source_id=book.id, spine_index=0, title="Chapter", href=None)
        session.add(opening_chapter)
        session.flush()

        only_chunk = EbookChunk(
            id=1,
            source_id=book.id,
            chapter_id=opening_chapter.id,
            chunk_index=0,
            text="content",
            token_start=0,
            token_count=1,
            page_label=None,
            content_sha256="b" * 64,
            search_text="Book Author Chapter content",
        )
        session.add(only_chunk)
        session.commit()

        corpus_records = fetch_bm25_corpus_records(session)
        first_record = corpus_records[0]
        assert first_record["bm25_text"] == "Book Author Chapter content"
def test_reciprocal_rank_fusion_marks_hybrid_source() -> None:
vector_results = [SearchResult(chunk_id=1, text="a", source_title="A")]
lexical_results = [SearchResult(chunk_id=2, text="b", source_title="B")]
@@ -119,7 +161,7 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None:
def fake_vector_candidates(received_engine, query, _config):
"""Return vector candidates after confirming BM25 has started."""
received_engines.append(received_engine)
assert query == "parallel"
assert query == "what is parallel"
vector_started.set()
assert bm25_started.wait(timeout=2)
return [SearchResult(chunk_id=1, text="vector", source_title="Vector", vector_score=0.9)]
@@ -135,13 +177,14 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None:
monkeypatch.setattr("python.ebook_search.search.bm25_candidates", fake_bm25_candidates)
config = EbookSearchConfig(rerank=RerankConfig(enabled=False))
response = search_ebooks(engine, "parallel", config)
response = search_ebooks(engine, "what is parallel", config)
timings = {step.name: step for step in response.timings}
assert [result.chunk_id for result in response.results] == [1, 2]
assert timings["Embedding + vector search"].counts_toward_total is False
assert timings["BM25 search"].counts_toward_total is False
assert timings["Hybrid retrieval"].counts_toward_total is True
assert timings["BM25 query preparation"].counts_toward_total is True
assert received_engines == [engine]