updated tests
This commit is contained in:
@@ -19,6 +19,7 @@ from python.ebook_search.bm25_corpus import (
|
|||||||
BM25CorpusUnavailableError,
|
BM25CorpusUnavailableError,
|
||||||
BM25Manifest,
|
BM25Manifest,
|
||||||
ensure_bm25_corpus,
|
ensure_bm25_corpus,
|
||||||
|
fetch_bm25_corpus_records,
|
||||||
load_bm25_corpus,
|
load_bm25_corpus,
|
||||||
)
|
)
|
||||||
from python.ebook_search.config import EbookSearchConfig, RerankConfig, load_config, normalize_embedding_model
|
from python.ebook_search.config import EbookSearchConfig, RerankConfig, load_config, normalize_embedding_model
|
||||||
@@ -33,7 +34,7 @@ from python.ebook_search.search import (
|
|||||||
search_ebooks,
|
search_ebooks,
|
||||||
)
|
)
|
||||||
from python.ebook_search.timing import RuntimeStep
|
from python.ebook_search.timing import RuntimeStep
|
||||||
from python.orm.richie import EbookEmbeddingModel, EbookSource, RichieBase
|
from python.orm.richie import EbookChapter, EbookChunk, EbookEmbeddingModel, EbookSource, RichieBase
|
||||||
|
|
||||||
|
|
||||||
def test_chunk_text_uses_overlap() -> None:
|
def test_chunk_text_uses_overlap() -> None:
|
||||||
@@ -86,6 +87,47 @@ def test_find_existing_source_matches_path_or_hash() -> None:
|
|||||||
assert find_existing_source(session, Path("/new/book.epub"), "a" * 64) == source
|
assert find_existing_source(session, Path("/new/book.epub"), "a" * 64) == source
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_corpus_uses_existing_search_text_without_duplicate_metadata() -> None:
|
||||||
|
engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
|
||||||
|
RichieBase.metadata.create_all(engine)
|
||||||
|
with sessionmaker(bind=engine, expire_on_commit=False, future=True)() as session:
|
||||||
|
source = EbookSource(
|
||||||
|
title="Book",
|
||||||
|
author="Author",
|
||||||
|
language=None,
|
||||||
|
publisher=None,
|
||||||
|
identifier=None,
|
||||||
|
file_path="/book.epub",
|
||||||
|
file_sha256="a" * 64,
|
||||||
|
file_mtime=datetime.now(tz=UTC),
|
||||||
|
file_size=10,
|
||||||
|
)
|
||||||
|
session.add(source)
|
||||||
|
session.flush()
|
||||||
|
chapter = EbookChapter(source_id=source.id, spine_index=0, title="Chapter", href=None)
|
||||||
|
session.add(chapter)
|
||||||
|
session.flush()
|
||||||
|
session.add(
|
||||||
|
EbookChunk(
|
||||||
|
id=1,
|
||||||
|
source_id=source.id,
|
||||||
|
chapter_id=chapter.id,
|
||||||
|
chunk_index=0,
|
||||||
|
text="content",
|
||||||
|
token_start=0,
|
||||||
|
token_count=1,
|
||||||
|
page_label=None,
|
||||||
|
content_sha256="b" * 64,
|
||||||
|
search_text="Book Author Chapter content",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
records = fetch_bm25_corpus_records(session)
|
||||||
|
|
||||||
|
assert records[0]["bm25_text"] == "Book Author Chapter content"
|
||||||
|
|
||||||
|
|
||||||
def test_reciprocal_rank_fusion_marks_hybrid_source() -> None:
|
def test_reciprocal_rank_fusion_marks_hybrid_source() -> None:
|
||||||
vector_results = [SearchResult(chunk_id=1, text="a", source_title="A")]
|
vector_results = [SearchResult(chunk_id=1, text="a", source_title="A")]
|
||||||
lexical_results = [SearchResult(chunk_id=2, text="b", source_title="B")]
|
lexical_results = [SearchResult(chunk_id=2, text="b", source_title="B")]
|
||||||
@@ -119,7 +161,7 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None:
|
|||||||
def fake_vector_candidates(received_engine, query, _config):
|
def fake_vector_candidates(received_engine, query, _config):
|
||||||
"""Return vector candidates after confirming BM25 has started."""
|
"""Return vector candidates after confirming BM25 has started."""
|
||||||
received_engines.append(received_engine)
|
received_engines.append(received_engine)
|
||||||
assert query == "parallel"
|
assert query == "what is parallel"
|
||||||
vector_started.set()
|
vector_started.set()
|
||||||
assert bm25_started.wait(timeout=2)
|
assert bm25_started.wait(timeout=2)
|
||||||
return [SearchResult(chunk_id=1, text="vector", source_title="Vector", vector_score=0.9)]
|
return [SearchResult(chunk_id=1, text="vector", source_title="Vector", vector_score=0.9)]
|
||||||
@@ -135,13 +177,14 @@ def test_search_ebooks_runs_vector_and_bm25_in_parallel(monkeypatch) -> None:
|
|||||||
monkeypatch.setattr("python.ebook_search.search.bm25_candidates", fake_bm25_candidates)
|
monkeypatch.setattr("python.ebook_search.search.bm25_candidates", fake_bm25_candidates)
|
||||||
config = EbookSearchConfig(rerank=RerankConfig(enabled=False))
|
config = EbookSearchConfig(rerank=RerankConfig(enabled=False))
|
||||||
|
|
||||||
response = search_ebooks(engine, "parallel", config)
|
response = search_ebooks(engine, "what is parallel", config)
|
||||||
|
|
||||||
timings = {step.name: step for step in response.timings}
|
timings = {step.name: step for step in response.timings}
|
||||||
assert [result.chunk_id for result in response.results] == [1, 2]
|
assert [result.chunk_id for result in response.results] == [1, 2]
|
||||||
assert timings["Embedding + vector search"].counts_toward_total is False
|
assert timings["Embedding + vector search"].counts_toward_total is False
|
||||||
assert timings["BM25 search"].counts_toward_total is False
|
assert timings["BM25 search"].counts_toward_total is False
|
||||||
assert timings["Hybrid retrieval"].counts_toward_total is True
|
assert timings["Hybrid retrieval"].counts_toward_total is True
|
||||||
|
assert timings["BM25 query preparation"].counts_toward_total is True
|
||||||
assert received_engines == [engine]
|
assert received_engines == [engine]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ def test_reranking_enabled_reorders_candidates(monkeypatch: pytest.MonkeyPatch)
|
|||||||
results = rerank_chunks("query", candidates(), RerankConfig())
|
results = rerank_chunks("query", candidates(), RerankConfig())
|
||||||
|
|
||||||
assert [result.chunk_id for result in results] == [2, 1, 3]
|
assert [result.chunk_id for result in results] == [2, 1, 3]
|
||||||
assert [round(result.score, 3) for result in results] == [0.45, 0.1, 0.0]
|
assert [round(result.score, 3) for result in results] == [0.78, 0.37, 0.28]
|
||||||
assert [result.rerank_score for result in results] == [0.9, 0.1, 0.4]
|
assert [result.rerank_score for result in results] == [0.9, 0.1, 0.4]
|
||||||
|
|
||||||
|
|
||||||
@@ -100,8 +100,8 @@ def test_reranking_cannot_ignore_hybrid_score(monkeypatch: pytest.MonkeyPatch) -
|
|||||||
results = rerank_chunks("query", candidates, RerankConfig())
|
results = rerank_chunks("query", candidates, RerankConfig())
|
||||||
|
|
||||||
assert [result.chunk_id for result in results] == [1, 2]
|
assert [result.chunk_id for result in results] == [1, 2]
|
||||||
assert results[0].score == 0.7
|
assert results[0].score == pytest.approx(0.79)
|
||||||
assert results[1].score == 0.0
|
assert results[1].score == 0.7
|
||||||
assert results[1].rerank_score == 1.0
|
assert results[1].rerank_score == 1.0
|
||||||
|
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ def test_malformed_vllm_rerank_json_does_not_crash_search(monkeypatch: pytest.Mo
|
|||||||
|
|
||||||
results = rerank_chunks("query", candidates()[:1], RerankConfig())
|
results = rerank_chunks("query", candidates()[:1], RerankConfig())
|
||||||
|
|
||||||
assert results[0].score == 0.0
|
assert results[0].score == 0.3
|
||||||
|
|
||||||
|
|
||||||
def test_vllm_rerank_scores_are_clamped(monkeypatch: pytest.MonkeyPatch) -> None:
|
def test_vllm_rerank_scores_are_clamped(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
@@ -147,4 +147,4 @@ def test_vllm_rerank_scores_are_clamped(monkeypatch: pytest.MonkeyPatch) -> None
|
|||||||
|
|
||||||
results = rerank_chunks("query", candidates()[:2], RerankConfig())
|
results = rerank_chunks("query", candidates()[:2], RerankConfig())
|
||||||
|
|
||||||
assert [result.rerank_score for result in results] == [0.0, 1.0]
|
assert {result.chunk_id: result.rerank_score for result in results} == {1: 0.0, 2: 1.0}
|
||||||
|
|||||||
Reference in New Issue
Block a user