improved BM25 write
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
from datetime import UTC, datetime
|
||||
from os import environ
|
||||
@@ -21,6 +22,9 @@ from python.ebook_search.bm25_corpus import (
|
||||
ensure_bm25_corpus,
|
||||
fetch_bm25_corpus_records,
|
||||
load_bm25_corpus,
|
||||
read_bm25_manifest,
|
||||
score_bm25_corpus,
|
||||
write_bm25_corpus,
|
||||
)
|
||||
from python.ebook_search.config import EbookSearchConfig, RerankConfig, load_config, normalize_embedding_model
|
||||
from python.ebook_search.embeddings import MODEL_DIMENSIONS, ensure_embedding_models
|
||||
@@ -123,9 +127,11 @@ def test_bm25_corpus_uses_existing_search_text_without_duplicate_metadata() -> N
|
||||
)
|
||||
session.commit()
|
||||
|
||||
records = fetch_bm25_corpus_records(session)
|
||||
records, texts = fetch_bm25_corpus_records(session)
|
||||
|
||||
assert records[0]["bm25_text"] == "Book Author Chapter content"
|
||||
assert texts == ["Book Author Chapter content"]
|
||||
assert records[0]["chunk_id"] == 1
|
||||
assert "bm25_text" not in records[0]
|
||||
|
||||
|
||||
def test_reciprocal_rank_fusion_marks_hybrid_source() -> None:
|
||||
@@ -227,15 +233,106 @@ def test_bm25_candidates_scores_whole_corpus(monkeypatch) -> None:
|
||||
assert [result.bm25_score for result in results] == [1.5]
|
||||
|
||||
|
||||
def test_bm25_candidates_raises_when_corpus_is_unavailable(monkeypatch) -> None:
|
||||
def test_bm25_candidates_returns_empty_when_corpus_is_unavailable(monkeypatch, caplog) -> None:
|
||||
def fake_load_bm25_corpus(_config):
|
||||
raise BM25CorpusUnavailableError
|
||||
|
||||
monkeypatch.setattr("python.ebook_search.search.load_bm25_corpus", fake_load_bm25_corpus)
|
||||
config = EbookSearchConfig(rerank=RerankConfig(enabled=False))
|
||||
|
||||
with pytest.raises(BM25CorpusUnavailableError):
|
||||
bm25_candidates("high", config)
|
||||
with caplog.at_level(logging.WARNING):
|
||||
results = bm25_candidates("high", config)
|
||||
|
||||
assert results == []
|
||||
assert "ebook_bm25_index_unavailable_skipping" in caplog.text
|
||||
|
||||
|
||||
def test_write_bm25_corpus_publishes_dated_generation(tmp_path) -> None:
    """Writing a corpus adds a new generation and repoints ``current`` at it.

    The index starts with one older generation referenced by the ``current``
    symlink. After ``write_bm25_corpus`` runs, the test expects ``current`` to
    reference the generation named after the manifest timestamp, the older
    generation to be left intact, and the manifest to read back unchanged.
    """
    previous_name = "20260101T000000.000000Z"
    published_name = "20260612T010203.456789Z"

    index_path = tmp_path / "bm25"
    generations_path = index_path / "generations"
    old_generation = generations_path / previous_name
    # Create the tree top-down, one level at a time.
    for directory in (index_path, generations_path, old_generation):
        directory.mkdir()

    # The sentinel lets us detect whether the old generation was touched.
    sentinel = old_generation / "sentinel"
    sentinel.write_text("old", encoding="utf-8")

    current_path = index_path / "current"
    current_path.symlink_to(Path("generations") / previous_name, target_is_directory=True)

    stamped_at = datetime(2026, 6, 12, 1, 2, 3, 456789, tzinfo=UTC)
    manifest = BM25Manifest(created_at=stamped_at, db_updated_at=None, chunk_count=0)

    write_bm25_corpus(index_path, [], [], manifest)

    new_generation = generations_path / published_name
    assert current_path.is_symlink()
    assert current_path.readlink() == new_generation
    assert old_generation.is_dir()
    assert sentinel.read_text(encoding="utf-8") == "old"
    assert new_generation.is_dir()
    assert read_bm25_manifest(index_path) == manifest
|
||||
|
||||
|
||||
def test_write_bm25_corpus_keeps_current_generation_when_publish_fails(monkeypatch, tmp_path) -> None:
    """A failed publish leaves ``current`` and the old generation untouched.

    ``Path.replace`` is patched so that replacing ``current`` from a sibling
    whose name starts with ``.current.`` raises ``OSError``; every other
    replace call is forwarded to the real implementation. The test expects
    the error to propagate, the symlink to keep its original target, and no
    directory for the new generation to remain.
    """
    previous_name = "20260101T000000.000000Z"
    failure_text = "current publish failed"

    index_path = tmp_path / "bm25"
    generations_path = index_path / "generations"
    old_generation = generations_path / previous_name
    # Create the tree top-down, one level at a time.
    for directory in (index_path, generations_path, old_generation):
        directory.mkdir()

    sentinel = old_generation / "sentinel"
    sentinel.write_text("old", encoding="utf-8")

    original_target = Path("generations") / previous_name
    current_path = index_path / "current"
    current_path.symlink_to(original_target, target_is_directory=True)

    manifest = BM25Manifest(
        created_at=datetime(2026, 6, 12, 1, 2, 3, 456789, tzinfo=UTC),
        db_updated_at=None,
        chunk_count=0,
    )

    # Keep a handle on the real method so unrelated replaces still work.
    real_replace = Path.replace

    def fail_current_replace(self, target):
        publishes_current = (
            self.parent == index_path and self.name.startswith(".current.") and target == current_path
        )
        if not publishes_current:
            return real_replace(self, target)
        msg = "current publish failed"
        raise OSError(msg)

    monkeypatch.setattr(Path, "replace", fail_current_replace)

    with pytest.raises(OSError, match=failure_text):
        write_bm25_corpus(index_path, [], [], manifest)

    assert current_path.readlink() == original_target
    assert sentinel.read_text(encoding="utf-8") == "old"
    assert not (generations_path / "20260612T010203.456789Z").exists()
|
||||
|
||||
|
||||
def test_load_bm25_corpus_uses_current_generation(tmp_path) -> None:
    """A corpus written to disk can be loaded back through the config path.

    Writes a one-record corpus, loads it via ``load_bm25_corpus`` using a
    config whose ``bm25_index_dir`` is the written index, and checks the
    manifest, the stored record, and that scoring a matching query yields a
    truthy result. The loader cache is cleared before and after the load.
    """
    load_bm25_corpus.cache_clear()

    index_path = tmp_path / "bm25"
    stamped_at = datetime(2026, 6, 12, 1, 2, 3, 456789, tzinfo=UTC)
    manifest = BM25Manifest(created_at=stamped_at, db_updated_at=None, chunk_count=1)
    records = [
        {
            "chunk_id": 2,
            "text": "cached",
            "source_title": "B",
            "source_author": None,
            "chapter_title": None,
            "page_label": None,
        },
    ]
    write_bm25_corpus(index_path, records, ["cached phrase"], manifest)

    rerank_off = RerankConfig(enabled=False)
    config = EbookSearchConfig(rerank=rerank_off, bm25_index_dir=str(index_path))

    try:
        corpus = load_bm25_corpus(config)
    finally:
        # Always drop the cached entry, even when the load raises.
        load_bm25_corpus.cache_clear()

    assert corpus.manifest == manifest
    assert corpus.records[0]["chunk_id"] == 2
    assert score_bm25_corpus("cached", corpus, limit=10)
|
||||
|
||||
|
||||
def test_load_bm25_corpus_caches_disk_load(monkeypatch, tmp_path) -> None:
|
||||
|
||||
Reference in New Issue
Block a user