added proper cache invalidation to load_bm25_corpus

This commit is contained in:
2026-06-12 13:47:43 -04:00
parent 70f24cdbc6
commit c5418b50fd
3 changed files with 30 additions and 10 deletions
+3 -1
View File
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING
from sqlalchemy.orm import Session
from python.ebook_search.bm25_corpus import refresh_bm25_corpus
from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus
if TYPE_CHECKING:
from fastapi import FastAPI
@@ -56,3 +56,5 @@ def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None:
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
with Session(engine) as session:
refresh_bm25_corpus(session, config)
load_bm25_corpus.cache_clear()
logger.info("ebook_bm25_corpus_cache_cleared_after_refresh")
+3 -9
View File
@@ -108,11 +108,10 @@ def refresh_bm25_corpus(
)
write_bm25_corpus(index_path, records, manifest)
logger.info(
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s note=%s",
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s",
index_path,
manifest.chunk_count,
manifest.created_at.isoformat(),
"restart_service_to_use_refreshed_bm25_cache",
)
return manifest
@@ -121,15 +120,10 @@ def refresh_bm25_corpus(
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
"""Load the BM25 corpus into memory once per process.
This cache intentionally does not notice later on-disk corpus refreshes. Restart the service after rebuilding the
BM25 corpus for searches to use the new index.
Background refresh tasks clear this cache after rebuilding the on-disk corpus.
"""
index_path = bm25_index_path(config)
logger.info(
"ebook_bm25_corpus_cache_load path=%s note=%s",
index_path,
"restart_service_after_bm25_refresh",
)
logger.info("ebook_bm25_corpus_cache_load path=%s", index_path)
manifest = read_bm25_manifest(index_path)
if manifest is None or not bm25_index_exists(index_path, manifest):
msg = f"BM25 corpus is not available: {index_path}"
+24
View File
@@ -5,6 +5,7 @@ from __future__ import annotations
from fastapi.testclient import TestClient
from sqlalchemy import create_engine
from python.ebook_search.api.bm25_tasks import refresh_bm25_for_engine
from python.ebook_search.api.main import create_app
from python.ebook_search.config import EbookSearchConfig, RerankConfig
from python.ebook_search.embeddings import EmbeddingModelStats
@@ -232,6 +233,29 @@ def test_ui_scan_schedules_bm25_refresh_after_database_change(monkeypatch) -> No
assert scheduled is True
def test_bm25_refresh_clears_loaded_corpus_cache(monkeypatch) -> None:
    """Check that refresh_bm25_for_engine rebuilds the corpus, then clears the load cache.

    Both collaborators are replaced with fakes that append to a shared event log, so the
    test can assert not only that each was invoked but also the order they ran in: the
    cache must be cleared after the refresh, otherwise a load between the two steps could
    re-cache the old on-disk corpus.
    """
    events: list[str] = []
    refresh_calls: list[tuple[object, object]] = []

    def fake_refresh_bm25_corpus(session, config):
        # Stand-in for the real rebuild; records its arguments and when it ran.
        refresh_calls.append((session, config))
        events.append("refresh")

    def fake_cache_clear():
        # Stand-in for load_bm25_corpus.cache_clear; records when it ran.
        events.append("cache_clear")

    monkeypatch.setattr("python.ebook_search.api.bm25_tasks.refresh_bm25_corpus", fake_refresh_bm25_corpus)
    monkeypatch.setattr("python.ebook_search.api.bm25_tasks.load_bm25_corpus.cache_clear", fake_cache_clear)
    engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
    config = EbookSearchConfig(rerank=RerankConfig(enabled=False))

    try:
        refresh_bm25_for_engine(engine, config)
    finally:
        # Release the engine's connection pool so the test leaves nothing behind.
        engine.dispose()

    assert len(refresh_calls) == 1
    assert refresh_calls[0][1] == config
    # Order matters: the refresh must run first, then the cache clear.
    assert events == ["refresh", "cache_clear"]
def test_admin_page_shows_embedding_counts_by_model(monkeypatch) -> None:
def fake_embedding_model_stats(_session):
return [