From 6bb6f935b1329040ab4b0e72555f88920bd239f8 Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Fri, 12 Jun 2026 13:47:43 -0400 Subject: [PATCH] added proper cache invalidation to load_bm25_corpus --- python/ebook_search/api/bm25_tasks.py | 4 +++- python/ebook_search/bm25_corpus.py | 12 +++--------- tests/test_ebook_search_ui.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/python/ebook_search/api/bm25_tasks.py b/python/ebook_search/api/bm25_tasks.py index a211d45..ff24b85 100644 --- a/python/ebook_search/api/bm25_tasks.py +++ b/python/ebook_search/api/bm25_tasks.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING from sqlalchemy.orm import Session -from python.ebook_search.bm25_corpus import refresh_bm25_corpus +from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus if TYPE_CHECKING: from fastapi import FastAPI @@ -56,3 +56,5 @@ def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None: """Refresh the BM25 corpus using a SQLAlchemy engine.""" with Session(engine) as session: refresh_bm25_corpus(session, config) + load_bm25_corpus.cache_clear() + logger.info("ebook_bm25_corpus_cache_cleared_after_refresh") diff --git a/python/ebook_search/bm25_corpus.py b/python/ebook_search/bm25_corpus.py index 6c8a15d..2d3904f 100644 --- a/python/ebook_search/bm25_corpus.py +++ b/python/ebook_search/bm25_corpus.py @@ -108,11 +108,10 @@ def refresh_bm25_corpus( ) write_bm25_corpus(index_path, records, manifest) logger.info( - "ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s note=%s", + "ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s", index_path, manifest.chunk_count, manifest.created_at.isoformat(), - "restart_service_to_use_refreshed_bm25_cache", ) return manifest @@ -121,15 +120,10 @@ def refresh_bm25_corpus( def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus: """Load the BM25 corpus into memory once per process. - This cache intentionally does not notice later on-disk corpus refreshes. Restart the service after rebuilding the - BM25 corpus for searches to use the new index. + Background refresh tasks clear this cache after rebuilding the on-disk corpus. """ index_path = bm25_index_path(config) - logger.info( - "ebook_bm25_corpus_cache_load path=%s note=%s", - index_path, - "restart_service_after_bm25_refresh", - ) + logger.info("ebook_bm25_corpus_cache_load path=%s", index_path) manifest = read_bm25_manifest(index_path) if manifest is None or not bm25_index_exists(index_path, manifest): msg = f"BM25 corpus is not available: {index_path}" diff --git a/tests/test_ebook_search_ui.py b/tests/test_ebook_search_ui.py index 03b31b2..3de8eac 100644 --- a/tests/test_ebook_search_ui.py +++ b/tests/test_ebook_search_ui.py @@ -5,6 +5,7 @@ from __future__ import annotations from fastapi.testclient import TestClient from sqlalchemy import create_engine +from python.ebook_search.api.bm25_tasks import refresh_bm25_for_engine from python.ebook_search.api.main import create_app from python.ebook_search.config import EbookSearchConfig, RerankConfig from python.ebook_search.embeddings import EmbeddingModelStats @@ -232,6 +233,29 @@ def test_ui_scan_schedules_bm25_refresh_after_database_change(monkeypatch) -> No assert scheduled is True +def test_bm25_refresh_clears_loaded_corpus_cache(monkeypatch) -> None: + refreshed: list[object] = [] + cache_cleared = False + + def fake_refresh_bm25_corpus(session, config): + refreshed.append((session, config)) + + def fake_cache_clear(): + nonlocal cache_cleared + cache_cleared = True + + monkeypatch.setattr("python.ebook_search.api.bm25_tasks.refresh_bm25_corpus", fake_refresh_bm25_corpus) + monkeypatch.setattr("python.ebook_search.api.bm25_tasks.load_bm25_corpus.cache_clear", fake_cache_clear) + engine = create_engine("sqlite+pysqlite:///:memory:", future=True) + config = EbookSearchConfig(rerank=RerankConfig(enabled=False)) + + refresh_bm25_for_engine(engine, config) + + assert len(refreshed) == 1 + assert refreshed[0][1] == config + assert cache_cleared is True + + def test_admin_page_shows_embedding_counts_by_model(monkeypatch) -> None: def fake_embedding_model_stats(_session): return [