Add proper cache invalidation to load_bm25_corpus

This commit is contained in:
2026-06-12 13:47:43 -04:00
parent c4e8a395d2
commit 6bb6f935b1
3 changed files with 30 additions and 10 deletions
+3 -1
View File
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING
from sqlalchemy.orm import Session
from python.ebook_search.bm25_corpus import refresh_bm25_corpus
from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus
if TYPE_CHECKING:
from fastapi import FastAPI
@@ -56,3 +56,5 @@ def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None:
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
with Session(engine) as session:
refresh_bm25_corpus(session, config)
load_bm25_corpus.cache_clear()
logger.info("ebook_bm25_corpus_cache_cleared_after_refresh")
+3 -9
View File
@@ -108,11 +108,10 @@ def refresh_bm25_corpus(
)
write_bm25_corpus(index_path, records, manifest)
logger.info(
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s note=%s",
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s",
index_path,
manifest.chunk_count,
manifest.created_at.isoformat(),
"restart_service_to_use_refreshed_bm25_cache",
)
return manifest
@@ -121,15 +120,10 @@ def refresh_bm25_corpus(
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
"""Load the BM25 corpus into memory once per process.
This cache intentionally does not notice later on-disk corpus refreshes. Restart the service after rebuilding the
BM25 corpus for searches to use the new index.
Background refresh tasks clear this cache after rebuilding the on-disk corpus.
"""
index_path = bm25_index_path(config)
logger.info(
"ebook_bm25_corpus_cache_load path=%s note=%s",
index_path,
"restart_service_after_bm25_refresh",
)
logger.info("ebook_bm25_corpus_cache_load path=%s", index_path)
manifest = read_bm25_manifest(index_path)
if manifest is None or not bm25_index_exists(index_path, manifest):
msg = f"BM25 corpus is not available: {index_path}"