book search engine #18
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from python.ebook_search.bm25_corpus import refresh_bm25_corpus
|
||||
from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fastapi import FastAPI
|
||||
@@ -56,3 +56,5 @@ def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None:
|
||||
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
|
||||
with Session(engine) as session:
|
||||
refresh_bm25_corpus(session, config)
|
||||
load_bm25_corpus.cache_clear()
|
||||
logger.info("ebook_bm25_corpus_cache_cleared_after_refresh")
|
||||
|
||||
@@ -108,11 +108,10 @@ def refresh_bm25_corpus(
|
||||
)
|
||||
write_bm25_corpus(index_path, records, manifest)
|
||||
logger.info(
|
||||
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s note=%s",
|
||||
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s",
|
||||
index_path,
|
||||
manifest.chunk_count,
|
||||
manifest.created_at.isoformat(),
|
||||
"restart_service_to_use_refreshed_bm25_cache",
|
||||
)
|
||||
return manifest
|
||||
|
||||
@@ -121,15 +120,10 @@ def refresh_bm25_corpus(
|
||||
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
|
||||
"""Load the BM25 corpus into memory once per process.
|
||||
|
||||
This cache intentionally does not notice later on-disk corpus refreshes. Restart the service after rebuilding the
|
||||
BM25 corpus for searches to use the new index.
|
||||
Background refresh tasks clear this cache after rebuilding the on-disk corpus.
|
||||
"""
|
||||
index_path = bm25_index_path(config)
|
||||
logger.info(
|
||||
"ebook_bm25_corpus_cache_load path=%s note=%s",
|
||||
index_path,
|
||||
"restart_service_after_bm25_refresh",
|
||||
)
|
||||
logger.info("ebook_bm25_corpus_cache_load path=%s", index_path)
|
||||
manifest = read_bm25_manifest(index_path)
|
||||
if manifest is None or not bm25_index_exists(index_path, manifest):
|
||||
msg = f"BM25 corpus is not available: {index_path}"
|
||||
|
||||
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
from fastapi.testclient import TestClient
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from python.ebook_search.api.bm25_tasks import refresh_bm25_for_engine
|
||||
from python.ebook_search.api.main import create_app
|
||||
from python.ebook_search.config import EbookSearchConfig, RerankConfig
|
||||
from python.ebook_search.embeddings import EmbeddingModelStats
|
||||
@@ -232,6 +233,29 @@ def test_ui_scan_schedules_bm25_refresh_after_database_change(monkeypatch) -> No
|
||||
assert scheduled is True
|
||||
|
||||
|
||||
def test_bm25_refresh_clears_loaded_corpus_cache(monkeypatch) -> None:
    """Check that ``refresh_bm25_for_engine`` rebuilds the corpus and then clears the loader cache.

    Both collaborators are replaced with fakes so no real corpus is built. The fakes record the order
    in which they run: clearing the cache *before* the rebuild would let a concurrent load re-cache the
    stale corpus, so the test pins "refresh first, cache_clear second" rather than only checking that
    each call happened.
    """
    refreshed: list[tuple[object, object]] = []
    # Ordered log of which patched collaborator ran, used to assert sequencing below.
    calls: list[str] = []

    def fake_refresh_bm25_corpus(session, config):
        refreshed.append((session, config))
        calls.append("refresh")

    def fake_cache_clear():
        calls.append("cache_clear")

    monkeypatch.setattr("python.ebook_search.api.bm25_tasks.refresh_bm25_corpus", fake_refresh_bm25_corpus)
    monkeypatch.setattr("python.ebook_search.api.bm25_tasks.load_bm25_corpus.cache_clear", fake_cache_clear)
    engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
    config = EbookSearchConfig(rerank=RerankConfig(enabled=False))

    try:
        refresh_bm25_for_engine(engine, config)
    finally:
        # Release any pooled connections so the test does not leak the engine.
        engine.dispose()

    assert len(refreshed) == 1
    assert refreshed[0][1] == config
    # The cache must be cleared exactly once, and only after the corpus refresh has run.
    assert calls == ["refresh", "cache_clear"]
|
||||
|
||||
|
||||
def test_admin_page_shows_embedding_counts_by_model(monkeypatch) -> None:
|
||||
def fake_embedding_model_stats(_session):
|
||||
return [
|
||||
|
||||
Reference in New Issue
Block a user