book search engine #18
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING
|
|||||||
|
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from python.ebook_search.bm25_corpus import refresh_bm25_corpus
|
from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
@@ -56,3 +56,5 @@ def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None:
|
|||||||
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
|
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
refresh_bm25_corpus(session, config)
|
refresh_bm25_corpus(session, config)
|
||||||
|
load_bm25_corpus.cache_clear()
|
||||||
|
logger.info("ebook_bm25_corpus_cache_cleared_after_refresh")
|
||||||
|
|||||||
@@ -108,11 +108,10 @@ def refresh_bm25_corpus(
|
|||||||
)
|
)
|
||||||
write_bm25_corpus(index_path, records, manifest)
|
write_bm25_corpus(index_path, records, manifest)
|
||||||
logger.info(
|
logger.info(
|
||||||
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s note=%s",
|
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s",
|
||||||
index_path,
|
index_path,
|
||||||
manifest.chunk_count,
|
manifest.chunk_count,
|
||||||
manifest.created_at.isoformat(),
|
manifest.created_at.isoformat(),
|
||||||
"restart_service_to_use_refreshed_bm25_cache",
|
|
||||||
)
|
)
|
||||||
return manifest
|
return manifest
|
||||||
|
|
||||||
@@ -121,15 +120,10 @@ def refresh_bm25_corpus(
|
|||||||
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
|
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
|
||||||
"""Load the BM25 corpus into memory once per process.
|
"""Load the BM25 corpus into memory once per process.
|
||||||
|
|
||||||
This cache intentionally does not notice later on-disk corpus refreshes. Restart the service after rebuilding the
|
Background refresh tasks clear this cache after rebuilding the on-disk corpus.
|
||||||
BM25 corpus for searches to use the new index.
|
|
||||||
"""
|
"""
|
||||||
index_path = bm25_index_path(config)
|
index_path = bm25_index_path(config)
|
||||||
logger.info(
|
logger.info("ebook_bm25_corpus_cache_load path=%s", index_path)
|
||||||
"ebook_bm25_corpus_cache_load path=%s note=%s",
|
|
||||||
index_path,
|
|
||||||
"restart_service_after_bm25_refresh",
|
|
||||||
)
|
|
||||||
manifest = read_bm25_manifest(index_path)
|
manifest = read_bm25_manifest(index_path)
|
||||||
if manifest is None or not bm25_index_exists(index_path, manifest):
|
if manifest is None or not bm25_index_exists(index_path, manifest):
|
||||||
msg = f"BM25 corpus is not available: {index_path}"
|
msg = f"BM25 corpus is not available: {index_path}"
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
|
|
||||||
|
from python.ebook_search.api.bm25_tasks import refresh_bm25_for_engine
|
||||||
from python.ebook_search.api.main import create_app
|
from python.ebook_search.api.main import create_app
|
||||||
from python.ebook_search.config import EbookSearchConfig, RerankConfig
|
from python.ebook_search.config import EbookSearchConfig, RerankConfig
|
||||||
from python.ebook_search.embeddings import EmbeddingModelStats
|
from python.ebook_search.embeddings import EmbeddingModelStats
|
||||||
@@ -232,6 +233,29 @@ def test_ui_scan_schedules_bm25_refresh_after_database_change(monkeypatch) -> No
|
|||||||
assert scheduled is True
|
assert scheduled is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_refresh_clears_loaded_corpus_cache(monkeypatch) -> None:
|
||||||
|
refreshed: list[object] = []
|
||||||
|
cache_cleared = False
|
||||||
|
|
||||||
|
def fake_refresh_bm25_corpus(session, config):
|
||||||
|
refreshed.append((session, config))
|
||||||
|
|
||||||
|
def fake_cache_clear():
|
||||||
|
nonlocal cache_cleared
|
||||||
|
cache_cleared = True
|
||||||
|
|
||||||
|
monkeypatch.setattr("python.ebook_search.api.bm25_tasks.refresh_bm25_corpus", fake_refresh_bm25_corpus)
|
||||||
|
monkeypatch.setattr("python.ebook_search.api.bm25_tasks.load_bm25_corpus.cache_clear", fake_cache_clear)
|
||||||
|
engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
|
||||||
|
config = EbookSearchConfig(rerank=RerankConfig(enabled=False))
|
||||||
|
|
||||||
|
refresh_bm25_for_engine(engine, config)
|
||||||
|
|
||||||
|
assert len(refreshed) == 1
|
||||||
|
assert refreshed[0][1] == config
|
||||||
|
assert cache_cleared is True
|
||||||
|
|
||||||
|
|
||||||
def test_admin_page_shows_embedding_counts_by_model(monkeypatch) -> None:
|
def test_admin_page_shows_embedding_counts_by_model(monkeypatch) -> None:
|
||||||
def fake_embedding_model_stats(_session):
|
def fake_embedding_model_stats(_session):
|
||||||
return [
|
return [
|
||||||
|
|||||||
Reference in New Issue
Block a user