diff --git a/overlays/default.nix b/overlays/default.nix index 7e13012..11521f7 100644 --- a/overlays/default.nix +++ b/overlays/default.nix @@ -55,6 +55,7 @@ polars psycopg pydantic + pydantic-settings pyfakefs pytest pytest-cov diff --git a/pyproject.toml b/pyproject.toml index 8903e6c..83fae49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "polars", "psycopg[binary]", "pydantic", + "pydantic-settings", "python-multipart", "sqlalchemy", "tenacity", diff --git a/python/ebook_search/api/routes/admin.py b/python/ebook_search/api/routes/admin.py index 4a14875..5784191 100644 --- a/python/ebook_search/api/routes/admin.py +++ b/python/ebook_search/api/routes/admin.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from dataclasses import replace from fastapi import APIRouter, Request from fastapi.responses import HTMLResponse @@ -17,7 +16,6 @@ from python.ebook_search.ingest import ingest_configured_paths logger = logging.getLogger(__name__) router = APIRouter(prefix="/admin") -EMBED_ALL_BATCH_SIZE = 32 @router.get("", response_class=HTMLResponse) @@ -70,7 +68,7 @@ def embed_all(request: Request) -> HTMLResponse: """Embed all chunks missing vectors in fixed-size batches.""" total = 0 batches = 0 - config = replace(request.app.state.config, embedding_batch_size=EMBED_ALL_BATCH_SIZE) + config = request.app.state.config try: with Session(request.app.state.engine) as session: while True: @@ -103,5 +101,5 @@ def embed_all(request: Request) -> HTMLResponse: return templates.TemplateResponse( request, "partials/admin_status.html", - {"message": f"Embedded {total} chunks in {batches} batches of {EMBED_ALL_BATCH_SIZE}"}, + {"message": f"Embedded {total} chunks in {batches} batches of {config.embedding_batch_size}"}, ) diff --git a/python/ebook_search/config.py b/python/ebook_search/config.py index 8d21274..8a65c6e 100644 --- a/python/ebook_search/config.py +++ b/python/ebook_search/config.py @@ -2,88 +2,15 @@ from __future__ import annotations -from dataclasses import dataclass from os import getenv +from typing import Annotated + +from pydantic import AliasChoices, Field, field_validator +from pydantic_settings import BaseSettings, NoDecode, SettingsConfigDict -def getenv_bool(name: str, *, default: bool) -> bool: - """Read a boolean environment variable with a default fallback.""" - value = getenv(name) - if value is None: - return default - return value.strip().lower() in {"1", "true", "yes", "on"} - - -def getenv_int(name: str, *, default: int) -> int: - """Read an integer environment variable with a default fallback.""" - value = getenv(name) - if value is None or not value.strip(): - return default - return int(value) - - -@dataclass(frozen=True) -class RerankConfig: - """vLLM reranker settings.""" - - enabled: bool = False - base_url: str = "http://192.168.90.25:8001" - model: str = "qwen3-reranker-06b" - candidates: int = 24 - timeout_seconds: float = 30.0 - - -@dataclass(frozen=True) -class EbookSearchConfig: - """Runtime settings for EPUB search.""" - - rerank: RerankConfig - top_k: int = 12 - library_paths: tuple[str, ...] = () - vllm_base_url: str = "https://ollama.com/v1" - vllm_api_key: str = "not-needed" - chat_model: str = "deepseek-v4-flash" - answer_enabled: bool = True - embedding_base_url: str = "http://192.168.90.25:8000/v1" - embedding_api_key: str = "not-needed" - embedding_model: str = "qwen3-embedding-0.6b" - embedding_batch_size: int = 32 - bm25_index_dir: str = ".ebook_search_bm25" - bm25_refresh_delay_seconds: int = 60 - - -def load_rerank_config() -> RerankConfig: - """Load reranker config from environment variables.""" - return RerankConfig( - enabled=getenv_bool("EBOOK_SEARCH_RERANK_ENABLED", default=False), - base_url=getenv("EBOOK_SEARCH_RERANK_BASE_URL", "http://192.168.90.25:8001"), - model=getenv("EBOOK_SEARCH_RERANK_MODEL", "qwen3-reranker-06b"), - candidates=getenv_int("EBOOK_SEARCH_RERANK_CANDIDATES", default=24), - timeout_seconds=float(getenv_int("EBOOK_SEARCH_RERANK_TIMEOUT_SECONDS", default=30)), - ) - - -def load_config() -> EbookSearchConfig: - """Load EPUB search config from environment variables.""" - return EbookSearchConfig( - rerank=load_rerank_config(), - top_k=getenv_int("EBOOK_SEARCH_TOP_K", default=12), - library_paths=library_paths_from_env(), - vllm_base_url=getenv("EBOOK_SEARCH_VLLM_BASE_URL", "https://ollama.com/v1"), - vllm_api_key=getenv("EBOOK_SEARCH_VLLM_API_KEY") or getenv("OLLAMA_API_KEY") or "not-needed", - chat_model=getenv("EBOOK_SEARCH_CHAT_MODEL", "deepseek-v4-flash"), - answer_enabled=getenv_bool("EBOOK_SEARCH_ANSWER_ENABLED", default=True), - embedding_base_url=getenv("EBOOK_SEARCH_EMBEDDING_BASE_URL", "http://192.168.90.25:8000/v1"), - embedding_api_key=getenv("EBOOK_SEARCH_EMBEDDING_API_KEY", "not-needed"), - embedding_model=normalize_embedding_model(), - embedding_batch_size=getenv_int("EBOOK_SEARCH_EMBEDDING_BATCH_SIZE", default=32), - bm25_index_dir=getenv("EBOOK_SEARCH_BM25_INDEX_DIR", ".ebook_search_bm25"), - bm25_refresh_delay_seconds=getenv_int("EBOOK_SEARCH_BM25_REFRESH_DELAY_SECONDS", default=60), - ) - - -def normalize_embedding_model(default: str = "qwen3-embedding-0.6b") -> str: - """Normalize supported embedding aliases to provider model names.""" +def normalize_embedding_alias(model: str) -> str: + """Normalize a supported embedding alias to its provider model name.""" aliases = { "Qwen3-Embedding-0.6B": "qwen3-embedding-0.6b", "Qwen3-Embedding-4B": "qwen3-embedding-4b", @@ -98,20 +25,86 @@ def normalize_embedding_model(default: str = "qwen3-embedding-0.6b") -> str: "qwen3-embedding-4b": "qwen3-embedding-4b", "qwen3-embedding-8b": "qwen3-embedding-8b", } - - model = getenv("EBOOK_SEARCH_EMBEDDING_MODEL", default) standard_model = aliases.get(model) - if standard_model is None: error = f"Embedding model {model} is not supported. Supported models are {aliases.keys()}" raise ValueError(error) - return standard_model -def library_paths_from_env() -> tuple[str, ...]: - """Read configured EPUB library paths from the environment.""" - value = getenv("EBOOK_SEARCH_LIBRARY_PATHS") - if value is None: - return () - return tuple(path for path in value.split(":") if path) +def normalize_embedding_model(default: str = "qwen3-embedding-0.6b") -> str: + """Normalize the configured embedding alias to its provider model name.""" + return normalize_embedding_alias(getenv("EBOOK_SEARCH_EMBEDDING_MODEL", default)) + + +class RerankConfig(BaseSettings): + """vLLM reranker settings.""" + + model_config = SettingsConfigDict(env_prefix="EBOOK_SEARCH_RERANK_", frozen=True, protected_namespaces=()) + + enabled: bool = False + base_url: str = "http://192.168.90.25:8001" + model: str = "qwen3-reranker-06b" + candidates: int = 24 + timeout_seconds: float = 30.0 + score_weight: float = 0.7 + hybrid_weight: float = 0.3 + + +class EbookSearchConfig(BaseSettings): + """Runtime settings for EPUB search.""" + + model_config = SettingsConfigDict( + env_prefix="EBOOK_SEARCH_", + frozen=True, + populate_by_name=True, + protected_namespaces=(), + ) + + rerank: RerankConfig = Field(default_factory=RerankConfig) + top_k: int = 12 + library_paths: Annotated[tuple[str, ...], NoDecode] = () + chunk_tokens: int = 700 + chunk_overlap: int = 100 + vllm_base_url: str = "https://ollama.com/v1" + vllm_api_key: str = Field( + default="not-needed", + validation_alias=AliasChoices("EBOOK_SEARCH_VLLM_API_KEY", "OLLAMA_API_KEY"), + ) + chat_model: str = "deepseek-v4-flash" + answer_enabled: bool = True + embedding_base_url: str = "http://192.168.90.25:8000/v1" + embedding_api_key: str = "not-needed" + embedding_model: str = "qwen3-embedding-0.6b" + embedding_batch_size: int = 32 + embedding_timeout_seconds: float = 60.0 + chat_timeout_seconds: float = 60.0 + vector_candidate_multiplier: int = 4 + bm25_candidate_limit: int = 120 + rrf_rank_constant: int = 60 + bm25_index_dir: str = ".ebook_search_bm25" + bm25_refresh_delay_seconds: int = 60 + + @field_validator("library_paths", mode="before") + @classmethod + def split_library_paths(cls, value: object) -> object: + """Split a colon-separated library path string into a tuple of paths.""" + if isinstance(value, str): + return tuple(path for path in value.split(":") if path) + return value + + @field_validator("embedding_model") + @classmethod + def normalize_embedding(cls, value: str) -> str: + """Normalize the configured embedding alias to its provider model name.""" + return normalize_embedding_alias(value) + + +def load_rerank_config() -> RerankConfig: + """Load reranker config from environment variables.""" + return RerankConfig() + + +def load_config() -> EbookSearchConfig: + """Load EPUB search config from environment variables.""" + return EbookSearchConfig() diff --git a/python/ebook_search/ingest.py b/python/ebook_search/ingest.py index 2b8e44a..eec235f 100644 --- a/python/ebook_search/ingest.py +++ b/python/ebook_search/ingest.py @@ -79,17 +79,17 @@ def ingest_configured_paths(session: Session, config: EbookSearchConfig) -> int: path = Path(library_path).expanduser() logger.info("ebook_ingest_path_start path=%s", path) if path.is_file() and path.suffix.lower() == ".epub": - count += int(ingest_file(session, path)) + count += int(ingest_file(session, path, config)) elif path.is_dir(): for epub_path in sorted(path.rglob("*.epub")): - count += int(ingest_file(session, epub_path)) + count += int(ingest_file(session, epub_path, config)) else: logger.warning("ebook_ingest_path_missing path=%s", path) logger.info("ebook_ingest_paths_complete changed_files=%s configured_paths=%s", count, len(config.library_paths)) return count -def ingest_file(session: Session, path: Path) -> bool: +def ingest_file(session: Session, path: Path, config: EbookSearchConfig) -> bool: """Ingest one EPUB file. Return True when the database changed.""" resolved_path = path.expanduser().resolve() logger.info("ebook_ingest_file_start path=%s", resolved_path) @@ -134,7 +134,7 @@ def ingest_file(session: Session, path: Path) -> bool: ) session.add(chapter) session.flush() - chunk_index = add_chapter_chunks(session, source, chapter, parsed_chapter, chunk_index) + chunk_index = add_chapter_chunks(session, source, chapter, parsed_chapter, chunk_index, config) session.flush() logger.info( @@ -160,10 +160,15 @@ def add_chapter_chunks( chapter: EbookChapter, parsed_chapter: ParsedChapter, chunk_index: int, + config: EbookSearchConfig, ) -> int: """Add chunk rows for one parsed chapter and return the next chunk index.""" page_label = parsed_chapter.page_labels[0] if parsed_chapter.page_labels else None - for text_chunk in chunk_text(parsed_chapter.text): + for text_chunk in chunk_text( + parsed_chapter.text, + chunk_tokens=config.chunk_tokens, + overlap_tokens=config.chunk_overlap, + ): session.add( EbookChunk( source_id=source.id, diff --git a/python/ebook_search/llm_interface.py b/python/ebook_search/llm_interface.py index 8cfa121..39f443e 100644 --- a/python/ebook_search/llm_interface.py +++ b/python/ebook_search/llm_interface.py @@ -29,7 +29,7 @@ def request_embeddings(texts: Sequence[str], config: EbookSearchConfig) -> list[ f"{config.embedding_base_url.rstrip('/')}/embeddings", headers=auth_headers(config.embedding_api_key), json={"model": config.embedding_model, "input": list(texts)}, - timeout=60, + timeout=config.embedding_timeout_seconds, ) response.raise_for_status() return embedding_vectors_from_response(response.json()) @@ -106,7 +106,7 @@ def request_chat_completion( "messages": list(messages), "temperature": 0, }, - timeout=60, + timeout=config.chat_timeout_seconds, ) response.raise_for_status() return chat_content_from_response(response.json()) diff --git a/python/ebook_search/rerank.py b/python/ebook_search/rerank.py index 5075601..05354be 100644 --- a/python/ebook_search/rerank.py +++ b/python/ebook_search/rerank.py @@ -13,8 +13,6 @@ if TYPE_CHECKING: from python.ebook_search.search import SearchResult logger = logging.getLogger(__name__) -RERANK_SCORE_WEIGHT = 0.7 -HYBRID_SCORE_WEIGHT = 0.3 @dataclass(frozen=True) @@ -41,7 +39,7 @@ def rerank_chunks(query: str, candidates: list[SearchResult], config: RerankConf ( replace( result, - score=final_rerank_score(result, scores[result.chunk_id].score, candidates), + score=final_rerank_score(result, scores[result.chunk_id].score, candidates, config), rerank_score=scores[result.chunk_id].score, ) for result in candidates @@ -110,9 +108,14 @@ def clamp_score(score: float) -> float: return min(max(score, 0.0), 1.0) -def final_rerank_score(result: SearchResult, rerank_score: float, candidates: list[SearchResult]) -> float: +def final_rerank_score( + result: SearchResult, + rerank_score: float, + candidates: list[SearchResult], + config: RerankConfig, +) -> float: """Combine rerank relevance with normalized hybrid retrieval evidence.""" - return (RERANK_SCORE_WEIGHT * rerank_score) + (HYBRID_SCORE_WEIGHT * normalized_hybrid_score(result, candidates)) + return (config.score_weight * rerank_score) + (config.hybrid_weight * normalized_hybrid_score(result, candidates)) def normalized_hybrid_score(result: SearchResult, candidates: list[SearchResult]) -> float: diff --git a/python/ebook_search/search.py b/python/ebook_search/search.py index 371387d..db7b1eb 100644 --- a/python/ebook_search/search.py +++ b/python/ebook_search/search.py @@ -35,7 +35,6 @@ if TYPE_CHECKING: from python.ebook_search.config import EbookSearchConfig logger = logging.getLogger(__name__) -BM25_CANDIDATE_LIMIT = 120 @dataclass(frozen=True) @@ -111,6 +110,7 @@ def search_ebooks( reciprocal_rank_fusion, retrieval.vector_results, retrieval.lexical_results, + rank_constant=config.rrf_rank_constant, ) timings.append(timing) if config.rerank.enabled and rerank: @@ -216,7 +216,7 @@ def vector_candidates(engine: Engine, query: str, config: EbookSearchConfig) -> raise ValueError(msg) embedding = embed_query(query, config) - limit = max(config.rerank.candidates, config.top_k) * 4 + limit = max(config.rerank.candidates, config.top_k) * config.vector_candidate_multiplier embedding_table = get_embedding_table(model.dimension) embedding_param = literal(embedding, type_=Vector(model.dimension)) @@ -263,7 +263,7 @@ def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult] logger.info("ebook_bm25_search_complete corpus=0 candidates=0") return [] - scored_records = score_bm25_corpus(query, corpus, limit=BM25_CANDIDATE_LIMIT) + scored_records = score_bm25_corpus(query, corpus, limit=config.bm25_candidate_limit) results = [ replace(search_result_from_row(record), score=score, vector_score=None, bm25_score=score) for record, score in scored_records diff --git a/tests/test_ebook_search_core.py b/tests/test_ebook_search_core.py index 08b1ed8..a66cef9 100644 --- a/tests/test_ebook_search_core.py +++ b/tests/test_ebook_search_core.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from dataclasses import replace from datetime import UTC, datetime from os import environ from pathlib import Path @@ -528,7 +527,7 @@ def test_chat_api_key_falls_back_to_ollama_api_key(monkeypatch) -> None: def test_answer_query_does_not_call_model_when_disabled() -> None: - config = replace(load_config(), answer_enabled=False) + config = load_config().model_copy(update={"answer_enabled": False}) result = SearchResult(chunk_id=1, text="source text", source_title="Book") answer = answer_query("question", [result], config) diff --git a/users/richie/home/gui/vscode/settings.json b/users/richie/home/gui/vscode/settings.json index a63f8af..a265db8 100644 --- a/users/richie/home/gui/vscode/settings.json +++ b/users/richie/home/gui/vscode/settings.json @@ -80,6 +80,7 @@ "fastapi", "Michal", "Nornsight", + "pydantic", "sandboxing", "syncthing", ],