book search engine #18

Open
Richie wants to merge 29 commits from feature/book-search-engine into main
3 changed files with 81 additions and 3 deletions
Showing only changes of commit 5e2252641d - Show all commits
@@ -0,0 +1,54 @@
"""add 1024 ebook embedding cosine index.
Revision ID: c460105682d2
Revises: 2db132cace1a
Create Date: 2026-06-13 19:53:45.680289
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "c460105682d2"
# Parent revision; matches the "Revises" entry in the module docstring.
down_revision: str | None = "2db132cace1a"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Schema name read from the ORM base; passed as ``schema=`` to the index operations in this migration.
schema = RichieBase.schema_name
def upgrade() -> None:
    """Create the HNSW cosine index on ``ebook_chunk_embedding_1024.embedding``."""
    index_name = "ix_ebook_chunk_embedding_1024_embedding_cosine"
    table_name = "ebook_chunk_embedding_1024"
    indexed_columns = ["embedding"]

    # PostgreSQL-specific options: "hnsw" access method, with the
    # "vector_cosine_ops" operator class applied to the embedding column.
    op.create_index(
        index_name,
        table_name,
        indexed_columns,
        unique=False,
        schema=schema,
        postgresql_using="hnsw",
        postgresql_ops={"embedding": "vector_cosine_ops"},
    )
def downgrade() -> None:
    """Drop the ``ix_ebook_chunk_embedding_1024_embedding_cosine`` index."""
    index_name = "ix_ebook_chunk_embedding_1024_embedding_cosine"
    # Same PostgreSQL dialect options the index was declared with.
    pg_index_options = {
        "postgresql_using": "hnsw",
        "postgresql_ops": {"embedding": "vector_cosine_ops"},
    }

    op.drop_index(
        index_name,
        table_name="ebook_chunk_embedding_1024",
        schema=schema,
        **pg_index_options,
    )
+10 -2
View File
@@ -5,7 +5,7 @@ from __future__ import annotations
from datetime import datetime from datetime import datetime
from pgvector.sqlalchemy import Vector from pgvector.sqlalchemy import Vector
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Index, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import TableBase, TableBaseBig from python.orm.richie.base import TableBase, TableBaseBig
@@ -101,7 +101,15 @@ class EbookChunkEmbedding1024(TableBaseBig):
"""1024-dimensional chunk embedding.""" """1024-dimensional chunk embedding."""
__tablename__ = "ebook_chunk_embedding_1024" __tablename__ = "ebook_chunk_embedding_1024"
__table_args__ = (UniqueConstraint("chunk_id", "model_id"),) __table_args__ = (
UniqueConstraint("chunk_id", "model_id"),
Index(
"ix_ebook_chunk_embedding_1024_embedding_cosine",
"embedding",
postgresql_using="hnsw",
postgresql_ops={"embedding": "vector_cosine_ops"},
),
)
chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE")) chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE")) model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
+17 -1
View File
@@ -38,7 +38,14 @@ from python.ebook_search.search import (
search_ebooks, search_ebooks,
) )
from python.ebook_search.timing import RuntimeStep from python.ebook_search.timing import RuntimeStep
from python.orm.richie import EbookChapter, EbookChunk, EbookEmbeddingModel, EbookSource, RichieBase from python.orm.richie import (
EbookChapter,
EbookChunk,
EbookChunkEmbedding1024,
EbookEmbeddingModel,
EbookSource,
RichieBase,
)
def test_chunk_text_uses_overlap() -> None: def test_chunk_text_uses_overlap() -> None:
@@ -464,6 +471,15 @@ def test_ensure_embedding_models_registers_service_names() -> None:
] ]
def test_1024_embedding_table_has_cosine_hnsw_index() -> None:
    """Check the ORM table declares the HNSW cosine index on ``embedding``."""
    indexes_by_name = {}
    for table_index in EbookChunkEmbedding1024.__table__.indexes:
        indexes_by_name[table_index.name] = table_index

    # A missing index surfaces as a KeyError on this lookup.
    cosine_index = indexes_by_name["ix_ebook_chunk_embedding_1024_embedding_cosine"]
    column_names = [col.name for col in cosine_index.columns]
    assert column_names == ["embedding"]

    assert cosine_index.dialect_options["postgresql"]["using"] == "hnsw"
    assert cosine_index.dialect_options["postgresql"]["ops"] == {"embedding": "vector_cosine_ops"}
def test_embedding_model_aliases_normalize_to_provider_names() -> None: def test_embedding_model_aliases_normalize_to_provider_names() -> None:
assert normalize_embedding_model() == "qwen3-embedding-0.6b" assert normalize_embedding_model() == "qwen3-embedding-0.6b"