From 07a9adfdd59c84ff41c80386ede0c6404557edb2 Mon Sep 17 00:00:00 2001
From: Richie Cahill
Date: Sat, 13 Jun 2026 20:14:20 -0400
Subject: [PATCH] added an index for the vector DB

---
 ...ook_embedding_cosine_index_c460105682d2.py | 54 +++++++++++++++++++
 python/orm/richie/ebook.py                    | 12 ++++-
 tests/test_ebook_search_core.py               | 18 ++++++-
 3 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 python/alembic/richie/versions/2026_06_13-add_1024_ebook_embedding_cosine_index_c460105682d2.py

diff --git a/python/alembic/richie/versions/2026_06_13-add_1024_ebook_embedding_cosine_index_c460105682d2.py b/python/alembic/richie/versions/2026_06_13-add_1024_ebook_embedding_cosine_index_c460105682d2.py
new file mode 100644
index 0000000..8aadfa3
--- /dev/null
+++ b/python/alembic/richie/versions/2026_06_13-add_1024_ebook_embedding_cosine_index_c460105682d2.py
@@ -0,0 +1,54 @@
+"""add 1024 ebook embedding cosine index.
+
+Revision ID: c460105682d2
+Revises: 2db132cace1a
+Create Date: 2026-06-13 19:53:45.680289
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from alembic import op
+
+from python.orm import RichieBase
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+# revision identifiers, used by Alembic.
+revision: str = "c460105682d2"
+down_revision: str | None = "2db132cace1a"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+schema = RichieBase.schema_name
+
+
+def upgrade() -> None:
+    """Create the HNSW cosine index on ebook_chunk_embedding_1024.embedding."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_index(
+        "ix_ebook_chunk_embedding_1024_embedding_cosine",
+        "ebook_chunk_embedding_1024",
+        ["embedding"],
+        unique=False,
+        schema=schema,
+        postgresql_using="hnsw",
+        postgresql_ops={"embedding": "vector_cosine_ops"},  # pgvector cosine-distance operator class
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Drop the HNSW cosine index from ebook_chunk_embedding_1024."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(
+        "ix_ebook_chunk_embedding_1024_embedding_cosine",
+        table_name="ebook_chunk_embedding_1024",
+        schema=schema,
+        postgresql_using="hnsw",
+        postgresql_ops={"embedding": "vector_cosine_ops"},
+    )
+    # ### end Alembic commands ###
diff --git a/python/orm/richie/ebook.py b/python/orm/richie/ebook.py
index 9c1e4ad..8e32409 100644
--- a/python/orm/richie/ebook.py
+++ b/python/orm/richie/ebook.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 from datetime import datetime
 
 from pgvector.sqlalchemy import Vector
-from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
+from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Index, String, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 
 from python.orm.richie.base import TableBase, TableBaseBig
@@ -101,7 +101,15 @@ class EbookChunkEmbedding1024(TableBaseBig):
     """1024-dimensional chunk embedding."""
 
     __tablename__ = "ebook_chunk_embedding_1024"
-    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
+    __table_args__ = (
+        UniqueConstraint("chunk_id", "model_id"),
+        Index(  # same name and options as Alembic revision c460105682d2
+            "ix_ebook_chunk_embedding_1024_embedding_cosine",
+            "embedding",
+            postgresql_using="hnsw",
+            postgresql_ops={"embedding": "vector_cosine_ops"},
+        ),
+    )
 
     chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
     model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
diff --git a/tests/test_ebook_search_core.py b/tests/test_ebook_search_core.py
index 76345ec..08b1ed8 100644
--- a/tests/test_ebook_search_core.py
+++ b/tests/test_ebook_search_core.py
@@ -38,7 +38,14 @@ from python.ebook_search.search import (
     search_ebooks,
 )
 from python.ebook_search.timing import RuntimeStep
-from python.orm.richie import EbookChapter, EbookChunk, EbookEmbeddingModel, EbookSource, RichieBase
+from python.orm.richie import (
+    EbookChapter,
+    EbookChunk,
+    EbookChunkEmbedding1024,
+    EbookEmbeddingModel,
+    EbookSource,
+    RichieBase,
+)
 
 
 def test_chunk_text_uses_overlap() -> None:
@@ -464,6 +471,15 @@ def test_ensure_embedding_models_registers_service_names() -> None:
     ]
 
 
+def test_1024_embedding_table_has_cosine_hnsw_index() -> None:
+    indexes = {index.name: index for index in EbookChunkEmbedding1024.__table__.indexes}
+    index = indexes["ix_ebook_chunk_embedding_1024_embedding_cosine"]
+
+    assert [column.name for column in index.columns] == ["embedding"]
+    assert index.dialect_options["postgresql"]["using"] == "hnsw"
+    assert index.dialect_options["postgresql"]["ops"] == {"embedding": "vector_cosine_ops"}
+
+
 def test_embedding_model_aliases_normalize_to_provider_names() -> None:
     assert normalize_embedding_model() == "qwen3-embedding-0.6b"