diff --git a/python/alembic/richie/versions/2026_06_10-add_ebook_search_tables_2db132cace1a.py b/python/alembic/richie/versions/2026_06_10-add_ebook_search_tables_2db132cace1a.py
new file mode 100644
index 0000000..f400d75
--- /dev/null
+++ b/python/alembic/richie/versions/2026_06_10-add_ebook_search_tables_2db132cace1a.py
@@ -0,0 +1,201 @@
+"""add ebook search tables.
+
+Revision ID: 2db132cace1a
+Revises: b3c60cc5beb5
+Create Date: 2026-06-10 22:10:54.379159
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import sqlalchemy as sa
+from alembic import op
+from pgvector.sqlalchemy import Vector
+
+from python.orm import RichieBase
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+# revision identifiers, used by Alembic.
+revision: str = "2db132cace1a"
+down_revision: str | None = "b3c60cc5beb5"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+schema = RichieBase.schema_name
+
+
+def upgrade() -> None:
+    """Create the ebook search tables, referenced tables before their dependents."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "ebook_embedding_model",
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("dimension", sa.Integer(), nullable=False),
+        sa.Column("is_default", sa.Boolean(), nullable=False),
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_embedding_model")),
+        sa.UniqueConstraint("name", name=op.f("uq_ebook_embedding_model_name")),
+        schema=schema,
+    )
+    op.create_table(
+        "ebook_source",
+        sa.Column("title", sa.String(), nullable=False),
+        sa.Column("author", sa.String(), nullable=True),
+        sa.Column("language", sa.String(), nullable=True),
+        sa.Column("publisher", sa.String(), nullable=True),
+        sa.Column("identifier", sa.String(), nullable=True),
+        sa.Column("file_path", sa.String(), nullable=False),
+        sa.Column("file_sha256", sa.String(length=64), nullable=False),
+        sa.Column("file_mtime", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("file_size", sa.BigInteger(), nullable=False),
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_source")),
+        sa.UniqueConstraint("file_path", name=op.f("uq_ebook_source_file_path")),
+        sa.UniqueConstraint("file_sha256", name=op.f("uq_ebook_source_file_sha256")),
+        schema=schema,
+    )
+    op.create_table(
+        "ebook_chapter",
+        sa.Column("source_id", sa.Integer(), nullable=False),
+        sa.Column("spine_index", sa.Integer(), nullable=False),
+        sa.Column("title", sa.String(), nullable=True),
+        sa.Column("href", sa.String(), nullable=True),
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["source_id"],
+            [f"{schema}.ebook_source.id"],
+            name=op.f("fk_ebook_chapter_source_id_ebook_source"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chapter")),
+        sa.UniqueConstraint("source_id", "spine_index", name=op.f("uq_ebook_chapter_source_id")),
+        schema=schema,
+    )
+    op.create_table(
+        "ebook_chunk",
+        sa.Column("source_id", sa.Integer(), nullable=False),
+        sa.Column("chapter_id", sa.Integer(), nullable=True),
+        sa.Column("chunk_index", sa.Integer(), nullable=False),
+        sa.Column("text", sa.String(), nullable=False),
+        sa.Column("token_start", sa.Integer(), nullable=False),
+        sa.Column("token_count", sa.Integer(), nullable=False),
+        sa.Column("page_label", sa.String(), nullable=True),
+        sa.Column("content_sha256", sa.String(length=64), nullable=False),
+        sa.Column("search_text", sa.String(), nullable=False),
+        sa.Column("id", sa.BigInteger(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["chapter_id"],
+            [f"{schema}.ebook_chapter.id"],
+            name=op.f("fk_ebook_chunk_chapter_id_ebook_chapter"),
+            ondelete="SET NULL",
+        ),
+        sa.ForeignKeyConstraint(
+            ["source_id"],
+            [f"{schema}.ebook_source.id"],
+            name=op.f("fk_ebook_chunk_source_id_ebook_source"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk")),
+        sa.UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
+        sa.UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
+        schema=schema,
+    )
+    # One embedding table per vector width: each Vector column is declared with a fixed dimension.
+    op.create_table(
+        "ebook_chunk_embedding_1024",
+        sa.Column("chunk_id", sa.BigInteger(), nullable=False),
+        sa.Column("model_id", sa.Integer(), nullable=False),
+        sa.Column("embedding", Vector(1024), nullable=False),
+        sa.Column("id", sa.BigInteger(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["chunk_id"],
+            [f"{schema}.ebook_chunk.id"],
+            name=op.f("fk_ebook_chunk_embedding_1024_chunk_id_ebook_chunk"),
+            ondelete="CASCADE",
+        ),
+        sa.ForeignKeyConstraint(
+            ["model_id"],
+            [f"{schema}.ebook_embedding_model.id"],
+            name=op.f("fk_ebook_chunk_embedding_1024_model_id_ebook_embedding_model"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_1024")),
+        sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_1024_chunk_id")),
+        schema=schema,
+    )
+    op.create_table(
+        "ebook_chunk_embedding_2560",
+        sa.Column("chunk_id", sa.BigInteger(), nullable=False),
+        sa.Column("model_id", sa.Integer(), nullable=False),
+        sa.Column("embedding", Vector(2560), nullable=False),
+        sa.Column("id", sa.BigInteger(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["chunk_id"],
+            [f"{schema}.ebook_chunk.id"],
+            name=op.f("fk_ebook_chunk_embedding_2560_chunk_id_ebook_chunk"),
+            ondelete="CASCADE",
+        ),
+        sa.ForeignKeyConstraint(
+            ["model_id"],
+            [f"{schema}.ebook_embedding_model.id"],
+            name=op.f("fk_ebook_chunk_embedding_2560_model_id_ebook_embedding_model"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_2560")),
+        sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_2560_chunk_id")),
+        schema=schema,
+    )
+    op.create_table(
+        "ebook_chunk_embedding_4096",
+        sa.Column("chunk_id", sa.BigInteger(), nullable=False),
+        sa.Column("model_id", sa.Integer(), nullable=False),
+        sa.Column("embedding", Vector(4096), nullable=False),
+        sa.Column("id", sa.BigInteger(), nullable=False),
+        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["chunk_id"],
+            [f"{schema}.ebook_chunk.id"],
+            name=op.f("fk_ebook_chunk_embedding_4096_chunk_id_ebook_chunk"),
+            ondelete="CASCADE",
+        ),
+        sa.ForeignKeyConstraint(
+            ["model_id"],
+            [f"{schema}.ebook_embedding_model.id"],
+            name=op.f("fk_ebook_chunk_embedding_4096_model_id_ebook_embedding_model"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_4096")),
+        sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_4096_chunk_id")),
+        schema=schema,
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Drop the ebook search tables, dependent tables before the tables they reference."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("ebook_chunk_embedding_4096", schema=schema)
+    op.drop_table("ebook_chunk_embedding_2560", schema=schema)
+    op.drop_table("ebook_chunk_embedding_1024", schema=schema)
+    op.drop_table("ebook_chunk", schema=schema)
+    op.drop_table("ebook_chapter", schema=schema)
+    op.drop_table("ebook_source", schema=schema)
+    op.drop_table("ebook_embedding_model", schema=schema)
+    # ### end Alembic commands ###
diff --git a/python/orm/richie/__init__.py b/python/orm/richie/__init__.py
index 47f601f..a28ce7a 100644
--- a/python/orm/richie/__init__.py
+++ b/python/orm/richie/__init__.py
@@ -11,6 +11,15 @@ from python.orm.richie.contact import (
     Need,
     RelationshipType,
 )
+from python.orm.richie.ebook import (
+    EbookChapter,
+    EbookChunk,
+    EbookChunkEmbedding1024,
+    EbookChunkEmbedding2560,
+    EbookChunkEmbedding4096,
+    EbookEmbeddingModel,
+    EbookSource,
+)
 
 __all__ = [
     "Audiobook",
@@ -19,6 +28,13 @@ __all__ = [
     "Contact",
     "ContactNeed",
     "ContactRelationship",
+    "EbookChapter",
+    "EbookChunk",
+    "EbookChunkEmbedding1024",
+    "EbookChunkEmbedding2560",
+    "EbookChunkEmbedding4096",
+    "EbookEmbeddingModel",
+    "EbookSource",
     "Need",
     "RelationshipType",
     "RichieBase",
diff --git a/python/orm/richie/ebook.py b/python/orm/richie/ebook.py
new file mode 100644
index 0000000..9c1e4ad
--- /dev/null
+++ b/python/orm/richie/ebook.py
@@ -0,0 +1,131 @@
+"""EPUB search models."""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+from pgvector.sqlalchemy import Vector
+from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from python.orm.richie.base import TableBase, TableBaseBig
+
+
+class EbookSource(TableBase):
+    """One indexed EPUB file."""
+
+    __tablename__ = "ebook_source"
+    __table_args__ = (
+        UniqueConstraint("file_path"),
+        UniqueConstraint("file_sha256"),
+    )
+
+    title: Mapped[str]
+    author: Mapped[str | None]
+    language: Mapped[str | None]
+    publisher: Mapped[str | None]
+    identifier: Mapped[str | None]
+    file_path: Mapped[str]
+    file_sha256: Mapped[str] = mapped_column(String(64))
+    file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
+    file_size: Mapped[int] = mapped_column(BigInteger)
+
+    chapters: Mapped[list[EbookChapter]] = relationship(
+        "EbookChapter",
+        back_populates="source",
+        cascade="all, delete-orphan",
+        passive_deletes=True,
+    )
+    chunks: Mapped[list[EbookChunk]] = relationship(
+        "EbookChunk",
+        back_populates="source",
+        cascade="all, delete-orphan",
+        passive_deletes=True,
+    )
+
+
+class EbookChapter(TableBase):
+    """A chapter or spine document inside an EPUB."""
+
+    __tablename__ = "ebook_chapter"
+    __table_args__ = (UniqueConstraint("source_id", "spine_index"),)
+
+    source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
+    spine_index: Mapped[int]
+    title: Mapped[str | None]
+    href: Mapped[str | None]
+
+    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")
+    # No delete cascade: ebook_chunk.chapter_id is ON DELETE SET NULL, so removing a chapter
+    # detaches its chunks instead of deleting them; chunks are deleted along with their source.
+    chunks: Mapped[list[EbookChunk]] = relationship(
+        "EbookChunk",
+        back_populates="chapter",
+        passive_deletes=True,
+    )
+
+
+class EbookChunk(TableBaseBig):
+    """A searchable text chunk."""
+
+    __tablename__ = "ebook_chunk"
+    __table_args__ = (
+        UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
+        UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
+    )
+
+    source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
+    chapter_id: Mapped[int | None] = mapped_column(ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"))
+    chunk_index: Mapped[int]
+    text: Mapped[str]
+    token_start: Mapped[int]
+    token_count: Mapped[int]
+    page_label: Mapped[str | None]
+    content_sha256: Mapped[str] = mapped_column(String(64))
+    search_text: Mapped[str]
+
+    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chunks")
+    chapter: Mapped[EbookChapter | None] = relationship("EbookChapter", back_populates="chunks")
+
+
+class EbookEmbeddingModel(TableBase):
+    """A supported embedding model."""
+
+    __tablename__ = "ebook_embedding_model"
+
+    name: Mapped[str] = mapped_column(String, unique=True)
+    dimension: Mapped[int]
+    is_default: Mapped[bool] = mapped_column(Boolean, default=False)
+
+
+class EbookChunkEmbedding1024(TableBaseBig):
+    """1024-dimensional chunk embedding."""
+
+    __tablename__ = "ebook_chunk_embedding_1024"
+    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
+
+    chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
+    model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
+    embedding: Mapped[list[float]] = mapped_column(Vector(1024))
+
+
+class EbookChunkEmbedding2560(TableBaseBig):
+    """2560-dimensional chunk embedding."""
+
+    __tablename__ = "ebook_chunk_embedding_2560"
+    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
+
+    chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
+    model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
+    embedding: Mapped[list[float]] = mapped_column(Vector(2560))
+
+
+class EbookChunkEmbedding4096(TableBaseBig):
+    """4096-dimensional chunk embedding."""
+
+    __tablename__ = "ebook_chunk_embedding_4096"
+    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
+
+    chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
+    model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
+    embedding: Mapped[list[float]] = mapped_column(Vector(4096))