added ebook embedding to orm
This commit is contained in:
@@ -0,0 +1,200 @@
|
|||||||
|
"""add ebook search tables.
|
||||||
|
|
||||||
|
Revision ID: 2db132cace1a
|
||||||
|
Revises: b3c60cc5beb5
|
||||||
|
Create Date: 2026-06-10 22:10:54.379159
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import pgvector
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
from python.orm import RichieBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "2db132cace1a"
|
||||||
|
down_revision: str | None = "b3c60cc5beb5"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
# Schema that every table in this migration is created in and dropped from; it is also
# used to qualify the foreign-key targets below.
schema = RichieBase.schema_name
|
||||||
|
|
||||||
|
|
||||||
|
# Vector sizes of the per-dimension embedding tables (``ebook_chunk_embedding_<dim>``).
_EMBEDDING_DIMENSIONS = (1024, 2560, 4096)


def _base_columns(id_type: sa.types.TypeEngine) -> list[sa.Column]:
    """Return fresh ``id``/``created``/``updated`` columns for one table.

    A new list is built on every call because a ``Column`` object can only belong to one table.

    Args:
        id_type: Column type of the primary key (``sa.Integer()`` or ``sa.BigInteger()``).
    """
    return [
        sa.Column("id", id_type, nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
    ]


def upgrade() -> None:
    """Create the ebook search tables.

    Tables are created parents-first so every foreign-key target already exists:
    ``ebook_embedding_model`` and ``ebook_source``, then ``ebook_chapter``, then ``ebook_chunk``,
    and finally one ``ebook_chunk_embedding_<dim>`` table per entry in ``_EMBEDDING_DIMENSIONS``.
    """
    # The embedding tables use pgvector's ``vector`` column type, which only exists once the
    # extension is enabled in the target database. ``IF NOT EXISTS`` makes this a no-op when it
    # already is. ``downgrade()`` deliberately leaves the extension in place.
    op.execute("CREATE EXTENSION IF NOT EXISTS vector")

    op.create_table(
        "ebook_embedding_model",
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("dimension", sa.Integer(), nullable=False),
        sa.Column("is_default", sa.Boolean(), nullable=False),
        *_base_columns(sa.Integer()),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_embedding_model")),
        sa.UniqueConstraint("name", name=op.f("uq_ebook_embedding_model_name")),
        schema=schema,
    )

    op.create_table(
        "ebook_source",
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("author", sa.String(), nullable=True),
        sa.Column("language", sa.String(), nullable=True),
        sa.Column("publisher", sa.String(), nullable=True),
        sa.Column("identifier", sa.String(), nullable=True),
        sa.Column("file_path", sa.String(), nullable=False),
        sa.Column("file_sha256", sa.String(length=64), nullable=False),
        sa.Column("file_mtime", sa.DateTime(timezone=True), nullable=False),
        sa.Column("file_size", sa.BigInteger(), nullable=False),
        *_base_columns(sa.Integer()),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_source")),
        sa.UniqueConstraint("file_path", name=op.f("uq_ebook_source_file_path")),
        sa.UniqueConstraint("file_sha256", name=op.f("uq_ebook_source_file_sha256")),
        schema=schema,
    )

    op.create_table(
        "ebook_chapter",
        sa.Column("source_id", sa.Integer(), nullable=False),
        sa.Column("spine_index", sa.Integer(), nullable=False),
        sa.Column("title", sa.String(), nullable=True),
        sa.Column("href", sa.String(), nullable=True),
        *_base_columns(sa.Integer()),
        sa.ForeignKeyConstraint(
            ["source_id"],
            [f"{schema}.ebook_source.id"],
            name=op.f("fk_ebook_chapter_source_id_ebook_source"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chapter")),
        sa.UniqueConstraint("source_id", "spine_index", name=op.f("uq_ebook_chapter_source_id")),
        schema=schema,
    )

    op.create_table(
        "ebook_chunk",
        sa.Column("source_id", sa.Integer(), nullable=False),
        sa.Column("chapter_id", sa.Integer(), nullable=True),
        sa.Column("chunk_index", sa.Integer(), nullable=False),
        sa.Column("text", sa.String(), nullable=False),
        sa.Column("token_start", sa.Integer(), nullable=False),
        sa.Column("token_count", sa.Integer(), nullable=False),
        sa.Column("page_label", sa.String(), nullable=True),
        sa.Column("content_sha256", sa.String(length=64), nullable=False),
        sa.Column("search_text", sa.String(), nullable=False),
        *_base_columns(sa.BigInteger()),
        # A chunk survives the removal of its chapter (the link is cleared) ...
        sa.ForeignKeyConstraint(
            ["chapter_id"],
            [f"{schema}.ebook_chapter.id"],
            name=op.f("fk_ebook_chunk_chapter_id_ebook_chapter"),
            ondelete="SET NULL",
        ),
        # ... but is removed together with its source.
        sa.ForeignKeyConstraint(
            ["source_id"],
            [f"{schema}.ebook_source.id"],
            name=op.f("fk_ebook_chunk_source_id_ebook_source"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk")),
        sa.UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
        sa.UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
        schema=schema,
    )

    # The embedding tables are identical apart from the vector dimension baked into the table
    # name, the constraint names and the column type.
    for dim in _EMBEDDING_DIMENSIONS:
        table = f"ebook_chunk_embedding_{dim}"
        op.create_table(
            table,
            sa.Column("chunk_id", sa.BigInteger(), nullable=False),
            sa.Column("model_id", sa.Integer(), nullable=False),
            # NOTE(review): a bare ``import pgvector`` does not itself load ``pgvector.sqlalchemy``;
            # this attribute access appears to rely on ``python.orm`` having imported it - confirm.
            sa.Column("embedding", pgvector.sqlalchemy.vector.VECTOR(dim=dim), nullable=False),
            *_base_columns(sa.BigInteger()),
            sa.ForeignKeyConstraint(
                ["chunk_id"],
                [f"{schema}.ebook_chunk.id"],
                name=op.f(f"fk_{table}_chunk_id_ebook_chunk"),
                ondelete="CASCADE",
            ),
            sa.ForeignKeyConstraint(
                ["model_id"],
                [f"{schema}.ebook_embedding_model.id"],
                name=op.f(f"fk_{table}_model_id_ebook_embedding_model"),
                ondelete="CASCADE",
            ),
            sa.PrimaryKeyConstraint("id", name=op.f(f"pk_{table}")),
            sa.UniqueConstraint("chunk_id", "model_id", name=op.f(f"uq_{table}_chunk_id")),
            schema=schema,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop every table created by ``upgrade()``.

    Tables are dropped children-first (embeddings, chunks, chapters, then the two root tables)
    so no table is removed while another still holds a foreign key to it.
    """
    drop_order = (
        "ebook_chunk_embedding_4096",
        "ebook_chunk_embedding_2560",
        "ebook_chunk_embedding_1024",
        "ebook_chunk",
        "ebook_chapter",
        "ebook_source",
        "ebook_embedding_model",
    )
    for table_name in drop_order:
        op.drop_table(table_name, schema=schema)
|
||||||
@@ -11,6 +11,15 @@ from python.orm.richie.contact import (
|
|||||||
Need,
|
Need,
|
||||||
RelationshipType,
|
RelationshipType,
|
||||||
)
|
)
|
||||||
|
from python.orm.richie.ebook import (
|
||||||
|
EbookChapter,
|
||||||
|
EbookChunk,
|
||||||
|
EbookChunkEmbedding1024,
|
||||||
|
EbookChunkEmbedding2560,
|
||||||
|
EbookChunkEmbedding4096,
|
||||||
|
EbookEmbeddingModel,
|
||||||
|
EbookSource,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Audiobook",
|
"Audiobook",
|
||||||
@@ -19,6 +28,13 @@ __all__ = [
|
|||||||
"Contact",
|
"Contact",
|
||||||
"ContactNeed",
|
"ContactNeed",
|
||||||
"ContactRelationship",
|
"ContactRelationship",
|
||||||
|
"EbookChapter",
|
||||||
|
"EbookChunk",
|
||||||
|
"EbookChunkEmbedding1024",
|
||||||
|
"EbookChunkEmbedding2560",
|
||||||
|
"EbookChunkEmbedding4096",
|
||||||
|
"EbookEmbeddingModel",
|
||||||
|
"EbookSource",
|
||||||
"Need",
|
"Need",
|
||||||
"RelationshipType",
|
"RelationshipType",
|
||||||
"RichieBase",
|
"RichieBase",
|
||||||
|
|||||||
@@ -0,0 +1,130 @@
|
|||||||
|
"""EPUB search models."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from pgvector.sqlalchemy import Vector
|
||||||
|
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from python.orm.richie.base import TableBase, TableBaseBig
|
||||||
|
|
||||||
|
|
||||||
|
class EbookSource(TableBase):
    """A single EPUB file that has been indexed for search.

    ``file_path`` and ``file_sha256`` are each unique on their own, so neither the same path nor
    the same hash can be stored twice. ``file_mtime`` and ``file_size`` are recorded alongside
    them - presumably for change detection; confirm against the indexer.
    """

    __tablename__ = "ebook_source"
    __table_args__ = (UniqueConstraint("file_path"), UniqueConstraint("file_sha256"))

    # Book metadata; only the title is required.
    title: Mapped[str]
    author: Mapped[str | None]
    language: Mapped[str | None]
    publisher: Mapped[str | None]
    identifier: Mapped[str | None]

    # The file this row was built from.
    file_path: Mapped[str]
    # 64 characters wide - presumably a hex-encoded SHA-256 digest.
    file_sha256: Mapped[str] = mapped_column(String(64))
    file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    file_size: Mapped[int] = mapped_column(BigInteger)

    # Chapters and chunks are owned by the source. ``passive_deletes=True`` leaves the removal of
    # children that are not loaded to the ``ON DELETE CASCADE`` foreign keys on their side.
    chapters: Mapped[list[EbookChapter]] = relationship(
        "EbookChapter", back_populates="source", cascade="all, delete-orphan", passive_deletes=True
    )
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk", back_populates="source", cascade="all, delete-orphan", passive_deletes=True
    )
|
||||||
|
|
||||||
|
|
||||||
|
class EbookChapter(TableBase):
    """A chapter or spine document inside an EPUB.

    Rows are unique per ``(source_id, spine_index)``. The foreign key to ``ebook_source`` is
    ``ON DELETE CASCADE``, so chapters are removed together with their source.
    """

    __tablename__ = "ebook_chapter"
    __table_args__ = (UniqueConstraint("source_id", "spine_index"),)

    source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
    # Presumably the document's position in the EPUB spine - confirm against the indexer.
    spine_index: Mapped[int]
    title: Mapped[str | None]
    href: Mapped[str | None]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")
    # Deliberately no ``delete``/``delete-orphan`` cascade: ``EbookChunk.chapter_id`` is nullable
    # and declared ``ON DELETE SET NULL``, so a chunk outlives its chapter and stays owned by its
    # source. A delete cascade here would make the ORM delete the chunks that happen to be loaded
    # while the database only clears the link on the rest. ``passive_deletes=True`` lets the
    # database clear ``chapter_id`` on chunks that are not loaded instead of fetching them first.
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        back_populates="chapter",
        passive_deletes=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class EbookChunk(TableBaseBig):
    """One searchable slice of text taken from an indexed EPUB.

    A chunk always belongs to a source and may additionally point at the chapter it came from.
    Within one source, both ``chunk_index`` and ``content_sha256`` must be unique.
    """

    __tablename__ = "ebook_chunk"
    # Both constraints are named explicitly - presumably because they share the leading column
    # ``source_id`` and would otherwise receive the same generated name; confirm against the
    # project's naming convention.
    __table_args__ = (
        UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
        UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
    )

    # Deleting the source deletes the chunk; deleting the chapter only clears ``chapter_id``.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    chapter_id: Mapped[int | None] = mapped_column(
        ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"),
    )

    chunk_index: Mapped[int]
    text: Mapped[str]
    token_start: Mapped[int]
    token_count: Mapped[int]
    page_label: Mapped[str | None]
    # 64 characters wide - presumably a hex-encoded SHA-256 of the chunk content.
    # NOTE(review): if so, two identical chunks inside one source violate the second unique
    # constraint above - confirm that is intended.
    content_sha256: Mapped[str] = mapped_column(String(64))
    search_text: Mapped[str]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chunks")
    chapter: Mapped[EbookChapter | None] = relationship("EbookChapter", back_populates="chunks")
|
||||||
|
|
||||||
|
|
||||||
|
class EbookEmbeddingModel(TableBase):
    """An embedding model that chunk embeddings can be attributed to.

    ``name`` is unique. ``dimension`` presumably selects which ``ebook_chunk_embedding_<dim>``
    table holds this model's vectors - confirm against the code that writes embeddings.
    """

    __tablename__ = "ebook_embedding_model"

    name: Mapped[str] = mapped_column(
        String,
        unique=True,
    )
    dimension: Mapped[int]
    # Python-side default only: rows inserted through the ORM get ``False`` when unset.
    is_default: Mapped[bool] = mapped_column(
        Boolean,
        default=False,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class EbookChunkEmbedding1024(TableBaseBig):
    """Embedding of one chunk as a 1024-dimensional vector.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys are
    ``ON DELETE CASCADE``, so the row disappears with either its chunk or its model.
    """

    __tablename__ = "ebook_chunk_embedding_1024"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Stored in a pgvector ``vector(1024)`` column.
    embedding: Mapped[list[float]] = mapped_column(Vector(1024))
|
||||||
|
|
||||||
|
|
||||||
|
class EbookChunkEmbedding2560(TableBaseBig):
    """Embedding of one chunk as a 2560-dimensional vector.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys are
    ``ON DELETE CASCADE``, so the row disappears with either its chunk or its model.
    """

    __tablename__ = "ebook_chunk_embedding_2560"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Stored in a pgvector ``vector(2560)`` column.
    # NOTE(review): pgvector documents a 2,000-dimension limit for HNSW/IVFFlat indexes on the
    # ``vector`` type - confirm searches on this table do not rely on such an index.
    embedding: Mapped[list[float]] = mapped_column(Vector(2560))
|
||||||
|
|
||||||
|
|
||||||
|
class EbookChunkEmbedding4096(TableBaseBig):
    """Embedding of one chunk as a 4096-dimensional vector.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys are
    ``ON DELETE CASCADE``, so the row disappears with either its chunk or its model.
    """

    __tablename__ = "ebook_chunk_embedding_4096"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Stored in a pgvector ``vector(4096)`` column.
    # NOTE(review): pgvector documents a 2,000-dimension limit for HNSW/IVFFlat indexes on the
    # ``vector`` type - confirm searches on this table do not rely on such an index.
    embedding: Mapped[list[float]] = mapped_column(Vector(4096))
|
||||||
Reference in New Issue
Block a user