Files
dotfiles/python/orm/richie/ebook.py
T
2026-06-13 22:17:46 -04:00

131 lines
4.5 KiB
Python

"""EPUB search models."""
from __future__ import annotations
from datetime import datetime
from pgvector.sqlalchemy import Vector
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import TableBase, TableBaseBig
class EbookSource(TableBase):
    """One indexed EPUB file.

    A row is unique by ``file_path`` and, separately, by ``file_sha256``.
    The source owns its ``EbookChapter`` and ``EbookChunk`` rows: both
    collections cascade saves and deletes and treat detached children as
    orphans.
    """

    __tablename__ = "ebook_source"
    __table_args__ = (UniqueConstraint("file_path"), UniqueConstraint("file_sha256"))

    # Descriptive fields; only ``title`` is non-optional.
    title: Mapped[str]
    author: Mapped[str | None]
    language: Mapped[str | None]
    publisher: Mapped[str | None]
    identifier: Mapped[str | None]

    # Facts about the file itself.
    file_path: Mapped[str]
    # Fixed 64-character column; presumably a hex SHA-256 digest -- confirm with the writer.
    file_sha256: Mapped[str] = mapped_column(String(64))
    file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    file_size: Mapped[int] = mapped_column(BigInteger)

    # passive_deletes=True: children that are not loaded are left to the
    # ON DELETE CASCADE foreign keys declared on the child tables.
    chapters: Mapped[list[EbookChapter]] = relationship(
        "EbookChapter",
        passive_deletes=True,
        cascade="all, delete-orphan",
        back_populates="source",
    )
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        passive_deletes=True,
        cascade="all, delete-orphan",
        back_populates="source",
    )
class EbookChapter(TableBase):
    """A chapter or spine document inside an EPUB.

    Each chapter belongs to exactly one ``EbookSource`` and is unique per
    ``(source_id, spine_index)``. Chunks may reference a chapter, but they
    are owned by the source, not by the chapter.
    """

    __tablename__ = "ebook_chapter"
    __table_args__ = (UniqueConstraint("source_id", "spine_index"),)

    # Deleting the source row removes its chapters at the database level.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    spine_index: Mapped[int]
    title: Mapped[str | None]
    href: Mapped[str | None]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")

    # No delete / delete-orphan cascade here: ``EbookChunk.chapter_id`` is
    # nullable with ON DELETE SET NULL, so a chunk must survive losing its
    # chapter. With a delete cascade the ORM would delete chunks that happen
    # to be loaded while the database only nulls the rest, and detaching a
    # chunk from a chapter would delete it. passive_deletes=True leaves
    # unloaded chunks to the database's SET NULL.
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        back_populates="chapter",
        passive_deletes=True,
    )
class EbookChunk(TableBaseBig):
    """A searchable text chunk.

    Within one source a chunk is unique by ``chunk_index`` and, separately,
    by ``content_sha256``. The link to a chapter is optional.
    """

    __tablename__ = "ebook_chunk"
    __table_args__ = (
        UniqueConstraint(
            "source_id",
            "chunk_index",
            name="uq_ebook_chunk_source_id_chunk_index",
        ),
        UniqueConstraint(
            "source_id",
            "content_sha256",
            name="uq_ebook_chunk_source_id_content_sha256",
        ),
    )

    # Chunks are removed with their source; losing the chapter only clears the link.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    chapter_id: Mapped[int | None] = mapped_column(
        ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"),
    )

    chunk_index: Mapped[int]
    text: Mapped[str]
    token_start: Mapped[int]
    token_count: Mapped[int]
    page_label: Mapped[str | None]
    # Fixed 64-character column; presumably a hex SHA-256 of the chunk content -- confirm.
    content_sha256: Mapped[str] = mapped_column(String(64))
    search_text: Mapped[str]

    source: Mapped[EbookSource] = relationship(
        "EbookSource",
        back_populates="chunks",
    )
    chapter: Mapped[EbookChapter | None] = relationship(
        "EbookChapter",
        back_populates="chunks",
    )
class EbookEmbeddingModel(TableBase):
    """A supported embedding model.

    Models are identified by a unique ``name`` and record the ``dimension``
    of the vectors they produce.
    """

    __tablename__ = "ebook_embedding_model"

    name: Mapped[str] = mapped_column(String, unique=True)
    dimension: Mapped[int]
    # Python-side default only; new rows are non-default unless stated otherwise.
    is_default: Mapped[bool] = mapped_column(Boolean, default=False)
class EbookChunkEmbedding1024(TableBaseBig):
    """1024-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair.
    """

    __tablename__ = "ebook_chunk_embedding_1024"
    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)

    # Both foreign keys cascade: the row goes away with its chunk or its model.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    embedding: Mapped[list[float]] = mapped_column(Vector(1024))
class EbookChunkEmbedding2560(TableBaseBig):
    """2560-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair.
    """

    __tablename__ = "ebook_chunk_embedding_2560"
    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)

    # Both foreign keys cascade: the row goes away with its chunk or its model.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    embedding: Mapped[list[float]] = mapped_column(Vector(2560))
class EbookChunkEmbedding4096(TableBaseBig):
    """4096-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair.
    """

    __tablename__ = "ebook_chunk_embedding_4096"
    __table_args__ = (UniqueConstraint("chunk_id", "model_id"),)

    # Both foreign keys cascade: the row goes away with its chunk or its model.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    embedding: Mapped[list[float]] = mapped_column(Vector(4096))