131 lines
4.5 KiB
Python
131 lines
4.5 KiB
Python
"""EPUB search models."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
|
|
from pgvector.sqlalchemy import Vector
|
|
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
|
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
|
|
|
from python.orm.richie.base import TableBase, TableBaseBig
|
|
|
|
|
|
class EbookSource(TableBase):
    """One indexed EPUB file.

    A row is unique on ``file_path`` and, independently, on ``file_sha256``.
    Chapters and chunks reference a source through ``source_id`` foreign keys
    declared with ``ON DELETE CASCADE``.
    """

    __tablename__ = "ebook_source"
    __table_args__ = (UniqueConstraint("file_path"), UniqueConstraint("file_sha256"))

    # Book-level fields; ``title`` is the only one that is not optional.
    title: Mapped[str]
    author: Mapped[str | None]
    language: Mapped[str | None]
    publisher: Mapped[str | None]
    identifier: Mapped[str | None]

    # File-level fields describing the indexed file.
    file_path: Mapped[str]
    # 64 characters wide -- presumably a hex SHA-256 digest; confirm against the indexer.
    file_sha256: Mapped[str] = mapped_column(String(64))
    file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    file_size: Mapped[int] = mapped_column(BigInteger)

    # Children are owned by the source: the ORM cascades deletes to loaded
    # children, and passive_deletes=True leaves unloaded ones to the database's
    # ON DELETE CASCADE rule instead of loading them first.
    chapters: Mapped[list[EbookChapter]] = relationship(
        "EbookChapter",
        passive_deletes=True,
        cascade="all, delete-orphan",
        back_populates="source",
    )
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        passive_deletes=True,
        cascade="all, delete-orphan",
        back_populates="source",
    )
|
|
|
|
|
|
class EbookChapter(TableBase):
    """A chapter or spine document inside an EPUB.

    Every chapter belongs to one ``EbookSource`` and is removed with it
    (``ON DELETE CASCADE`` on ``source_id``). ``spine_index`` is unique within
    a source.
    """

    __tablename__ = "ebook_chapter"
    __table_args__ = (UniqueConstraint("source_id", "spine_index"),)

    source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
    spine_index: Mapped[int]
    title: Mapped[str | None]
    href: Mapped[str | None]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")
    # Deliberately no delete / delete-orphan cascade: EbookChunk.chapter_id is
    # nullable and declared ON DELETE SET NULL, so a chunk outlives its chapter
    # (it stays attached to its source). An ORM-level delete cascade here would
    # delete loaded chunks while the database only NULLs unloaded ones, making
    # the result depend on what happened to be loaded in the session.
    # passive_deletes=True leaves unloaded chunks to the database rule rather
    # than loading them just to null out the foreign key.
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        back_populates="chapter",
        passive_deletes=True,
    )
|
|
|
|
|
|
class EbookChunk(TableBaseBig):
    """A searchable text chunk.

    A chunk always belongs to a source and may optionally point at a chapter.
    Within one source, both ``chunk_index`` and ``content_sha256`` are unique.
    """

    __tablename__ = "ebook_chunk"
    __table_args__ = (
        UniqueConstraint(
            "source_id",
            "chunk_index",
            name="uq_ebook_chunk_source_id_chunk_index",
        ),
        UniqueConstraint(
            "source_id",
            "content_sha256",
            name="uq_ebook_chunk_source_id_content_sha256",
        ),
    )

    # Deleting the source deletes the chunk; deleting the chapter only clears
    # the chapter reference.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    chapter_id: Mapped[int | None] = mapped_column(
        ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"),
    )

    chunk_index: Mapped[int]
    text: Mapped[str]
    # NOTE(review): presumably the chunk's offset and length in tokens -- confirm with the chunker.
    token_start: Mapped[int]
    token_count: Mapped[int]
    page_label: Mapped[str | None]
    # 64 characters wide -- presumably a hex SHA-256 digest of the chunk content; confirm.
    content_sha256: Mapped[str] = mapped_column(String(64))
    search_text: Mapped[str]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chunks")
    chapter: Mapped[EbookChapter | None] = relationship("EbookChapter", back_populates="chunks")
|
|
|
|
|
|
class EbookEmbeddingModel(TableBase):
    """A supported embedding model.

    Models are identified by a unique ``name`` and record the ``dimension`` of
    the vectors they produce.
    """

    __tablename__ = "ebook_embedding_model"

    name: Mapped[str] = mapped_column(String, unique=True)
    dimension: Mapped[int]
    # Python-side default of False; nothing in this model restricts how many
    # rows may have the flag set.
    is_default: Mapped[bool] = mapped_column(Boolean, default=False)
|
|
|
|
|
|
class EbookChunkEmbedding1024(TableBaseBig):
    """1024-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys
    are declared ``ON DELETE CASCADE``.
    """

    __tablename__ = "ebook_chunk_embedding_1024"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Fixed-width pgvector column; the width matches the table name suffix.
    embedding: Mapped[list[float]] = mapped_column(Vector(1024))
|
|
|
|
|
|
class EbookChunkEmbedding2560(TableBaseBig):
    """2560-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys
    are declared ``ON DELETE CASCADE``.
    """

    __tablename__ = "ebook_chunk_embedding_2560"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Fixed-width pgvector column; the width matches the table name suffix.
    embedding: Mapped[list[float]] = mapped_column(Vector(2560))
|
|
|
|
|
|
class EbookChunkEmbedding4096(TableBaseBig):
    """4096-dimensional chunk embedding.

    At most one row exists per ``(chunk_id, model_id)`` pair. Both foreign keys
    are declared ``ON DELETE CASCADE``.
    """

    __tablename__ = "ebook_chunk_embedding_4096"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # Fixed-width pgvector column; the width matches the table name suffix.
    embedding: Mapped[list[float]] = mapped_column(Vector(4096))
|