added ebook embedding to orm
This commit is contained in:
@@ -11,6 +11,15 @@ from python.orm.richie.contact import (
|
||||
Need,
|
||||
RelationshipType,
|
||||
)
|
||||
from python.orm.richie.ebook import (
|
||||
EbookChapter,
|
||||
EbookChunk,
|
||||
EbookChunkEmbedding1024,
|
||||
EbookChunkEmbedding2560,
|
||||
EbookChunkEmbedding4096,
|
||||
EbookEmbeddingModel,
|
||||
EbookSource,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Audiobook",
|
||||
@@ -19,6 +28,13 @@ __all__ = [
|
||||
"Contact",
|
||||
"ContactNeed",
|
||||
"ContactRelationship",
|
||||
"EbookChapter",
|
||||
"EbookChunk",
|
||||
"EbookChunkEmbedding1024",
|
||||
"EbookChunkEmbedding2560",
|
||||
"EbookChunkEmbedding4096",
|
||||
"EbookEmbeddingModel",
|
||||
"EbookSource",
|
||||
"Need",
|
||||
"RelationshipType",
|
||||
"RichieBase",
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
"""EPUB search models."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, String, UniqueConstraint
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from python.orm.richie.base import TableBase, TableBaseBig
|
||||
|
||||
|
||||
class EbookSource(TableBase):
    """An indexed EPUB file: its descriptive metadata plus a file fingerprint.

    ``file_path`` and ``file_sha256`` each carry their own unique constraint,
    so a given path, and a given digest, can appear in at most one row.
    """

    __tablename__ = "ebook_source"
    __table_args__ = (UniqueConstraint("file_path"), UniqueConstraint("file_sha256"))

    # Descriptive metadata; only the title is mandatory, the rest is nullable.
    title: Mapped[str]
    author: Mapped[str | None]
    language: Mapped[str | None]
    publisher: Mapped[str | None]
    identifier: Mapped[str | None]

    # Bookkeeping for the file on disk.
    file_path: Mapped[str]
    # 64-character column; presumably a hex-encoded SHA-256 digest -- confirm with the indexer.
    file_sha256: Mapped[str] = mapped_column(String(64))
    # Stored as a timezone-aware timestamp.
    file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    # BIGINT; presumably a size in bytes -- confirm with the indexer.
    file_size: Mapped[int] = mapped_column(BigInteger)

    # Child collections use the delete-orphan cascade. passive_deletes=True tells
    # the ORM not to load unloaded children on delete and to leave those rows to
    # the database's ON DELETE rule instead.
    chapters: Mapped[list[EbookChapter]] = relationship(
        "EbookChapter",
        cascade="all, delete-orphan",
        passive_deletes=True,
        back_populates="source",
    )
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        cascade="all, delete-orphan",
        passive_deletes=True,
        back_populates="source",
    )
|
||||
|
||||
|
||||
class EbookChapter(TableBase):
    """One chapter (or spine document) of an EPUB source.

    The pair ``(source_id, spine_index)`` is unique, so each spine position
    occurs at most once per source.
    """

    __tablename__ = "ebook_chapter"
    __table_args__ = (
        UniqueConstraint("source_id", "spine_index"),
    )

    # Owning source; the row is removed by the database when the source goes away.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    spine_index: Mapped[int]
    title: Mapped[str | None]
    href: Mapped[str | None]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")
    # NOTE(review): this collection cascades deletes (and deletes orphans) at the
    # ORM level, while the chunk-side foreign key is nullable with
    # ondelete="SET NULL". Loaded chunks would be deleted with the chapter, but
    # unloaded ones would only be detached by the database -- confirm which
    # behavior is intended.
    chunks: Mapped[list[EbookChunk]] = relationship(
        "EbookChunk",
        cascade="all, delete-orphan",
        passive_deletes=True,
        back_populates="chapter",
    )
|
||||
|
||||
|
||||
class EbookChunk(TableBaseBig):
    """A piece of searchable text belonging to an EPUB source.

    Within one source, both ``chunk_index`` and ``content_sha256`` are unique
    (two separate named constraints).
    """

    __tablename__ = "ebook_chunk"
    __table_args__ = (
        UniqueConstraint(
            "source_id",
            "chunk_index",
            name="uq_ebook_chunk_source_id_chunk_index",
        ),
        UniqueConstraint(
            "source_id",
            "content_sha256",
            name="uq_ebook_chunk_source_id_content_sha256",
        ),
    )

    # Owning source; chunks are removed by the database together with it.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_source.id", ondelete="CASCADE"),
    )
    # Optional chapter link; the database nulls it when the chapter is deleted.
    chapter_id: Mapped[int | None] = mapped_column(
        ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"),
    )
    chunk_index: Mapped[int]
    text: Mapped[str]
    token_start: Mapped[int]
    token_count: Mapped[int]
    page_label: Mapped[str | None]
    # 64-character column; presumably a hex-encoded SHA-256 of the chunk content -- confirm.
    content_sha256: Mapped[str] = mapped_column(String(64))
    # Presumably a normalized variant of ``text`` used for searching -- confirm with the indexer.
    search_text: Mapped[str]

    source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chunks")
    chapter: Mapped[EbookChapter | None] = relationship("EbookChapter", back_populates="chunks")
|
||||
|
||||
|
||||
class EbookEmbeddingModel(TableBase):
    """An embedding model that chunk embeddings can be attributed to."""

    __tablename__ = "ebook_embedding_model"

    # Model names are unique across the table.
    name: Mapped[str] = mapped_column(String, unique=True)
    # Presumably the length of the vectors this model produces -- confirm with the embedder.
    dimension: Mapped[int]
    # Defaults to False on insert; nothing here limits how many rows may be flagged.
    is_default: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||
|
||||
|
||||
class EbookChunkEmbedding1024(TableBaseBig):
    """Embedding of a chunk stored as a 1024-dimensional vector.

    A chunk has at most one row here per embedding model, enforced by the
    unique constraint on ``(chunk_id, model_id)``.
    """

    __tablename__ = "ebook_chunk_embedding_1024"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    # Both parents cascade: the database drops the embedding when either the
    # chunk or the embedding model row is deleted.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    embedding: Mapped[list[float]] = mapped_column(Vector(1024))
|
||||
|
||||
|
||||
class EbookChunkEmbedding2560(TableBaseBig):
    """Embedding of a chunk stored as a 2560-dimensional vector.

    A chunk has at most one row here per embedding model, enforced by the
    unique constraint on ``(chunk_id, model_id)``.
    """

    __tablename__ = "ebook_chunk_embedding_2560"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    # Both parents cascade: the database drops the embedding when either the
    # chunk or the embedding model row is deleted.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # NOTE(review): pgvector documents a 2,000-dimension ceiling for HNSW/IVFFlat
    # indexes on the plain ``vector`` type -- check this before adding an ANN
    # index on this column.
    embedding: Mapped[list[float]] = mapped_column(Vector(2560))
|
||||
|
||||
|
||||
class EbookChunkEmbedding4096(TableBaseBig):
    """Embedding of a chunk stored as a 4096-dimensional vector.

    A chunk has at most one row here per embedding model, enforced by the
    unique constraint on ``(chunk_id, model_id)``.
    """

    __tablename__ = "ebook_chunk_embedding_4096"
    __table_args__ = (
        UniqueConstraint("chunk_id", "model_id"),
    )

    # Both parents cascade: the database drops the embedding when either the
    # chunk or the embedding model row is deleted.
    chunk_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"),
    )
    model_id: Mapped[int] = mapped_column(
        ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"),
    )
    # NOTE(review): pgvector documents a 2,000-dimension ceiling for HNSW/IVFFlat
    # indexes on the plain ``vector`` type -- check this before adding an ANN
    # index on this column.
    embedding: Mapped[list[float]] = mapped_column(Vector(4096))
|
||||
Reference in New Issue
Block a user