built ingest

2026-06-12 03:06:16 -04:00
parent d740b25b2c
commit c5293b0dcf
2 changed files with 285 additions and 0 deletions
@@ -0,0 +1,95 @@
 """EPUB parsing helpers."""
 from __future__ import annotations
 import re
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup
 from ebooklib import ITEM_DOCUMENT, epub
 if TYPE_CHECKING:
    from pathlib import Path
 WHITESPACE_RE = re.compile(r"\s+")
@dataclass(frozen=True)
 class ParsedChapter:
    """Text extracted from one EPUB spine document."""
    title: str | None
    href: str | None
    text: str
    page_labels: tuple[str, ...]
@dataclass(frozen=True)
 class ParsedEpub:
    """Parsed EPUB metadata and text."""
    title: str
    author: str | None
    language: str | None
    publisher: str | None
    identifier: str | None
    chapters: tuple[ParsedChapter, ...]
 def parse_epub(path: Path) -> ParsedEpub:
    """Parse EPUB metadata and spine text."""
    book = epub.read_epub(path)
    chapters = []
    for item in book.get_items_of_type(ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_content(), "html.parser")
        title = chapter_title(soup)
        page_labels = tuple(extract_page_labels(soup))
        text = clean_text(soup.get_text(" "))
        if text:
            chapters.append(ParsedChapter(title=title, href=item.get_name(), text=text, page_labels=page_labels))
    return ParsedEpub(
        title=metadata_value(book, "title") or path.stem,
        author=metadata_value(book, "creator"),
        language=metadata_value(book, "language"),
        publisher=metadata_value(book, "publisher"),
        identifier=metadata_value(book, "identifier"),
        chapters=tuple(chapters),
    )
 def metadata_value(book: epub.EpubBook, name: str) -> str | None:
    """Return the first non-empty Dublin Core metadata value for a name."""
    values = book.get_metadata("DC", name)
    if not values:
        return None
    value = values[0][0]
    return str(value).strip() or None
 def chapter_title(soup: BeautifulSoup) -> str | None:
    """Extract the best available title from an EPUB document soup."""
    heading = soup.find(["h1", "h2", "h3"])
    if heading is None:
        title = soup.find("title")
        if title is None:
            return None
        return clean_text(title.get_text(" ")) or None
    return clean_text(heading.get_text(" ")) or None
 def extract_page_labels(soup: BeautifulSoup) -> list[str]:
    """Extract EPUB page-break labels from a document soup."""
    labels: list[str] = []
    for tag in soup.find_all(attrs={"epub:type": "pagebreak"}):
        label = tag.get("title") or tag.get("aria-label") or tag.get_text(" ")
        clean = clean_text(str(label))
        if clean:
            labels.append(clean)
    return labels
 def clean_text(text: str) -> str:
    """Normalize whitespace in extracted EPUB text."""
    return WHITESPACE_RE.sub(" ", text).strip()
@@ -0,0 +1,190 @@
 """EPUB ingestion into Richie DB."""
 from __future__ import annotations
 import hashlib
 import logging
 from dataclasses import dataclass
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING
 import tiktoken
 from sqlalchemy import or_, select
 from python.ebook_search.epub_parse import parse_epub
 from python.orm.richie import EbookChapter, EbookChunk, EbookSource
 logger = logging.getLogger(__name__)
 DEFAULT_CHUNK_TOKENS = 700
 DEFAULT_CHUNK_OVERLAP = 100
 if TYPE_CHECKING:
    from sqlalchemy.orm import Session
    from python.ebook_search.config import EbookSearchConfig
    from python.ebook_search.epub_parse import ParsedChapter
@dataclass(frozen=True)
 class TextChunk:
    """A token-bounded chunk of text."""
    text: str
    token_start: int
    token_count: int
 def chunk_text(
    text: str,
    *,
    chunk_tokens: int = DEFAULT_CHUNK_TOKENS,
    overlap_tokens: int = DEFAULT_CHUNK_OVERLAP,
 ) -> list[TextChunk]:
    """Split text into overlapping token chunks."""
    if chunk_tokens <= 0:
        msg = "chunk_tokens must be positive"
        raise ValueError(msg)
    if overlap_tokens < 0 or overlap_tokens >= chunk_tokens:
        msg = "overlap_tokens must be non-negative and smaller than chunk_tokens"
        raise ValueError(msg)
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    if not tokens:
        return []
    chunks: list[TextChunk] = []
    step = chunk_tokens - overlap_tokens
    for start in range(0, len(tokens), step):
        chunk = tokens[start : start + chunk_tokens]
        if not chunk:
            continue
        chunks.append(
            TextChunk(
                text=encoding.decode(chunk).strip(),
                token_start=start,
                token_count=len(chunk),
            )
        )
        if start + chunk_tokens >= len(tokens):
            break
    return [chunk for chunk in chunks if chunk.text]
 def ingest_configured_paths(session: Session, config: EbookSearchConfig) -> int:
    """Ingest every EPUB found under configured library paths."""
    count = 0
    for library_path in config.library_paths:
        path = Path(library_path).expanduser()
        logger.info("ebook_ingest_path_start path=%s", path)
        if path.is_file() and path.suffix.lower() == ".epub":
            count += int(ingest_file(session, path))
        elif path.is_dir():
            for epub_path in sorted(path.rglob("*.epub")):
                count += int(ingest_file(session, epub_path))
        else:
            logger.warning("ebook_ingest_path_missing path=%s", path)
    logger.info("ebook_ingest_paths_complete changed_files=%s configured_paths=%s", count, len(config.library_paths))
    return count
 def ingest_file(session: Session, path: Path) -> bool:
    """Ingest one EPUB file. Return True when the database changed."""
    resolved_path = path.expanduser().resolve()
    logger.info("ebook_ingest_file_start path=%s", resolved_path)
    file_hash = sha256_file(resolved_path)
    existing = find_existing_source(session, resolved_path, file_hash)
    if existing is not None and existing.file_sha256 == file_hash:
        stat = resolved_path.stat()
        existing.file_path = str(resolved_path)
        existing.file_mtime = datetime.fromtimestamp(stat.st_mtime, tz=UTC)
        existing.file_size = stat.st_size
        session.flush()
        logger.info("ebook_ingest_file_unchanged source_id=%s path=%s", existing.id, resolved_path)
        return False
    if existing is not None:
        logger.info("ebook_ingest_file_replacing source_id=%s path=%s", existing.id, resolved_path)
        session.delete(existing)
        session.flush()
    stat = resolved_path.stat()
    parsed = parse_epub(resolved_path)
    source = EbookSource(
        title=parsed.title,
        author=parsed.author,
        language=parsed.language,
        publisher=parsed.publisher,
        identifier=parsed.identifier,
        file_path=str(resolved_path),
        file_sha256=file_hash,
        file_mtime=datetime.fromtimestamp(stat.st_mtime, tz=UTC),
        file_size=stat.st_size,
    )
    session.add(source)
    session.flush()
    chunk_index = 0
    for spine_index, parsed_chapter in enumerate(parsed.chapters):
        chapter = EbookChapter(
            source_id=source.id,
            spine_index=spine_index,
            title=parsed_chapter.title,
            href=parsed_chapter.href,
        )
        session.add(chapter)
        session.flush()
        chunk_index = add_chapter_chunks(session, source, chapter, parsed_chapter, chunk_index)
    session.flush()
    logger.info(
        "ebook_ingest_file_complete source_id=%s path=%s chapters=%s chunks=%s",
        source.id,
        resolved_path,
        len(parsed.chapters),
        chunk_index,
    )
    return True
 def find_existing_source(session: Session, path: Path, file_hash: str) -> EbookSource | None:
    """Find an existing source by canonical path or file hash."""
    return session.scalar(
        select(EbookSource).where(or_(EbookSource.file_path == str(path), EbookSource.file_sha256 == file_hash))
    )
 def add_chapter_chunks(
    session: Session,
    source: EbookSource,
    chapter: EbookChapter,
    parsed_chapter: ParsedChapter,
    chunk_index: int,
 ) -> int:
    """Add chunk rows for one parsed chapter and return the next chunk index."""
    page_label = parsed_chapter.page_labels[0] if parsed_chapter.page_labels else None
    for text_chunk in chunk_text(parsed_chapter.text):
        session.add(
            EbookChunk(
                source_id=source.id,
                chapter_id=chapter.id,
                chunk_index=chunk_index,
                text=text_chunk.text,
                token_start=text_chunk.token_start,
                token_count=text_chunk.token_count,
                page_label=page_label,
                content_sha256=hashlib.sha256(text_chunk.text.encode()).hexdigest(),
                search_text=f"{source.title} {source.author or ''} {chapter.title or ''} {text_chunk.text}",
            )
        )
        chunk_index += 1
    return chunk_index
 def sha256_file(path: Path) -> str:
    """Calculate the SHA-256 digest for a file."""
    digest = hashlib.sha256()
    with path.open("rb") as file:
        for block in iter(lambda: file.read(1024 * 1024), b""):
            digest.update(block)
    return digest.hexdigest()