"""EPUB parsing helpers.""" from __future__ import annotations import re from dataclasses import dataclass from typing import TYPE_CHECKING from bs4 import BeautifulSoup from ebooklib import ITEM_DOCUMENT, epub if TYPE_CHECKING: from pathlib import Path WHITESPACE_RE = re.compile(r"\s+") @dataclass(frozen=True) class ParsedChapter: """Text extracted from one EPUB spine document.""" title: str | None href: str | None text: str page_labels: tuple[str, ...] @dataclass(frozen=True) class ParsedEpub: """Parsed EPUB metadata and text.""" title: str author: str | None language: str | None publisher: str | None identifier: str | None chapters: tuple[ParsedChapter, ...] def parse_epub(path: Path) -> ParsedEpub: """Parse EPUB metadata and spine text.""" book = epub.read_epub(path) chapters = [] for item in book.get_items_of_type(ITEM_DOCUMENT): soup = BeautifulSoup(item.get_content(), "html.parser") title = chapter_title(soup) page_labels = tuple(extract_page_labels(soup)) text = clean_text(soup.get_text(" ")) if text: chapters.append(ParsedChapter(title=title, href=item.get_name(), text=text, page_labels=page_labels)) return ParsedEpub( title=metadata_value(book, "title") or path.stem, author=metadata_value(book, "creator"), language=metadata_value(book, "language"), publisher=metadata_value(book, "publisher"), identifier=metadata_value(book, "identifier"), chapters=tuple(chapters), ) def metadata_value(book: epub.EpubBook, name: str) -> str | None: """Return the first non-empty Dublin Core metadata value for a name.""" values = book.get_metadata("DC", name) if not values: return None value = values[0][0] return str(value).strip() or None def chapter_title(soup: BeautifulSoup) -> str | None: """Extract the best available title from an EPUB document soup.""" heading = soup.find(["h1", "h2", "h3"]) if heading is None: title = soup.find("title") if title is None: return None return clean_text(title.get_text(" ")) or None return clean_text(heading.get_text(" ")) or None def extract_page_labels(soup: BeautifulSoup) -> list[str]: """Extract EPUB page-break labels from a document soup.""" labels: list[str] = [] for tag in soup.find_all(attrs={"epub:type": "pagebreak"}): label = tag.get("title") or tag.get("aria-label") or tag.get_text(" ") clean = clean_text(str(label)) if clean: labels.append(clean) return labels def clean_text(text: str) -> str: """Normalize whitespace in extracted EPUB text.""" return WHITESPACE_RE.sub(" ", text).strip()