built ingest
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
"""EPUB parsing helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import ITEM_DOCUMENT, epub
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
WHITESPACE_RE = re.compile(r"\s+")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedChapter:
    """Immutable record holding the text pulled from a single EPUB document.

    Instances are hashable and compare by value because the dataclass is
    frozen.
    """

    # Chapter title, or None when no title could be determined.
    title: str | None
    # Name/href of the document inside the EPUB, or None when unknown.
    href: str | None
    # Extracted chapter text.
    text: str
    # Page-break labels collected from the document, as a tuple of strings.
    page_labels: tuple[str, ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedEpub:
    """Immutable bundle of an EPUB's descriptive metadata and chapter text.

    Instances are hashable and compare by value because the dataclass is
    frozen.
    """

    # Book title (always present).
    title: str
    # Author of the book, or None when not available.
    author: str | None
    # Language of the book, or None when not available.
    language: str | None
    # Publisher of the book, or None when not available.
    publisher: str | None
    # Book identifier, or None when not available.
    identifier: str | None
    # Parsed chapters, one ParsedChapter per retained document.
    chapters: tuple[ParsedChapter, ...]
|
||||
|
||||
|
||||
def parse_epub(path: Path) -> ParsedEpub:
    """Read the EPUB at *path* and return its metadata and document text.

    Each ``ITEM_DOCUMENT`` item of the book is parsed with BeautifulSoup's
    ``html.parser``. Documents whose whitespace-normalized text is empty are
    left out of the result. When the book has no Dublin Core title, the file
    stem of *path* is used as the title instead.
    """
    book = epub.read_epub(path)

    parsed: list[ParsedChapter] = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        markup = BeautifulSoup(document.get_content(), "html.parser")
        heading = chapter_title(markup)
        labels = tuple(extract_page_labels(markup))
        body = clean_text(markup.get_text(" "))
        if not body:
            # No text survived normalization; skip this document.
            continue
        parsed.append(
            ParsedChapter(
                title=heading,
                href=document.get_name(),
                text=body,
                page_labels=labels,
            )
        )

    book_title = metadata_value(book, "title") or path.stem
    return ParsedEpub(
        title=book_title,
        author=metadata_value(book, "creator"),
        language=metadata_value(book, "language"),
        publisher=metadata_value(book, "publisher"),
        identifier=metadata_value(book, "identifier"),
        chapters=tuple(parsed),
    )
|
||||
|
||||
|
||||
def metadata_value(book: epub.EpubBook, name: str) -> str | None:
    """Return the first non-empty Dublin Core metadata value for *name*.

    Args:
        book: Object exposing ``get_metadata(namespace, name)``; each returned
            entry is indexed at position 0 to obtain the value.
        name: Dublin Core element name, e.g. ``"title"`` or ``"creator"``.

    Returns:
        The first value that is not ``None`` and is non-blank once stripped,
        or ``None`` when no entry qualifies.
    """
    for entry in book.get_metadata("DC", name) or ():
        value = entry[0]
        if value is None:
            # str(None) would yield the literal text "None"; skip instead.
            continue
        text = str(value).strip()
        if text:
            return text
    return None
|
||||
|
||||
|
||||
def chapter_title(soup: BeautifulSoup) -> str | None:
    """Extract the best available title from an EPUB document soup.

    The first ``h1``/``h2``/``h3`` element (in document order) whose
    whitespace-normalized text is non-empty is used. If no heading carries
    text, the document's ``<title>`` element is used instead.

    Args:
        soup: Parsed document to inspect.

    Returns:
        The normalized title text, or ``None`` when neither a heading nor the
        ``<title>`` element yields any text.
    """
    # An empty heading must not hide a later heading or the <title> fallback,
    # so keep scanning until one produces text.
    for heading in soup.find_all(["h1", "h2", "h3"]):
        heading_text = clean_text(heading.get_text(" "))
        if heading_text:
            return heading_text

    title = soup.find("title")
    if title is None:
        return None
    return clean_text(title.get_text(" ")) or None
|
||||
|
||||
|
||||
def extract_page_labels(soup: BeautifulSoup) -> list[str]:
    """Collect the labels of ``epub:type="pagebreak"`` elements in *soup*.

    For each matching element the label is taken from its ``title``
    attribute, else its ``aria-label`` attribute, else its text content.
    Labels are whitespace-normalized and empty ones are dropped.
    """
    markers = soup.find_all(attrs={"epub:type": "pagebreak"})
    candidates = (
        clean_text(
            str(
                marker.get("title")
                or marker.get("aria-label")
                or marker.get_text(" ")
            )
        )
        for marker in markers
    )
    return [candidate for candidate in candidates if candidate]
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Collapse each whitespace run in *text* to one space and trim the ends."""
    collapsed = WHITESPACE_RE.sub(" ", text)
    return collapsed.strip()
|
||||
Reference in New Issue
Block a user