From 9290cb46ee786b76ce497da8aa186ec5ced44699 Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Wed, 10 Jun 2026 20:07:27 -0400 Subject: [PATCH] updated series_index to float and added UniqueConstraint to audiobook and audiobook_author --- ..._index_to_float_and_added__b3c60cc5beb5.py | 63 ++++++++ python/orm/richie/audiobook.py | 11 +- python/tools/audiobook/audible_convert.py | 29 +++- python/tools/audiobook/llm_tool_calling.py | 54 +++++-- python/tools/audiobook/metadata_agent.py | 17 +- tests/test_audible_convert.py | 153 +++++++++++++++++- 6 files changed, 307 insertions(+), 20 deletions(-) create mode 100644 python/alembic/richie/versions/2026_06_10-updated_series_index_to_float_and_added__b3c60cc5beb5.py diff --git a/python/alembic/richie/versions/2026_06_10-updated_series_index_to_float_and_added__b3c60cc5beb5.py b/python/alembic/richie/versions/2026_06_10-updated_series_index_to_float_and_added__b3c60cc5beb5.py new file mode 100644 index 0000000..6c8d67e --- /dev/null +++ b/python/alembic/richie/versions/2026_06_10-updated_series_index_to_float_and_added__b3c60cc5beb5.py @@ -0,0 +1,63 @@ +"""updated series_index to float and added UniqueConstraint to audiobook and audiobook_author. + +Revision ID: b3c60cc5beb5 +Revises: d7864d1ffc17 +Create Date: 2026-06-10 20:02:43.073725 + +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import sqlalchemy as sa +from alembic import op + +from python.orm import RichieBase + +if TYPE_CHECKING: + from collections.abc import Sequence + +# revision identifiers, used by Alembic. +revision: str = "b3c60cc5beb5" +down_revision: str | None = "d7864d1ffc17" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +schema = RichieBase.schema_name + + +def upgrade() -> None: + """Upgrade.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column( + "audiobook", + "series_index", + existing_type=sa.INTEGER(), + type_=sa.Float(), + existing_nullable=False, + schema=schema, + ) + op.create_unique_constraint( + op.f("uq_audiobook_author_id"), + "audiobook", + ["author_id", "series_id", "title"], + schema=schema, + postgresql_nulls_not_distinct=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(op.f("uq_audiobook_author_id"), "audiobook", schema=schema, type_="unique") + op.alter_column( + "audiobook", + "series_index", + existing_type=sa.Float(), + type_=sa.INTEGER(), + existing_nullable=False, + schema=schema, + ) + # ### end Alembic commands ### diff --git a/python/orm/richie/audiobook.py b/python/orm/richie/audiobook.py index 8ab78ea..0b8d6d4 100644 --- a/python/orm/richie/audiobook.py +++ b/python/orm/richie/audiobook.py @@ -12,6 +12,7 @@ class AudiobookAuthor(TableBase): """Canonical audiobook author.""" __tablename__ = "audiobook_author" + __table_args__ = (UniqueConstraint("name"),) name: Mapped[str] = mapped_column(String, unique=True) @@ -36,11 +37,19 @@ class Audiobook(TableBase): """Canonical audiobook title.""" __tablename__ = "audiobook" + __table_args__ = ( + UniqueConstraint( + "author_id", + "series_id", + "title", + postgresql_nulls_not_distinct=True, + ), + ) title: Mapped[str] = mapped_column(String) author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE")) series_id: Mapped[int | None] = mapped_column(ForeignKey("main.audiobook_series.id", ondelete="SET NULL")) - series_index: Mapped[int] = mapped_column(default=0) + series_index: Mapped[float] = mapped_column(default=0.0) author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="books") series: Mapped[AudiobookSeries | None] = relationship("AudiobookSeries", back_populates="books") diff --git 
a/python/tools/audiobook/audible_convert.py b/python/tools/audiobook/audible_convert.py index 4d38c3a..4957b04 100644 --- a/python/tools/audiobook/audible_convert.py +++ b/python/tools/audiobook/audible_convert.py @@ -4,6 +4,7 @@ from __future__ import annotations import json import logging +import re import shutil import subprocess from concurrent.futures import ThreadPoolExecutor @@ -30,6 +31,7 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) SENSITIVE_COMMAND_ARGUMENTS = {"-activation_bytes"} +BOOK_RANGE_PATTERN = re.compile(r"(?:^|-)books?-(?P<start>[1-9]\d*)-(?P<end>[1-9]\d*)(?:-|$)") @dataclass(frozen=True) @@ -178,7 +180,32 @@ def output_stem(metadata: StandardBookMetadata) -> str: Returns: Output stem in author-series_01-title form. """ - return f"{metadata.author}-{metadata.series}_{metadata.series_index:02}-{metadata.title}" + index_slug = series_index_slug(metadata.series_index, metadata.title) + return f"{metadata.author}-{metadata.series}_{index_slug}-{metadata.title}" + + +def series_index_slug(series_index: float, title: str = "") -> str: + """Return a filename-safe series index.""" + if title_range := title_series_range_slug(series_index, title): + return title_range + index = float(series_index) + if index.is_integer(): + return f"{int(index):02}" + return f"{int(index):02}.5" + + +def title_series_range_slug(series_index: float, title: str) -> str | None: + """Return a series range slug found in an omnibus title.""" + index = float(series_index) + if not index.is_integer(): + return None + first_index = int(index) + for match in BOOK_RANGE_PATTERN.finditer(title): + start = int(match.group("start")) + end = int(match.group("end")) + if start == first_index and end > start: + return f"{start:02}-{end:02}" + return None def metadata_output_path(output_directory: Path, metadata: StandardBookMetadata) -> Path: diff --git a/python/tools/audiobook/llm_tool_calling.py b/python/tools/audiobook/llm_tool_calling.py index 03aebb3..fd790c0 100644 ---
a/python/tools/audiobook/llm_tool_calling.py +++ b/python/tools/audiobook/llm_tool_calling.py @@ -144,7 +144,7 @@ class CatalogToolRegistry: "title": {"type": "string"}, "author_id": {"type": "integer"}, "series_id": {"type": ["integer", "null"]}, - "series_index": {"type": "integer"}, + "series_index": {"type": "number", "multipleOf": 0.5}, }, "required": ["title", "author_id", "series_id", "series_index"], }, @@ -306,12 +306,7 @@ class CatalogToolRegistry: author_id = required_int(arguments, "author_id") validate_catalog_slug(name, "series") author = self.required_author(author_id) - series = self.session.scalar( - select(AudiobookSeries).where( - AudiobookSeries.name == name, - AudiobookSeries.author_id == author.id, - ), - ) + series = self.find_series_by_catalog_slug(name, author.id) action = "existing" if series is None: series = AudiobookSeries(name=name, author=author) @@ -329,7 +324,7 @@ class CatalogToolRegistry: title = required_string(arguments, "title") author_id = required_int(arguments, "author_id") series_id = optional_int(arguments.get("series_id"), "series_id") - series_index = required_int(arguments, "series_index") + series_index = required_series_index(arguments, "series_index") ensured = self.ensure_book(title, author_id, series_id, series_index) return [self.book_result(ensured.book, ensured.action)] @@ -338,7 +333,7 @@ class CatalogToolRegistry: title: str, author_id: int, series_id: int | None, - series_index: int, + series_index: float, ) -> EnsuredBook: """Return an existing book row, or create it after validating ownership.""" title = normalize_title_slug(title) @@ -398,6 +393,26 @@ class CatalogToolRegistry: raise MetadataResolutionError(msg) return series + def find_series_by_catalog_slug(self, name: str, author_id: int) -> AudiobookSeries | None: + """Return a series by exact slug or underscore-insensitive slug.""" + exact = self.session.scalar( + select(AudiobookSeries).where( + AudiobookSeries.name == name, + 
AudiobookSeries.author_id == author_id, + ), + ) + if exact is not None: + return exact + + compact_name = compact_catalog_slug(name) + series_rows = self.session.scalars( + select(AudiobookSeries).where(AudiobookSeries.author_id == author_id).order_by(AudiobookSeries.name), + ).all() + for series in series_rows: + if compact_catalog_slug(series.name) == compact_name: + return series + return None + def series_result(self, series: AudiobookSeries, action: str) -> dict[str, object]: """Build a normalized series tool result.""" return { @@ -513,6 +528,11 @@ def normalize_catalog_slug(value: str) -> str: return re.sub(r"[^a-z0-9]+", "_", value.strip().casefold()).strip("_") +def compact_catalog_slug(value: str) -> str: + """Return a catalog slug comparison key that ignores underscores.""" + return normalize_catalog_slug(value).replace("_", "") + + def normalize_title_slug(value: str) -> str: """Normalize noisy book titles into lower kebab-case slugs.""" return re.sub(r"[^a-z0-9]+", "-", value.strip().casefold()).strip("-") @@ -533,8 +553,9 @@ def query_terms(query: str) -> tuple[str, ...]: """Return text variants useful for matching noisy audiobook metadata.""" normalized = query.strip().casefold() underscore_slug = normalize_catalog_slug(normalized) + compact_slug = compact_catalog_slug(normalized) hyphen_slug = normalize_title_slug(normalized) - return tuple(dict.fromkeys(term for term in (normalized, underscore_slug, hyphen_slug) if term)) + return tuple(dict.fromkeys(term for term in (normalized, underscore_slug, compact_slug, hyphen_slug) if term)) def required_string(data: dict[str, object], key: str) -> str: @@ -555,6 +576,19 @@ def required_int(data: dict[str, object], key: str) -> int: return value +def required_series_index(data: dict[str, object], key: str) -> float: + """Read a required whole-number or half-number series index.""" + value = data.get(key) + if isinstance(value, bool) or not isinstance(value, int | float): + msg = f"{key} must be a number" + 
raise MetadataResolutionError(msg) + series_index = float(value) + if not (series_index * 2).is_integer(): + msg = f"{key} must be a whole number or .5 increment" + raise MetadataResolutionError(msg) + return series_index + + def optional_int(value: object, key: str) -> int | None: """Read an optional integer field.""" if value is None: diff --git a/python/tools/audiobook/metadata_agent.py b/python/tools/audiobook/metadata_agent.py index 1828c79..63a2035 100644 --- a/python/tools/audiobook/metadata_agent.py +++ b/python/tools/audiobook/metadata_agent.py @@ -19,6 +19,7 @@ from python.tools.audiobook.llm_tool_calling import ( optional_int, parse_tool_calls, required_int, + required_series_index, required_string, run_tool_calls, validate_catalog_slug, @@ -67,7 +68,7 @@ class StandardBookMetadata: title: str series_id: int | None series: str - series_index: int + series_index: float confidence: float needs_review: bool evidence: list[str] @@ -81,7 +82,7 @@ class FinalMetadataFields: book_id: int | None title: str series_id: int | None - series_index: int + series_index: float confidence: float evidence: list[str] @@ -93,7 +94,7 @@ class ResolvedBookFields: book_id: int | None title: str series_id: int | None - series_index: int + series_index: float @dataclass(frozen=True) @@ -283,7 +284,7 @@ class AudiobookMetadataAgent: "model": self._config.model, "messages": messages, "stream": False, - "options": {"temperature": 0}, + "options": {"temperature": 0.1}, } tool_names = [] if tools_enabled: @@ -403,7 +404,7 @@ class AudiobookMetadataAgent: series_index=book.series_index, ) - def validate_series(self, author_id: int, series_id: int | None, series_index: int) -> str: + def validate_series(self, author_id: int, series_id: int | None, series_index: float) -> str: """Validate final series fields and return the canonical series slug.""" if series_id is None: if series_index != 0: @@ -467,7 +468,9 @@ Rules: - The final JSON object must contain author_id, book_id, title, 
series_id, series_index, confidence, and evidence. - title must be a canonical title slug using lower-case words separated by hyphens. - Use series_id null and series_index 0 for standalone books. -- If you use a series_id, series_index must be an integer greater than or equal to 1. +- If you use a series_id, series_index must be a whole number or .5 value greater than 0. +- Treat series slugs that differ only by underscores as the same series. Prefer the existing catalog row instead of + creating a new series. - Detect omnibus or box-set editions that contain multiple numbered novels, books, or novellas. - For an omnibus, make a best-effort range from the filename, tags, and catalog rows. Keep series_index as the first covered book number and include the range in the title when the source title includes it, for example @@ -524,7 +527,7 @@ def parse_final_metadata_fields(raw_metadata: object) -> FinalMetadataFields: book_id=optional_int(data.get("book_id"), "book_id"), title=required_string(data, "title"), series_id=optional_int(data.get("series_id"), "series_id"), - series_index=required_int(data, "series_index"), + series_index=required_series_index(data, "series_index"), confidence=required_float(data, "confidence"), evidence=required_string_list(data, "evidence"), ) diff --git a/tests/test_audible_convert.py b/tests/test_audible_convert.py index 22cdbe3..28e9c5f 100644 --- a/tests/test_audible_convert.py +++ b/tests/test_audible_convert.py @@ -6,7 +6,8 @@ import json import subprocess import pytest -from sqlalchemy import create_engine +from sqlalchemy import create_engine, select +from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, sessionmaker from python.orm.richie import Audiobook, AudiobookAuthor, AudiobookSeries, RichieBase @@ -113,6 +114,62 @@ def test_output_stem_uses_catalog_slugs() -> None: assert audible_convert.output_stem(metadata) == "glynn_stewart-starships_mage_01-title-slug" +def 
test_output_stem_formats_half_series_index() -> None: + metadata = StandardBookMetadata( + author_id=1, + author="glynn_stewart", + book_id=None, + title="title-slug", + series_id=1, + series="starships_mage", + series_index=1.5, + confidence=0.96, + needs_review=False, + evidence=["test"], + ) + + assert audible_convert.output_stem(metadata) == "glynn_stewart-starships_mage_01.5-title-slug" + + +@pytest.mark.parametrize( + ("metadata", "expected"), + [ + ( + StandardBookMetadata( + author_id=1, + author="mark_e_cooper", + book_id=None, + title="merkiaari-wars-series-books-1-3", + series_id=1, + series="merkiaari_wars", + series_index=1, + confidence=0.96, + needs_review=False, + evidence=["test"], + ), + "mark_e_cooper-merkiaari_wars_01-03-merkiaari-wars-series-books-1-3", + ), + ( + StandardBookMetadata( + author_id=1, + author="rhett_c_bruno", + book_id=None, + title="the-circuit-books-1-3", + series_id=1, + series="the_circuit", + series_index=1, + confidence=0.96, + needs_review=False, + evidence=["test"], + ), + "rhett_c_bruno-the_circuit_01-03-the-circuit-books-1-3", + ), + ], +) +def test_output_stem_formats_omnibus_book_range(metadata, expected) -> None: + assert audible_convert.output_stem(metadata) == expected + + def test_convert_aax_file_runs_ffmpeg(tmp_path, monkeypatch) -> None: """test_convert_aax_file_runs_ffmpeg.""" commands = [] @@ -196,6 +253,8 @@ def test_system_prompt_instructs_agent_to_detect_omnibuses() -> None: assert "Detect omnibus or box-set editions" in prompt assert "books-1-3" in prompt assert "Keep series_index as the" in prompt + assert "series_index must be a whole number or .5 value" in prompt + assert "differ only by underscores" in prompt def test_standard_book_metadata_accepts_valid_tool_output(tmp_path, monkeypatch, audiobook_engine) -> None: @@ -666,6 +725,85 @@ def test_standard_book_metadata_can_create_missing_catalog_rows( assert book.series_id == series.id +def 
test_standard_book_metadata_accepts_half_series_index(tmp_path, monkeypatch, audiobook_engine) -> None: + install_fake_ollama( + monkeypatch, + [ + tool_response("search_series", {"query": "bobiverse", "author_id": 4}), + final_response( + { + "author_id": 4, + "book_id": None, + "title": "bobiverse-short", + "series_id": 4, + "series_index": 1.5, + "confidence": 0.95, + "evidence": ["series novella from tags"], + }, + ), + ], + ) + + metadata = standard_book_metadata( + "Bobiverse Short.aax", + {"title": "Bobiverse Short", "artist": "Dennis E Taylor"}, + audiobook_engine, + tmp_path / "agent.jsonl", + "test-key", + config=metadata_agent.AgentConfig(), + ) + + assert metadata.series_index == 1.5 + assert metadata.needs_review is False + with Session(audiobook_engine) as session: + book = session.get(Audiobook, 1) + assert book.series_index == 1.5 + + +def test_standard_book_metadata_reuses_series_with_only_underscore_difference( + tmp_path, + monkeypatch, + audiobook_engine, +) -> None: + with Session(audiobook_engine) as session: + session.add(AudiobookSeries(id=5, name="starships", author_id=1)) + session.commit() + install_fake_ollama( + monkeypatch, + [ + tool_response("ensure_series", {"name": "starship_s", "author_id": 1}), + final_response( + { + "author_id": 1, + "book_id": None, + "title": "starships-short", + "series_id": 5, + "series_index": 1, + "confidence": 0.95, + "evidence": ["reused existing series with equivalent slug"], + }, + ), + ], + ) + + metadata = standard_book_metadata( + "Starship S Short.aax", + {"title": "Starship S Short", "artist": "Glynn Stewart"}, + audiobook_engine, + tmp_path / "agent.jsonl", + "test-key", + config=metadata_agent.AgentConfig(), + ) + + assert metadata.series == "starships" + with Session(audiobook_engine) as session: + series_names = session.scalars( + select(AudiobookSeries.name).where(AudiobookSeries.author_id == 1).order_by(AudiobookSeries.name), + ).all() + assert "starship_s" not in series_names + assert 
series_names == ["black_fleet_trilogy", "starships", "starships_mage"] + + def test_standard_book_metadata_normalizes_noisy_created_catalog_rows( tmp_path, monkeypatch, @@ -888,6 +1026,19 @@ def test_richie_exports_audiobook_models() -> None: assert Audiobook.__tablename__ == "audiobook" +def test_audiobook_title_author_series_is_unique(audiobook_engine) -> None: + with Session(audiobook_engine) as session: + session.add_all( + [ + Audiobook(title="duplicate-title", author_id=1, series_id=1, series_index=1), + Audiobook(title="duplicate-title", author_id=1, series_id=1, series_index=2), + ], + ) + + with pytest.raises(IntegrityError): + session.commit() + + def test_main_dry_run_prints_outputs_without_converting(tmp_path, monkeypatch, capsys) -> None: input_directory = tmp_path / "raw" output_directory = tmp_path / "audiobooks"