"""Build and run the whisper transcription docker container on demand. The container is started fresh for each invocation and removed on exit (``docker run --rm``). The model is cached in a named docker volume so only the first run pays the download cost. """ from __future__ import annotations import logging import subprocess from pathlib import Path from typing import Annotated import typer from python.common import configure_logger logger = logging.getLogger(__name__) class Config: """Paths and names for the whisper-transcribe Docker workflow.""" image_tag = "whisper-transcribe:latest" model_volume = "whisper-models" repo_root = Path(__file__).resolve().parents[3] dockerfile = Path(__file__).resolve().parent / "Dockerfile" huggingface_cache = "/root/.cache/huggingface" def run_docker(arguments: list[str]) -> None: """Run a docker subcommand, streaming output and raising on failure. Args: arguments: Arguments to pass to the ``docker`` binary. Raises: subprocess.CalledProcessError: If docker exits non-zero. """ logger.info("docker %s", " ".join(arguments)) subprocess.run(["docker", *arguments], check=True) def build_image() -> None: """Build the whisper-transcribe image using the repo root as build context.""" logger.info("Building image %s", Config.image_tag) run_docker( [ "build", "--tag", Config.image_tag, "--file", str(Config.dockerfile), str(Config.repo_root), ], ) def model_cache_present(model: str) -> bool: """Check whether the given model is already downloaded in the cache volume. Args: model: faster-whisper model name (e.g. ``large-v3``). Returns: True if the HuggingFace cache directory for the model exists in the volume. """ cache_directory = f"hub/models--Systran--faster-whisper-{model}" completed = subprocess.run( [ "docker", "run", "--rm", "--volume", f"{Config.model_volume}:/cache", "alpine", "test", "-d", f"/cache/{cache_directory}", ], check=False, ) return completed.returncode == 0 def download_model(model: str) -> None: """Download the model into the cache volume and exit. Args: model: faster-whisper model name. """ logger.info("Downloading model %s into volume %s", model, Config.model_volume) run_docker( [ "run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", f"{Config.model_volume}:{Config.huggingface_cache}", Config.image_tag, "--model", model, "--download-only", ], ) def transcribe(input_directory: Path, output_directory: Path, model: str) -> None: """Run transcription on every audio file under ``input_directory``. Args: input_directory: Host path containing audio files (mounted read-only). output_directory: Host path for ``.txt`` transcripts. model: faster-whisper model name. """ logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model) run_docker( [ "run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", f"{input_directory}:/audio:ro", "--volume", f"{output_directory}:/output", "--volume", f"{Config.model_volume}:{Config.huggingface_cache}", Config.image_tag, "--model", model, ], ) def main( input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")], output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")], model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3", *, force_download: Annotated[ bool, typer.Option("--force-download", help="Re-download the model even if already cached."), ] = False, ) -> None: """Build the image, ensure the model is cached, then transcribe and stop.""" configure_logger() resolved_input = input_directory.resolve(strict=True) output_directory.mkdir(parents=True, exist_ok=True) resolved_output = output_directory.resolve() build_image() if force_download or not model_cache_present(model): download_model(model) else: logger.info("Model %s already cached in volume %s", model, Config.model_volume) transcribe(resolved_input, resolved_output, model) logger.info("Done. Container stopped.") if __name__ == "__main__": typer.run(main)