setting up whisper transcriber

2026-04-18 14:08:23 -04:00
parent dfe5997e0b
commit 7db063a240
7 changed files with 327 additions and 3 deletions
@@ -0,0 +1,167 @@
+"""Build and run the whisper transcription docker container on demand.
+
+The container is started fresh for each invocation and removed on exit
+(``docker run --rm``). The model is cached in a named docker volume so
+only the first run pays the download cost.
+"""
+
+from __future__ import annotations
+
+import logging
+import shlex
+import subprocess
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from python.common import configure_logger
+
+# Module-level logger, named after this module (``__name__``).
+logger = logging.getLogger(__name__)
+
+
+class Config:
+    """Static settings shared by every step of the whisper-transcribe Docker workflow."""
+
+    # Container-side names.
+    # Tag given to the image at build time and used for every ``docker run``.
+    image_tag = "whisper-transcribe:latest"
+    # Named docker volume that holds downloaded models.
+    model_volume = "whisper-models"
+    # Path inside the container where the model volume is mounted.
+    huggingface_cache = "/root/.cache/huggingface"
+
+    # Host-side paths, derived from this file's resolved location.
+    # Dockerfile that sits next to this module.
+    dockerfile = Path(__file__).resolve().with_name("Dockerfile")
+    # Three levels above this module's directory; passed to docker as the build context.
+    repo_root = Path(__file__).resolve().parents[3]
+
+
+def run_docker(arguments: list[str]) -> None:
+    """Run a docker subcommand, streaming output and raising on failure.
+
+    The full command is logged shell-quoted before it runs, so arguments
+    containing spaces (for example host paths inside ``--volume`` specs)
+    stay unambiguous and the logged line can be pasted into a shell as-is.
+
+    Args:
+        arguments: Arguments to pass to the ``docker`` binary.
+
+    Raises:
+        subprocess.CalledProcessError: If docker exits non-zero.
+        FileNotFoundError: If the ``docker`` binary cannot be found.
+    """
+    command = ["docker", *arguments]
+    logger.info("%s", shlex.join(command))
+    # No stdout/stderr redirection: docker inherits this process's streams.
+    subprocess.run(command, check=True)
+
+
+def build_image() -> None:
+    """Build the image tagged ``Config.image_tag`` from ``Config.dockerfile``.
+
+    ``Config.repo_root`` is handed to docker as the build context.
+
+    Raises:
+        subprocess.CalledProcessError: If ``docker build`` exits non-zero.
+    """
+    tag = Config.image_tag
+    logger.info("Building image %s", tag)
+
+    dockerfile_path = str(Config.dockerfile)
+    build_context = str(Config.repo_root)
+    run_docker(["build", "--tag", tag, "--file", dockerfile_path, build_context])
+
+
+def model_cache_present(model: str) -> bool:
+    """Report whether the cache volume already holds a directory for ``model``.
+
+    A throwaway ``alpine`` container mounts ``Config.model_volume`` at
+    ``/cache`` and runs ``test -d`` on the expected HuggingFace hub
+    directory for the model.
+
+    Args:
+        model: faster-whisper model name (e.g. ``large-v3``).
+
+    Returns:
+        True if the probe container exits with status 0, False for any
+        other exit status.
+    """
+    volume_spec = f"{Config.model_volume}:/cache"
+    probe_target = f"/cache/hub/models--Systran--faster-whisper-{model}"
+    probe_command = ["docker", "run", "--rm", "--volume", volume_spec, "alpine", "test", "-d", probe_target]
+
+    # check=False: a non-zero exit is the expected "not cached" answer.
+    # NOTE(review): a failure of docker itself also yields a non-zero exit
+    # and is therefore reported as "not cached" too.
+    return subprocess.run(probe_command, check=False).returncode == 0
+
+
+def download_model(model: str) -> None:
+    """Populate the cache volume with ``model`` using a one-off container run.
+
+    The image is started with ``--model <model> --download-only`` and the
+    model volume mounted at ``Config.huggingface_cache``.
+
+    Args:
+        model: faster-whisper model name.
+
+    Raises:
+        subprocess.CalledProcessError: If the container exits non-zero.
+    """
+    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
+
+    cache_mount = f"{Config.model_volume}:{Config.huggingface_cache}"
+    docker_options = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", cache_mount]
+    container_arguments = ["--model", model, "--download-only"]
+    run_docker([*docker_options, Config.image_tag, *container_arguments])
+
+
+def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
+    """Start the transcription container against ``input_directory``.
+
+    Three volumes are mounted: the input at ``/audio`` (read-only), the
+    output at ``/output``, and the model volume at
+    ``Config.huggingface_cache``.
+
+    Args:
+        input_directory: Host path containing audio files (mounted read-only).
+        output_directory: Host path for ``.txt`` transcripts.
+        model: faster-whisper model name.
+
+    Raises:
+        subprocess.CalledProcessError: If the container exits non-zero.
+    """
+    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
+
+    mounts = (
+        f"{input_directory}:/audio:ro",
+        f"{output_directory}:/output",
+        f"{Config.model_volume}:{Config.huggingface_cache}",
+    )
+    docker_arguments = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
+    for mount in mounts:
+        docker_arguments.extend(("--volume", mount))
+    docker_arguments.extend((Config.image_tag, "--model", model))
+
+    run_docker(docker_arguments)
+
+
+def main(
+    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
+    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
+    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
+    *,
+    force_download: Annotated[
+        bool,
+        typer.Option("--force-download", help="Re-download the model even if already cached."),
+    ] = False,
+) -> None:
+    """Build the image, ensure the model is cached, then transcribe and stop.
+
+    Args:
+        input_directory: Existing directory of audio files to transcribe.
+        output_directory: Directory for transcripts; created if missing.
+        model: faster-whisper model name.
+        force_download: Download the model even when the cache check succeeds.
+
+    Raises:
+        FileNotFoundError: If ``input_directory`` does not exist.
+        typer.BadParameter: If ``input_directory`` exists but is not a directory.
+        subprocess.CalledProcessError: If any docker step exits non-zero.
+    """
+    configure_logger()
+
+    resolved_input = input_directory.resolve(strict=True)
+    # Reject a regular file before the (slow) image build: docker would
+    # otherwise bind-mount the file itself at /audio.
+    if not resolved_input.is_dir():
+        message = f"{resolved_input} is not a directory"
+        raise typer.BadParameter(message, param_hint="INPUT_DIRECTORY")
+
+    output_directory.mkdir(parents=True, exist_ok=True)
+    resolved_output = output_directory.resolve()
+
+    build_image()
+
+    if force_download or not model_cache_present(model):
+        download_model(model)
+    else:
+        logger.info("Model %s already cached in volume %s", model, Config.model_volume)
+
+    transcribe(resolved_input, resolved_output, model)
+    logger.info("Done. Container stopped.")
+
+
+# Script entry point: typer.run builds the command-line interface from main's signature.
+if __name__ == "__main__":
+    typer.run(main)