setting up whisper transcriber

2026-04-18 14:08:23 -04:00
parent dfe5997e0b
commit 7db063a240
7 changed files with 327 additions and 3 deletions
@@ -23,6 +23,7 @@
        apscheduler
        fastapi
        fastapi-cli
+        faster-whisper
        httpx
        mypy
        orjson
@@ -26,6 +26,7 @@ dependencies = [
 [project.scripts]
 database = "python.database_cli:app"
 van-inventory = "python.van_inventory.main:serve"
+whisper-transcribe = "python.tools.whisper.transcribe:main"

 [dependency-groups]
 dev = [
@@ -50,6 +51,7 @@ lint.ignore = [
    "COM812", # (TEMP) conflicts when used with the formatter
    "ISC001", # (TEMP) conflicts when used with the formatter
    "S603",   # (PERM) This is known to cause a false positive
+    "S607",   # (PERM) executables such as docker, gh and git are invoked by name and expected on PATH
 ]

 [tool.ruff.lint.per-file-ignores]
@@ -78,9 +80,7 @@ lint.ignore = [
 "python/congress_tracker/**" = [
    "TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
 ]
-"python/eval_warnings/**" = [
-    "S607", # (perm) gh and git are expected on PATH in the runner environment
-]
+
 "python/alembic/**" = [
    "INP001", # (perm) this creates LSP issues for alembic
 ]
@@ -0,0 +1,17 @@
+# Image for the whisper transcription container; its entrypoint is inference.py.
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the build.
+# PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: no .pyc files, and unbuffered output so
+# log lines reach `docker run` as they are produced.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# System Python, pip and ffmpeg; the apt lists are removed in the same layer to keep it small.
+# NOTE(review): inference.py never invokes ffmpeg itself - confirm the package is required.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Python dependencies (unpinned).
+# NOTE(review): inference.py does not import requests - confirm it is needed.
+RUN pip3 install --no-cache-dir --upgrade pip \
+    && pip3 install --no-cache-dir faster-whisper requests
+
+WORKDIR /app
+# The source path is relative to the build context, so the image must be built with the
+# repository root as context.
+COPY python/tools/whisper/inference.py /app/inference.py
+
+# Exec form: arguments given after the image name on `docker run` are appended to this
+# command and end up in inference.py's argument parser.
+ENTRYPOINT ["python3", "/app/inference.py"]
@@ -0,0 +1,2 @@
+# Ignore everything in the build context by default ...
+*
+# ... except the single file that the whisper Dockerfile copies into the image.
+!python/tools/whisper/inference.py
@@ -0,0 +1 @@
+"""Whisper transcription tools (host orchestrator and container entrypoint)."""
@@ -0,0 +1,136 @@
+"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
+
+Run inside the whisper-transcribe docker image; segment timestamps are grouped
+into one-minute buckets so the output reads as ``[HH:MM:00] text``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+
+from faster_whisper import WhisperModel
+
+logger = logging.getLogger(__name__)
+
+# Lower-case file suffixes that find_audio_files treats as transcribable input.
+AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
+# Width, in seconds, of the buckets that segments are grouped into by their start time.
+BUCKET_SECONDS = 60
+# Passed as ``beam_size`` to ``WhisperModel.transcribe``.
+BEAM_SIZE = 5
+# Conversion factors used by format_timestamp.
+SECONDS_PER_HOUR = 3600
+SECONDS_PER_MINUTE = 60
+
+
+def format_timestamp(total_seconds: float) -> str:
+    """Format an offset into the audio as ``HH:MM:00``, dropping the seconds part.
+
+    Args:
+        total_seconds: Offset in seconds from the start of the audio.
+
+    Returns:
+        Zero-padded hours and minutes followed by a literal ``:00``.
+    """
+    whole_hours, remainder = divmod(total_seconds, SECONDS_PER_HOUR)
+    whole_minutes = remainder // SECONDS_PER_MINUTE
+    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:00"
+
+
+def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
+    """Transcribe one audio file and write the bucketed transcript to disk.
+
+    Each segment is filed under the minute in which it starts. Every minute that
+    received at least one segment becomes one ``[HH:MM:00] text`` paragraph, and
+    paragraphs are separated by a blank line.
+
+    The text is written to a sibling ``<name>.part`` file and then renamed onto
+    ``output_path``. ``main`` skips any audio file whose transcript already exists,
+    so a run interrupted mid-write must never leave a truncated file at the final
+    path; with the rename, ``output_path`` only ever appears with complete content.
+
+    Args:
+        model: Loaded faster-whisper model.
+        audio_path: Source audio file.
+        output_path: Destination ``.txt`` path.
+    """
+    logger.info("Transcribing %s", audio_path)
+    segments, info = model.transcribe(
+        str(audio_path),
+        language="en",
+        beam_size=BEAM_SIZE,
+        vad_filter=True,
+    )
+    logger.info("Duration %.1fs", info.duration)
+
+    # Group segment texts by the index of the bucket their start time falls into.
+    buckets: dict[int, list[str]] = {}
+    for segment in segments:
+        bucket = int(segment.start // BUCKET_SECONDS)
+        buckets.setdefault(bucket, []).append(segment.text.strip())
+
+    lines = [f"[{format_timestamp(bucket * BUCKET_SECONDS)}] {' '.join(texts)}" for bucket, texts in sorted(buckets.items())]
+
+    # Write next to the destination, then rename, so the final path is never partial.
+    partial_path = output_path.with_name(output_path.name + ".part")
+    partial_path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
+    partial_path.replace(output_path)
+    logger.info("Wrote %s", output_path)
+
+
+def find_audio_files(input_directory: Path) -> list[Path]:
+    """Return every audio file found anywhere below ``input_directory``.
+
+    A path counts as audio when it is a regular file whose lower-cased suffix is
+    listed in ``AUDIO_EXTENSIONS``.
+
+    Args:
+        input_directory: Directory to walk recursively.
+
+    Returns:
+        The matching paths in sorted order.
+    """
+    matches: list[Path] = []
+    for candidate in input_directory.rglob("*"):
+        if not candidate.is_file():
+            continue
+        if candidate.suffix.lower() in AUDIO_EXTENSIONS:
+            matches.append(candidate)
+    matches.sort()
+    return matches
+
+
+def configure_container_logger() -> None:
+    """Configure root logging for the container at INFO level.
+
+    Neither ``stream`` nor ``filename`` is passed, so ``logging.basicConfig`` installs
+    its default ``StreamHandler``, which writes to ``sys.stderr`` (not stdout). Each
+    record is rendered as ``<time> <level> <message>``.
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(message)s",
+    )
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Build the container's command-line parser and parse ``sys.argv``.
+
+    Recognised options are ``--input`` and ``--output`` (paths, defaulting to
+    ``/audio`` and ``/output``), ``--model`` (default ``large-v3``) and the
+    ``--download-only`` flag.
+
+    Returns:
+        Parsed argparse namespace.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+
+    # The two directory options differ only in name and default.
+    for flag, default_path in (("--input", Path("/audio")), ("--output", Path("/output"))):
+        parser.add_argument(flag, type=Path, default=default_path)
+
+    parser.add_argument("--model", default="large-v3")
+    parser.add_argument(
+        "--download-only",
+        action="store_true",
+        help="Download the model into the cache volume and exit without transcribing.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Load the model, then stop (download-only) or transcribe the input tree.
+
+    Transcripts mirror the input directory layout under the output directory, with
+    each audio file's suffix replaced by ``.txt``. Files whose transcript already
+    exists are skipped.
+    """
+    configure_container_logger()
+    options = parse_arguments()
+    source_root, target_root = options.input, options.output
+
+    logger.info("Loading model %s on CUDA", options.model)
+    model = WhisperModel(options.model, device="cuda", compute_type="float16")
+    if options.download_only:
+        # Constructing the model above is all that download-only mode needs.
+        logger.info("Model ready; exiting (download-only mode)")
+        return
+
+    target_root.mkdir(parents=True, exist_ok=True)
+
+    pending = find_audio_files(source_root)
+    if not pending:
+        logger.warning("No audio files found in %s", source_root)
+        return
+    logger.info("Found %d audio file(s)", len(pending))
+
+    for audio_path in pending:
+        relative = audio_path.relative_to(source_root)
+        transcript_path = (target_root / relative).with_suffix(".txt")
+        transcript_path.parent.mkdir(parents=True, exist_ok=True)
+        if not transcript_path.exists():
+            transcribe_file(model, audio_path, transcript_path)
+        else:
+            logger.info("Skip %s (already transcribed)", relative)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,167 @@
+"""Build and run the whisper transcription docker container on demand.
+
+The container is started fresh for each invocation and removed on exit
+(``docker run --rm``). The model is cached in a named docker volume so
+only the first run pays the download cost.
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from python.common import configure_logger
+
+logger = logging.getLogger(__name__)
+
+
+class Config:
+    """Static names and paths shared by every docker invocation in this module."""
+
+    # Tag of the image built from ``dockerfile``, and the named volume holding the model cache.
+    image_tag = "whisper-transcribe:latest"
+    model_volume = "whisper-models"
+    # Path at which ``model_volume`` is mounted inside the whisper container.
+    huggingface_cache = "/root/.cache/huggingface"
+    # Dockerfile that sits next to this module.
+    dockerfile = Path(__file__).resolve().with_name("Dockerfile")
+    # Three levels above this module's directory; passed to ``docker build`` as the context.
+    repo_root = Path(__file__).resolve().parents[3]
+
+
+def run_docker(arguments: list[str]) -> None:
+    """Invoke ``docker`` with the given arguments and fail loudly on a non-zero exit.
+
+    The child's output is not captured, so it goes wherever this process's own
+    output goes.
+
+    Args:
+        arguments: Arguments to pass to the ``docker`` binary.
+
+    Raises:
+        subprocess.CalledProcessError: If docker exits non-zero.
+    """
+    command = ["docker", *arguments]
+    logger.info("docker %s", " ".join(arguments))
+    subprocess.run(command, check=True)
+
+
+def build_image() -> None:
+    """Run ``docker build`` for the whisper image with the repository root as context."""
+    logger.info("Building image %s", Config.image_tag)
+
+    build_arguments = ["build"]
+    build_arguments += ["--tag", Config.image_tag]
+    build_arguments += ["--file", str(Config.dockerfile)]
+    # The final positional argument is the build context.
+    build_arguments.append(str(Config.repo_root))
+    run_docker(build_arguments)
+
+
+def model_cache_present(model: str) -> bool:
+    """Check whether the given model is already downloaded in the cache volume.
+
+    The check runs ``test -d`` in a throwaway ``alpine`` container that has the model
+    volume mounted at ``/cache``, looking for the Hugging Face hub directory
+    ``hub/models--<owner>--<name>``.
+
+    A ``model`` containing ``/`` is treated as a full repository ID (``owner/name``).
+    Any other value is assumed to live at ``Systran/faster-whisper-<model>``. Short
+    names published elsewhere are reported as missing; ``main`` then only runs an
+    extra ``download_model`` step.
+
+    Args:
+        model: faster-whisper model name (e.g. ``large-v3``) or Hugging Face repository ID.
+
+    Returns:
+        True if the HuggingFace cache directory for the model exists in the volume.
+        Any docker failure (non-zero exit) is also reported as False.
+    """
+    repository = model if "/" in model else f"Systran/faster-whisper-{model}"
+    # The hub cache stores each repository under ``models--`` with ``/`` replaced by ``--``.
+    cache_directory = "hub/models--" + repository.replace("/", "--")
+    completed = subprocess.run(
+        [
+            "docker",
+            "run",
+            "--rm",
+            "--volume",
+            f"{Config.model_volume}:/cache",
+            "alpine",
+            "test",
+            "-d",
+            f"/cache/{cache_directory}",
+        ],
+        check=False,  # a non-zero exit is the expected "not cached" answer
+    )
+    return completed.returncode == 0
+
+
+def download_model(model: str) -> None:
+    """Run the whisper container in ``--download-only`` mode to fill the cache volume.
+
+    Args:
+        model: faster-whisper model name.
+    """
+    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
+
+    cache_mount = f"{Config.model_volume}:{Config.huggingface_cache}"
+    docker_options = ["--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", cache_mount]
+    # Everything after the image tag is handed to the container's entrypoint.
+    entrypoint_arguments = ["--model", model, "--download-only"]
+    run_docker(["run", *docker_options, Config.image_tag, *entrypoint_arguments])
+
+
+def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
+    """Start the whisper container against the given host directories.
+
+    Args:
+        input_directory: Host path containing audio files (mounted read-only at ``/audio``).
+        output_directory: Host path for ``.txt`` transcripts (mounted at ``/output``).
+        model: faster-whisper model name.
+    """
+    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
+
+    mounts = (
+        f"{input_directory}:/audio:ro",
+        f"{output_directory}:/output",
+        f"{Config.model_volume}:{Config.huggingface_cache}",
+    )
+    docker_options = ["--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
+    for mount in mounts:
+        docker_options += ["--volume", mount]
+
+    # Everything after the image tag is handed to the container's entrypoint.
+    run_docker(["run", *docker_options, Config.image_tag, "--model", model])
+
+
+def transcribe_command(
+    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
+    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
+    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
+    *,
+    force_download: Annotated[
+        bool,
+        typer.Option("--force-download", help="Re-download the model even if already cached."),
+    ] = False,
+) -> None:
+    """Build the image, ensure the model is cached, then transcribe and stop."""
+    configure_logger()
+
+    # strict=True raises FileNotFoundError when the input directory does not exist.
+    resolved_input = input_directory.resolve(strict=True)
+    output_directory.mkdir(parents=True, exist_ok=True)
+    resolved_output = output_directory.resolve()
+
+    build_image()
+
+    if force_download or not model_cache_present(model):
+        download_model(model)
+    else:
+        logger.info("Model %s already cached in volume %s", model, Config.model_volume)
+
+    transcribe(resolved_input, resolved_output, model)
+    logger.info("Done. Container stopped.")
+
+
+def main(
+    input_directory: Path | None = None,
+    output_directory: Path | None = None,
+    model: str = "large-v3",
+    *,
+    force_download: bool = False,
+) -> None:
+    """Entry point for the ``whisper-transcribe`` console script and for direct calls.
+
+    A console script calls its target with no arguments, so this function cannot
+    have required parameters. Called without both directories, it lets Typer parse
+    ``sys.argv`` and dispatch to ``transcribe_command``. Called with both directories
+    (the previous calling convention), it runs the workflow directly.
+
+    Args:
+        input_directory: Directory of audio files, or None to read the command line.
+        output_directory: Directory for transcripts, or None to read the command line.
+        model: faster-whisper model name (only used when both directories are given).
+        force_download: Re-download the model even if cached (only used when both
+            directories are given).
+    """
+    if input_directory is None or output_directory is None:
+        typer.run(transcribe_command)
+        return
+    transcribe_command(input_directory, output_directory, model, force_download=force_download)
+
+
+if __name__ == "__main__":
+    main()
				`@@ -0,0 +1 @@`
				`"""Whisper transcription tools (host orchestrator and container entrypoint)."""`