diff --git a/overlays/default.nix b/overlays/default.nix index 1bcdcd0..3c94185 100644 --- a/overlays/default.nix +++ b/overlays/default.nix @@ -23,6 +23,7 @@ apscheduler fastapi fastapi-cli + faster-whisper httpx mypy orjson diff --git a/pyproject.toml b/pyproject.toml index 17d17fb..e64b39d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ [project.scripts] database = "python.database_cli:app" van-inventory = "python.van_inventory.main:serve" +whisper-transcribe = "python.tools.whisper.transcribe:main" [dependency-groups] dev = [ @@ -50,6 +51,7 @@ lint.ignore = [ "COM812", # (TEMP) conflicts when used with the formatter "ISC001", # (TEMP) conflicts when used with the formatter "S603", # (PERM) This is known to cause a false positive + "S607", # (PERM) This is becoming a consistent annoyance ] [tool.ruff.lint.per-file-ignores] @@ -78,9 +80,7 @@ lint.ignore = [ "python/congress_tracker/**" = [ "TC003", # (perm) this creates issues because sqlalchemy uses these at runtime ] -"python/eval_warnings/**" = [ - "S607", # (perm) gh and git are expected on PATH in the runner environment -] + "python/alembic/**" = [ "INP001", # (perm) this creates LSP issues for alembic ] diff --git a/python/tools/whisper/Dockerfile b/python/tools/whisper/Dockerfile new file mode 100644 index 0000000..88db484 --- /dev/null +++ b/python/tools/whisper/Dockerfile @@ -0,0 +1,17 @@ +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --no-cache-dir --upgrade pip \ + && pip3 install --no-cache-dir faster-whisper requests + +WORKDIR /app +COPY python/tools/whisper/inference.py /app/inference.py + +ENTRYPOINT ["python3", "/app/inference.py"] diff --git a/python/tools/whisper/Dockerfile.dockerignore 
logger = logging.getLogger(__name__)

# File suffixes (compared lower-cased) that are treated as transcribable audio.
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
# Width of one transcript bucket; every segment starting inside it shares a line.
BUCKET_SECONDS = 60
BEAM_SIZE = 5
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60
# Appended to the transcript name while it is being written, so an interrupted
# run never leaves a truncated ``.txt`` that later runs would skip as finished.
PARTIAL_SUFFIX = ".partial"


def format_timestamp(total_seconds: float) -> str:
    """Render a whole-minute timestamp as ``HH:MM:00``.

    Args:
        total_seconds: Offset in seconds from the start of the audio.

    Returns:
        A zero-padded ``HH:MM:00`` string; the seconds field is always ``00``.
    """
    hours = int(total_seconds // SECONDS_PER_HOUR)
    minutes = int((total_seconds % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE)
    return f"{hours:02d}:{minutes:02d}:00"


def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe one audio file and write the bucketed transcript to disk.

    Segments are grouped by the minute in which they start. Segments whose text
    is empty after stripping are dropped so they cannot produce doubled spaces
    or an empty ``[HH:MM:00]`` line. The transcript is written to a sibling
    ``.partial`` file and then renamed over ``output_path``.

    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path; its parent directory must exist.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(
        str(audio_path),
        language="en",
        beam_size=BEAM_SIZE,
        vad_filter=True,
    )
    logger.info("Duration %.1fs", info.duration)

    buckets: dict[int, list[str]] = {}
    for segment in segments:
        text = segment.text.strip()
        if not text:
            continue
        bucket = int(segment.start // BUCKET_SECONDS)
        buckets.setdefault(bucket, []).append(text)

    lines = [f"[{format_timestamp(bucket * BUCKET_SECONDS)}] {' '.join(buckets[bucket])}" for bucket in sorted(buckets)]

    # Write-then-rename: ``main`` skips any transcript that already exists, so the
    # final name must only ever appear once the content is complete.
    partial_path = output_path.with_name(output_path.name + PARTIAL_SUFFIX)
    partial_path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
    partial_path.replace(output_path)
    logger.info("Wrote %s", output_path)


def find_audio_files(input_directory: Path) -> list[Path]:
    """Collect every audio file under ``input_directory``.

    Args:
        input_directory: Directory to walk recursively.

    Returns:
        Sorted list of paths whose lower-cased suffix is in ``AUDIO_EXTENSIONS``.
    """
    return sorted(
        path for path in input_directory.rglob("*") if path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS
    )


def plan_output_paths(
    audio_files: list[Path],
    input_directory: Path,
    output_directory: Path,
) -> list[tuple[Path, Path]]:
    """Pair every audio file with a transcript path that no other input uses.

    The transcript normally mirrors the input's relative path with a ``.txt``
    suffix. When two inputs differ only by extension (``talk.mp3`` and
    ``talk.wav``) the later one keeps its full name (``talk.wav.txt``) instead
    of being mistaken for an already-transcribed file.

    Args:
        audio_files: Audio files located under ``input_directory``, in processing order.
        input_directory: Root the relative transcript layout is computed against.
        output_directory: Root the transcripts are written under.

    Returns:
        ``(audio_path, output_path)`` pairs in the order of ``audio_files``.
    """
    claimed: set[Path] = set()
    plan: list[tuple[Path, Path]] = []
    for audio_path in audio_files:
        relative = audio_path.relative_to(input_directory)
        preferred = relative.with_suffix(".txt")
        fallback = relative.with_name(relative.name + ".txt")
        target = next((candidate for candidate in (preferred, fallback) if candidate not in claimed), None)
        if target is None:
            logger.warning("Skip %s (no unused transcript name available)", relative)
            continue
        if target != preferred:
            logger.warning("%s shares a stem with another input; writing %s", relative, target)
        claimed.add(target)
        plan.append((audio_path, output_directory / target))
    return plan


def configure_container_logger() -> None:
    """Configure root logging for the container at INFO level with timestamps."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )


def parse_arguments() -> argparse.Namespace:
    """Parse CLI arguments for the container entrypoint.

    Returns:
        Namespace with ``input``, ``output``, ``model`` and ``download_only``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", type=Path, default=Path("/audio"))
    parser.add_argument("--output", type=Path, default=Path("/output"))
    parser.add_argument("--model", default="large-v3")
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download the model into the cache volume and exit without transcribing.",
    )
    return parser.parse_args()


def main() -> None:
    """Load the model, then either exit (download-only) or transcribe the directory.

    Raises:
        SystemExit: With status 1 when transcription is requested but the input
            directory does not exist.
    """
    configure_container_logger()
    arguments = parse_arguments()

    # Fail before the expensive model load; a missing directory would otherwise
    # be reported as "No audio files found" and exit successfully.
    if not arguments.download_only and not arguments.input.is_dir():
        logger.error("Input directory %s does not exist", arguments.input)
        raise SystemExit(1)

    logger.info("Loading model %s on CUDA", arguments.model)
    model = WhisperModel(arguments.model, device="cuda", compute_type="float16")

    if arguments.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return

    arguments.output.mkdir(parents=True, exist_ok=True)

    audio_files = find_audio_files(arguments.input)
    if not audio_files:
        logger.warning("No audio files found in %s", arguments.input)
        return

    logger.info("Found %d audio file(s)", len(audio_files))
    for audio_path, output_path in plan_output_paths(audio_files, arguments.input, arguments.output):
        if output_path.exists():
            logger.info("Skip %s (already transcribed)", audio_path.relative_to(arguments.input))
            continue
        output_path.parent.mkdir(parents=True, exist_ok=True)
        transcribe_file(model, audio_path, output_path)


if __name__ == "__main__":
    main()
logger = logging.getLogger(__name__)


class Config:
    """Paths and names for the whisper-transcribe Docker workflow."""

    # Tag given to the locally built image and used for every ``docker run``.
    image_tag = "whisper-transcribe:latest"
    # Named docker volume that persists the downloaded model between runs.
    model_volume = "whisper-models"
    # Build context: three directories above this module's directory.
    repo_root = Path(__file__).resolve().parents[3]
    dockerfile = Path(__file__).resolve().parent / "Dockerfile"
    # Mount point of ``model_volume`` inside the whisper container.
    huggingface_cache = "/root/.cache/huggingface"


def run_docker(arguments: list[str]) -> None:
    """Run a docker subcommand, streaming output and raising on failure.

    Args:
        arguments: Arguments to pass to the ``docker`` binary.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    logger.info("docker %s", " ".join(arguments))
    subprocess.run(["docker", *arguments], check=True)


def build_image() -> None:
    """Build the whisper-transcribe image using the repo root as build context."""
    logger.info("Building image %s", Config.image_tag)
    run_docker(
        [
            "build",
            "--tag",
            Config.image_tag,
            "--file",
            str(Config.dockerfile),
            str(Config.repo_root),
        ],
    )


def model_cache_directory(model: str) -> str:
    """Return the cache directory of ``model`` relative to the cache volume root.

    A bare name such as ``large-v3`` is looked up under
    ``Systran/faster-whisper-<name>``. A value containing ``/`` is treated as a
    full ``org/name`` repository id, which the original hard-coded prefix turned
    into a path that could never exist.

    NOTE(review): presumably faster-whisper maps some short names to
    repositories outside ``Systran/faster-whisper-<name>``; such names are
    reported as not cached, which only triggers an extra download step.
    Verify against the installed faster-whisper version.

    Args:
        model: faster-whisper model name or ``org/name`` repository id.

    Returns:
        Path of the form ``hub/models--<org>--<name>``.
    """
    repository = model if "/" in model else f"Systran/faster-whisper-{model}"
    return "hub/models--" + repository.replace("/", "--")


def model_cache_present(model: str) -> bool:
    """Check whether the given model is already downloaded in the cache volume.

    Runs ``test -d`` in a throwaway ``alpine`` container with the model volume
    mounted at ``/cache``. Any non-zero exit, including a docker failure, is
    treated as "not cached".

    Args:
        model: faster-whisper model name (e.g. ``large-v3``).

    Returns:
        True if the HuggingFace cache directory for the model exists in the volume.
    """
    completed = subprocess.run(
        [
            "docker",
            "run",
            "--rm",
            "--volume",
            f"{Config.model_volume}:/cache",
            "alpine",
            "test",
            "-d",
            f"/cache/{model_cache_directory(model)}",
        ],
        check=False,
    )
    return completed.returncode == 0


def container_run_arguments(*volumes: str) -> list[str]:
    """Build the ``docker run`` arguments shared by every whisper container start.

    Args:
        *volumes: Extra ``source:target[:mode]`` volume specs, mounted before
            the model cache volume.

    Returns:
        Arguments up to and including the image tag; the caller appends the
        container's own command-line flags.
    """
    arguments = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
    for volume in (*volumes, f"{Config.model_volume}:{Config.huggingface_cache}"):
        arguments += ["--volume", volume]
    arguments.append(Config.image_tag)
    return arguments


def download_model(model: str) -> None:
    """Run the container in download-only mode so the model lands in the cache volume.

    Args:
        model: faster-whisper model name.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
    run_docker([*container_run_arguments(), "--model", model, "--download-only"])


def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
    """Run transcription on every audio file under ``input_directory``.

    Args:
        input_directory: Host path containing audio files (mounted read-only).
        output_directory: Host path for ``.txt`` transcripts.
        model: faster-whisper model name.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
    run_docker(
        [
            *container_run_arguments(f"{input_directory}:/audio:ro", f"{output_directory}:/output"),
            "--model",
            model,
        ],
    )


def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
    *,
    force_download: Annotated[
        bool,
        # The download step only starts the container in download-only mode; nothing
        # here clears the cache first, so the help text does not promise a re-download.
        typer.Option("--force-download", help="Run the model download step even if the model is already cached."),
    ] = False,
) -> None:
    """Build the image, ensure the model is cached, then transcribe and stop.

    Raises:
        FileNotFoundError: If ``input_directory`` does not exist.
        NotADirectoryError: If ``input_directory`` exists but is not a directory.
        subprocess.CalledProcessError: If a docker build or run step fails.
    """
    configure_logger()

    resolved_input = input_directory.resolve(strict=True)
    # Reject a file path up front instead of building the image and starting a
    # container that would then find nothing to transcribe.
    if not resolved_input.is_dir():
        message = f"{resolved_input} is not a directory"
        raise NotADirectoryError(message)
    output_directory.mkdir(parents=True, exist_ok=True)
    resolved_output = output_directory.resolve()

    build_image()

    if force_download or not model_cache_present(model):
        download_model(model)
    else:
        logger.info("Model %s already cached in volume %s", model, Config.model_volume)

    transcribe(resolved_input, resolved_output, model)
    logger.info("Done. Container stopped.")


def cli() -> None:
    """Zero-argument entry point: let Typer parse ``sys.argv`` and call ``main``.

    NOTE(review): ``pyproject.toml`` registers ``whisper-transcribe`` against
    ``main``. Console scripts call their target with no arguments, and ``main``
    has two required parameters, so that entry should point at ``cli`` instead.
    """
    typer.run(main)


if __name__ == "__main__":
    cli()