mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-19 13:49:09 -04:00
setting up whisper transcriber
This commit is contained in:
@@ -23,6 +23,7 @@
|
|||||||
apscheduler
|
apscheduler
|
||||||
fastapi
|
fastapi
|
||||||
fastapi-cli
|
fastapi-cli
|
||||||
|
faster-whisper
|
||||||
httpx
|
httpx
|
||||||
mypy
|
mypy
|
||||||
orjson
|
orjson
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ dependencies = [
|
|||||||
[project.scripts]
|
[project.scripts]
|
||||||
database = "python.database_cli:app"
|
database = "python.database_cli:app"
|
||||||
van-inventory = "python.van_inventory.main:serve"
|
van-inventory = "python.van_inventory.main:serve"
|
||||||
|
whisper-transcribe = "python.tools.whisper.transcribe:main"
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
@@ -50,6 +51,7 @@ lint.ignore = [
|
|||||||
"COM812", # (TEMP) conflicts when used with the formatter
|
"COM812", # (TEMP) conflicts when used with the formatter
|
||||||
"ISC001", # (TEMP) conflicts when used with the formatter
|
"ISC001", # (TEMP) conflicts when used with the formatter
|
||||||
"S603", # (PERM) This is known to cause a false positive
|
"S603", # (PERM) This is known to cause a false positive
|
||||||
|
"S607", # (PERM) This is becoming a consistent annoyance
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
[tool.ruff.lint.per-file-ignores]
|
||||||
@@ -78,9 +80,7 @@ lint.ignore = [
|
|||||||
"python/congress_tracker/**" = [
|
"python/congress_tracker/**" = [
|
||||||
"TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
|
"TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
|
||||||
]
|
]
|
||||||
"python/eval_warnings/**" = [
|
|
||||||
"S607", # (perm) gh and git are expected on PATH in the runner environment
|
|
||||||
]
|
|
||||||
"python/alembic/**" = [
|
"python/alembic/**" = [
|
||||||
"INP001", # (perm) this creates LSP issues for alembic
|
"INP001", # (perm) this creates LSP issues for alembic
|
||||||
]
|
]
|
||||||
|
|||||||
17
python/tools/whisper/Dockerfile
Normal file
17
python/tools/whisper/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# GPU base image: CUDA 12.4.1 with the cuDNN runtime libraries on Ubuntu 22.04.
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

# Keep apt non-interactive, skip .pyc files, and leave Python output unbuffered
# so log lines reach `docker run` as they are produced.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Interpreter, pip and ffmpeg; the apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies of the entrypoint script.
RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir faster-whisper requests

WORKDIR /app
# Source path is relative to the build context, not to this Dockerfile.
COPY python/tools/whisper/inference.py /app/inference.py

# Arguments given after the image name are passed on to inference.py.
ENTRYPOINT ["python3", "/app/inference.py"]
|
||||||
2
python/tools/whisper/Dockerfile.dockerignore
Normal file
2
python/tools/whisper/Dockerfile.dockerignore
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
*
|
||||||
|
!python/tools/whisper/inference.py
|
||||||
1
python/tools/whisper/__init__.py
Normal file
1
python/tools/whisper/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Whisper transcription tools (host orchestrator and container entrypoint)."""
|
||||||
136
python/tools/whisper/inference.py
Normal file
136
python/tools/whisper/inference.py
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
|
||||||
|
|
||||||
|
Run inside the whisper-transcribe docker image; segment timestamps are grouped
|
||||||
|
into one-minute buckets so the output reads as ``[HH:MM:00] text``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lower-case file suffixes (leading dot included) that are picked up for
# transcription. The set also lists video containers, not only audio formats.
AUDIO_EXTENSIONS = {
    ".aac",
    ".flac",
    ".m4a",
    ".mkv",
    ".mp3",
    ".mp4",
    ".ogg",
    ".opus",
    ".wav",
    ".webm",
}
# Width, in seconds, of one transcript bucket.
BUCKET_SECONDS = 60
# Beam size handed to the model's ``transcribe`` call.
BEAM_SIZE = 5
# Unit conversions used when rendering timestamps.
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60
|
||||||
|
|
||||||
|
|
||||||
|
def format_timestamp(total_seconds: float) -> str:
    """Format an offset as ``HH:MM:00``, discarding any leftover seconds.

    Args:
        total_seconds: Offset in seconds from the start of the audio.

    Returns:
        Zero-padded hours and minutes followed by a literal ``00`` seconds field.
    """
    # divmod yields the same floor-quotient and remainder as ``//`` and ``%``.
    whole_hours, leftover = divmod(total_seconds, SECONDS_PER_HOUR)
    whole_minutes = leftover // SECONDS_PER_MINUTE
    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:00"
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe one audio file and write the bucketed transcript to disk.

    Segments are grouped by the ``BUCKET_SECONDS``-wide window their start time
    falls in. Each window that received at least one segment becomes one
    ``[HH:MM:00] text`` paragraph; paragraphs are separated by a blank line.

    The transcript is first written to a sibling ``<name>.partial`` file and
    then renamed onto ``output_path``. A run that is interrupted mid-write
    therefore never leaves a truncated file under the final name, which matters
    for callers that treat an existing output file as "already transcribed".

    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path; its parent directory must exist.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(
        str(audio_path),
        language="en",
        beam_size=BEAM_SIZE,
        vad_filter=True,
    )
    logger.info("Duration %.1fs", info.duration)

    # Bucket index -> stripped segment texts, in the order the model yields them.
    buckets: dict[int, list[str]] = {}
    for segment in segments:
        bucket = int(segment.start // BUCKET_SECONDS)
        buckets.setdefault(bucket, []).append(segment.text.strip())

    lines = [f"[{format_timestamp(bucket * BUCKET_SECONDS)}] {' '.join(buckets[bucket])}" for bucket in sorted(buckets)]

    # Write next to the target and rename, so the final name only ever refers
    # to a complete transcript.
    partial_path = output_path.with_name(f"{output_path.name}.partial")
    partial_path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
    partial_path.replace(output_path)
    logger.info("Wrote %s", output_path)
|
||||||
|
|
||||||
|
|
||||||
|
def find_audio_files(input_directory: Path) -> list[Path]:
    """Return every file below ``input_directory`` whose suffix is a known audio extension.

    The suffix comparison is case-insensitive.

    Args:
        input_directory: Directory to walk recursively.

    Returns:
        Matching file paths in sorted order.
    """
    found: list[Path] = []
    for candidate in input_directory.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in AUDIO_EXTENSIONS:
            found.append(candidate)
    found.sort()
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def configure_container_logger() -> None:
    """Set up root logging for the container at INFO level.

    Records are rendered as ``<asctime> <levelname> <message>``. No stream is
    passed to ``basicConfig``, so its default stream handler is used, which
    writes to stderr.
    """
    log_format = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments() -> argparse.Namespace:
|
||||||
|
"""Parse CLI arguments for the container entrypoint.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Parsed argparse namespace.
|
||||||
|
"""
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
|
parser.add_argument("--input", type=Path, default=Path("/audio"))
|
||||||
|
parser.add_argument("--output", type=Path, default=Path("/output"))
|
||||||
|
parser.add_argument("--model", default="large-v3")
|
||||||
|
parser.add_argument(
|
||||||
|
"--download-only",
|
||||||
|
action="store_true",
|
||||||
|
help="Download the model into the cache volume and exit without transcribing.",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Load the model, then either exit (download-only) or transcribe the directory.

    Every audio file under ``--input`` maps to a ``.txt`` file at the same
    relative path under ``--output``; files whose transcript already exists are
    skipped. Sources that differ only by extension (``a.mp3`` / ``a.wav``) map
    to the same transcript path: the first one in sorted order owns it and the
    others are skipped with a warning rather than being reported as already
    transcribed.
    """
    configure_container_logger()
    arguments = parse_arguments()

    logger.info("Loading model %s on CUDA", arguments.model)
    # In download-only mode, constructing the model is the only work done
    # before returning.
    model = WhisperModel(arguments.model, device="cuda", compute_type="float16")

    if arguments.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return

    arguments.output.mkdir(parents=True, exist_ok=True)

    audio_files = find_audio_files(arguments.input)
    if not audio_files:
        logger.warning("No audio files found in %s", arguments.input)
        return

    logger.info("Found %d audio file(s)", len(audio_files))
    # Transcript path -> relative source path that claimed it during this run.
    claimed_outputs: dict[Path, Path] = {}
    for audio_path in audio_files:
        relative = audio_path.relative_to(arguments.input)
        output_path = arguments.output / relative.with_suffix(".txt")

        first_source = claimed_outputs.setdefault(output_path, relative)
        if first_source != relative:
            logger.warning(
                "Skip %s (transcript path %s is already used by %s)",
                relative,
                output_path,
                first_source,
            )
            continue

        output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_path.exists():
            logger.info("Skip %s (already transcribed)", relative)
            continue
        transcribe_file(model, audio_path, output_path)


if __name__ == "__main__":
    main()
|
||||||
167
python/tools/whisper/transcribe.py
Normal file
167
python/tools/whisper/transcribe.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
"""Build and run the whisper transcription docker container on demand.
|
||||||
|
|
||||||
|
The container is started fresh for each invocation and removed on exit
|
||||||
|
(``docker run --rm``). The model is cached in a named docker volume so
|
||||||
|
only the first run pays the download cost.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from python.common import configure_logger
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
"""Paths and names for the whisper-transcribe Docker workflow."""
|
||||||
|
|
||||||
|
image_tag = "whisper-transcribe:latest"
|
||||||
|
model_volume = "whisper-models"
|
||||||
|
repo_root = Path(__file__).resolve().parents[3]
|
||||||
|
dockerfile = Path(__file__).resolve().parent / "Dockerfile"
|
||||||
|
huggingface_cache = "/root/.cache/huggingface"
|
||||||
|
|
||||||
|
|
||||||
|
def run_docker(arguments: list[str]) -> None:
    """Log and execute ``docker <arguments>``, failing loudly on a non-zero exit.

    The child process inherits this process's stdout/stderr, so docker's output
    appears directly on the console.

    Args:
        arguments: Arguments to pass to the ``docker`` binary.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    command = ["docker", *arguments]
    logger.info("docker %s", " ".join(arguments))
    subprocess.run(command, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
def build_image() -> None:
    """Run ``docker build`` for the whisper-transcribe image.

    The image is tagged ``Config.image_tag``, built from ``Config.dockerfile``,
    and uses ``Config.repo_root`` as the build context.
    """
    logger.info("Building image %s", Config.image_tag)
    build_options = ["--tag", Config.image_tag, "--file", str(Config.dockerfile)]
    build_context = str(Config.repo_root)
    run_docker(["build", *build_options, build_context])
|
||||||
|
|
||||||
|
|
||||||
|
def model_cache_present(model: str) -> bool:
    """Check whether the given model is already downloaded in the cache volume.

    The check mounts the model volume into a throwaway ``alpine`` container and
    tests for the model's HuggingFace hub cache folder
    (``hub/models--<org>--<repo>``). A bare model name is looked up under
    ``Systran/faster-whisper-<model>``; a name containing ``/`` is treated as a
    full ``org/repo`` id.

    This is a heuristic: only the folder's existence is tested, and any
    non-zero exit from ``docker run`` (including docker-side failures) is
    reported as "not cached".
    NOTE(review): model names that faster-whisper maps to a differently named
    repository would also be reported as missing — confirm whether that matters.

    Args:
        model: faster-whisper model name (e.g. ``large-v3``) or ``org/repo`` id.

    Returns:
        True if the HuggingFace cache directory for the model exists in the volume.
    """
    repository = model if "/" in model else f"Systran/faster-whisper-{model}"
    # The hub cache flattens ``org/repo`` into ``models--org--repo``.
    cache_directory = f"hub/models--{repository.replace('/', '--')}"
    completed = subprocess.run(
        [
            "docker",
            "run",
            "--rm",
            "--volume",
            f"{Config.model_volume}:/cache",
            "alpine",
            "test",
            "-d",
            f"/cache/{cache_directory}",
        ],
        check=False,
    )
    return completed.returncode == 0
|
||||||
|
|
||||||
|
|
||||||
|
def download_model(model: str) -> None:
    """Run the whisper image in ``--download-only`` mode to fill the model volume.

    The model volume is mounted at ``Config.huggingface_cache`` inside the
    container, and the container is removed when it exits.

    Args:
        model: faster-whisper model name.
    """
    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
    cache_mount = f"{Config.model_volume}:{Config.huggingface_cache}"
    docker_options = ["--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", cache_mount]
    # Everything after the image name is passed to the container entrypoint.
    entrypoint_arguments = ["--model", model, "--download-only"]
    run_docker(["run", *docker_options, Config.image_tag, *entrypoint_arguments])
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
    """Run the whisper container against ``input_directory``.

    The input directory is mounted read-only at ``/audio``, the output
    directory at ``/output``, and the model volume at
    ``Config.huggingface_cache``. The container is removed when it exits.

    Args:
        input_directory: Host path containing audio files (mounted read-only).
        output_directory: Host path for ``.txt`` transcripts.
        model: faster-whisper model name.
    """
    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
    volume_specs = (
        f"{input_directory}:/audio:ro",
        f"{output_directory}:/output",
        f"{Config.model_volume}:{Config.huggingface_cache}",
    )
    docker_options = ["--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
    for volume_spec in volume_specs:
        docker_options += ["--volume", volume_spec]
    # Arguments after the image name go to the container entrypoint.
    run_docker(["run", *docker_options, Config.image_tag, "--model", model])
|
||||||
|
|
||||||
|
|
||||||
|
def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
    *,
    force_download: Annotated[
        bool,
        typer.Option("--force-download", help="Re-download the model even if already cached."),
    ] = False,
) -> None:
    """Build the image, ensure the model is cached, then transcribe and stop."""
    # The docstring above is shown by typer as the command's help text, so the
    # longer notes are kept in comments.
    #
    # Steps: resolve and validate the paths, build the image, run the
    # download-only container when the model is not cached (or when
    # force_download is set), then run the transcription container.
    #
    # NOTE(review): pyproject's [project.scripts] points ``whisper-transcribe``
    # directly at this function. Console-script launchers call their target
    # with no arguments, which would raise TypeError for the two required
    # parameters; the entry likely needs a zero-argument wrapper that calls
    # ``typer.run(main)`` — confirm.
    configure_logger()

    # strict=True raises FileNotFoundError when the input path does not exist.
    resolved_input = input_directory.resolve(strict=True)
    if not resolved_input.is_dir():
        # A regular file would be bind-mounted as /audio and the container
        # would find nothing to transcribe; reject it up front instead.
        message = f"{resolved_input} is not a directory"
        raise typer.BadParameter(message)

    output_directory.mkdir(parents=True, exist_ok=True)
    resolved_output = output_directory.resolve()

    build_image()

    if force_download or not model_cache_present(model):
        download_model(model)
    else:
        logger.info("Model %s already cached in volume %s", model, Config.model_volume)

    transcribe(resolved_input, resolved_output, model)
    logger.info("Done. Container stopped.")


if __name__ == "__main__":
    typer.run(main)
|
||||||
Reference in New Issue
Block a user