setting up whisper transcriber

This commit is contained in:
2026-04-18 14:08:23 -04:00
parent dfe5997e0b
commit 7db063a240
7 changed files with 327 additions and 3 deletions

View File

@@ -0,0 +1,136 @@
"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
Run inside the whisper-transcribe docker image; segment timestamps are grouped
into one-minute buckets so the output reads as ``[HH:MM:00] text``.
"""
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from faster_whisper import WhisperModel
logger = logging.getLogger(__name__)
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
BUCKET_SECONDS = 60
BEAM_SIZE = 5
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60
def format_timestamp(total_seconds: float) -> str:
"""Render a whole-minute timestamp as ``HH:MM:00``.
Args:
total_seconds: Offset in seconds from the start of the audio.
Returns:
A zero-padded ``HH:MM:00`` string.
"""
hours = int(total_seconds // SECONDS_PER_HOUR)
minutes = int((total_seconds % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE)
return f"{hours:02d}:{minutes:02d}:00"
def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
"""Transcribe one audio file and write the bucketed transcript to disk.
Args:
model: Loaded faster-whisper model.
audio_path: Source audio file.
output_path: Destination ``.txt`` path.
"""
logger.info("Transcribing %s", audio_path)
segments, info = model.transcribe(
str(audio_path),
language="en",
beam_size=BEAM_SIZE,
vad_filter=True,
)
logger.info("Duration %.1fs", info.duration)
buckets: dict[int, list[str]] = {}
for segment in segments:
bucket = int(segment.start // BUCKET_SECONDS)
buckets.setdefault(bucket, []).append(segment.text.strip())
lines = [f"[{format_timestamp(bucket * BUCKET_SECONDS)}] {' '.join(buckets[bucket])}" for bucket in sorted(buckets)]
output_path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
logger.info("Wrote %s", output_path)
def find_audio_files(input_directory: Path) -> list[Path]:
"""Collect every audio file under ``input_directory``.
Args:
input_directory: Directory to walk recursively.
Returns:
Sorted list of audio file paths.
"""
return sorted(
path for path in input_directory.rglob("*") if path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS
)
def configure_container_logger() -> None:
"""Configure logging for the container (stdout, INFO)."""
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
)
def parse_arguments() -> argparse.Namespace:
"""Parse CLI arguments for the container entrypoint.
Returns:
Parsed argparse namespace.
"""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--input", type=Path, default=Path("/audio"))
parser.add_argument("--output", type=Path, default=Path("/output"))
parser.add_argument("--model", default="large-v3")
parser.add_argument(
"--download-only",
action="store_true",
help="Download the model into the cache volume and exit without transcribing.",
)
return parser.parse_args()
def main() -> None:
"""Load the model, then either exit (download-only) or transcribe the directory."""
configure_container_logger()
arguments = parse_arguments()
logger.info("Loading model %s on CUDA", arguments.model)
model = WhisperModel(arguments.model, device="cuda", compute_type="float16")
if arguments.download_only:
logger.info("Model ready; exiting (download-only mode)")
return
arguments.output.mkdir(parents=True, exist_ok=True)
audio_files = find_audio_files(arguments.input)
if not audio_files:
logger.warning("No audio files found in %s", arguments.input)
return
logger.info("Found %d audio file(s)", len(audio_files))
for audio_path in audio_files:
relative = audio_path.relative_to(arguments.input)
output_path = arguments.output / relative.with_suffix(".txt")
output_path.parent.mkdir(parents=True, exist_ok=True)
if output_path.exists():
logger.info("Skip %s (already transcribed)", relative)
continue
transcribe_file(model, audio_path, output_path)
if __name__ == "__main__":
main()