mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-19 13:49:09 -04:00
137 lines
4.3 KiB
Python
137 lines
4.3 KiB
Python
"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
|
|
|
|
Run inside the whisper-transcribe docker image; segment timestamps are grouped
|
|
into one-minute buckets so the output reads as ``[HH:MM:00] text``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
import logging
import sys
from pathlib import Path
|
|
|
|
from faster_whisper import WhisperModel
|
|
|
|
logger = logging.getLogger(__name__)

# Lower-case file suffixes accepted by find_audio_files (matched case-insensitively).
AUDIO_EXTENSIONS = {
    ".aac",
    ".flac",
    ".m4a",
    ".mkv",
    ".mp3",
    ".mp4",
    ".ogg",
    ".opus",
    ".wav",
    ".webm",
}

# Time-unit helpers used when rendering ``HH:MM:00`` labels.
SECONDS_PER_MINUTE = 60
SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE

# Width, in seconds, of the window that segment start times are grouped into.
BUCKET_SECONDS = 60

# Passed as ``beam_size`` to ``model.transcribe``.
BEAM_SIZE = 5
|
|
|
|
|
|
def format_timestamp(total_seconds: float) -> str:
    """Format an offset in seconds as an ``HH:MM:00`` label.

    The seconds field is always rendered as ``00``; any sub-minute remainder
    of ``total_seconds`` is dropped.

    Args:
        total_seconds: Offset in seconds from the start of the audio.

    Returns:
        Zero-padded hours and minutes followed by a literal ``:00``.
    """
    whole_hours, leftover_seconds = divmod(total_seconds, SECONDS_PER_HOUR)
    whole_minutes = leftover_seconds // SECONDS_PER_MINUTE
    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:00"
|
|
|
|
|
|
def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe one audio file and write the bucketed transcript to disk.

    Segments are grouped by the ``BUCKET_SECONDS``-wide window that their start
    time falls in. Each non-empty window becomes one ``[HH:MM:00] text``
    paragraph, and paragraphs are separated by a blank line.

    The transcript is written to a sibling ``<name>.part`` file and then renamed
    over ``output_path``. A failed write therefore never leaves a truncated file
    at ``output_path`` that a later run would mistake for a finished transcript.

    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path.

    Raises:
        OSError: If the transcript cannot be written or moved into place.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(
        str(audio_path),
        language="en",
        beam_size=BEAM_SIZE,
        vad_filter=True,
    )
    logger.info("Duration %.1fs", info.duration)

    # Group segment texts by the index of the window their start time falls in.
    buckets: dict[int, list[str]] = {}
    for segment in segments:
        bucket = int(segment.start // BUCKET_SECONDS)
        buckets.setdefault(bucket, []).append(segment.text.strip())

    lines = [f"[{format_timestamp(bucket * BUCKET_SECONDS)}] {' '.join(buckets[bucket])}" for bucket in sorted(buckets)]
    transcript = "\n\n".join(lines) + "\n"

    # Write next to the destination so the final rename stays on one filesystem.
    partial_path = output_path.with_name(f"{output_path.name}.part")
    try:
        partial_path.write_text(transcript, encoding="utf-8")
        partial_path.replace(output_path)
    except OSError:
        # Do not leave a half-written temporary file behind.
        partial_path.unlink(missing_ok=True)
        raise
    logger.info("Wrote %s", output_path)
|
|
|
|
|
|
def find_audio_files(input_directory: Path) -> list[Path]:
    """Return the audio files found anywhere beneath ``input_directory``.

    A path qualifies when ``Path.is_file`` is true for it and its suffix,
    lower-cased, is a member of ``AUDIO_EXTENSIONS``.

    Args:
        input_directory: Directory to walk recursively.

    Returns:
        The matching paths in sorted order.
    """
    matches: list[Path] = []
    for candidate in input_directory.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in AUDIO_EXTENSIONS:
            matches.append(candidate)
    matches.sort()
    return matches
|
|
|
|
|
|
def configure_container_logger() -> None:
    """Configure root logging for the container: INFO level, written to stdout.

    ``logging.basicConfig`` attaches its handler to ``sys.stderr`` unless a
    stream is given, so the stream is passed explicitly to honour the stdout
    destination this function promises.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        # Without ``stream`` the handler would default to sys.stderr.
        stream=sys.stdout,
    )
|
|
|
|
|
|
def parse_arguments() -> argparse.Namespace:
    """Build the entrypoint's argument parser and parse the process arguments.

    Recognised options are ``--input`` and ``--output`` (paths defaulting to
    ``/audio`` and ``/output``), ``--model`` (default ``large-v3``) and the
    ``--download-only`` flag.

    Returns:
        Namespace with ``input``, ``output``, ``model`` and ``download_only``.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # Both directory options share the same shape and differ only in their default.
    directory_defaults = (("--input", "/audio"), ("--output", "/output"))
    for flag, default_directory in directory_defaults:
        parser.add_argument(flag, type=Path, default=Path(default_directory))

    parser.add_argument("--model", default="large-v3")
    parser.add_argument(
        "--download-only",
        help="Download the model into the cache volume and exit without transcribing.",
        action="store_true",
    )
    return parser.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Load the Whisper model, then transcribe every audio file that is found.

    With ``--download-only`` the function returns as soon as the model has been
    constructed. Otherwise each audio file under ``--input`` is transcribed to a
    ``.txt`` file at the same relative path under ``--output``; files whose
    transcript already exists are skipped.
    """
    configure_container_logger()
    options = parse_arguments()
    model_name = options.model
    input_root = options.input
    output_root = options.output

    logger.info("Loading model %s on CUDA", model_name)
    model = WhisperModel(model_name, device="cuda", compute_type="float16")
    if options.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return

    output_root.mkdir(parents=True, exist_ok=True)

    audio_files = find_audio_files(input_root)
    if not audio_files:
        logger.warning("No audio files found in %s", input_root)
        return
    logger.info("Found %d audio file(s)", len(audio_files))

    for audio_path in audio_files:
        # Mirror the input directory layout beneath the output root.
        relative_audio = audio_path.relative_to(input_root)
        transcript_path = (output_root / relative_audio).with_suffix(".txt")
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        if not transcript_path.exists():
            transcribe_file(model, audio_path, transcript_path)
        else:
            logger.info("Skip %s (already transcribed)", relative_audio)
|
|
|
|
|
|
# Run the entrypoint only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|