# Mirror of https://github.com/RichieCahill/dotfiles.git (synced 2026-04-17 21:18:18 -04:00)
"""Docker container lifecycle management for vLLM."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import subprocess
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
CONTAINER_NAME = "vllm-bench"
|
|
VLLM_IMAGE = "vllm/vllm-openai:v0.8.5"
|
|
|
|
|
|
def start_vllm(
    *,
    model: str,
    port: int,
    model_dir: str,
    gpu_memory_utilization: float,
    max_model_len: int = 4096,
) -> None:
    """Start a vLLM container serving the given model.

    Args:
        model: HuggingFace model directory name (relative to model_dir).
        port: Host port to bind.
        model_dir: Host path containing HuggingFace model directories.
        gpu_memory_utilization: Fraction of GPU memory to use (0-1).
        max_model_len: Maximum context length passed to vLLM (default 4096).

    Raises:
        ValueError: If gpu_memory_utilization is not in (0, 1].
        RuntimeError: If the `docker run` command fails.
    """
    # Fail fast on an invalid fraction instead of letting vLLM fail
    # opaquely inside the container after it has already started.
    if not 0 < gpu_memory_utilization <= 1:
        msg = f"gpu_memory_utilization must be in (0, 1], got {gpu_memory_utilization}"
        raise ValueError(msg)
    command = [
        "docker",
        "run",
        "-d",  # detached: the container keeps serving after this call returns
        "--name",
        CONTAINER_NAME,
        "--device=nvidia.com/gpu=all",
        "--ipc=host",  # vLLM workers share tensors via host shared memory
        "-v",
        f"{model_dir}:/models",
        "-p",
        f"{port}:8000",  # vLLM's OpenAI-compatible server listens on 8000 inside
        VLLM_IMAGE,
        "--model",
        f"/models/{model}",
        "--served-model-name",
        model,
        "--gpu-memory-utilization",
        str(gpu_memory_utilization),
        "--max-model-len",
        str(max_model_len),
    ]
    logger.info("Starting vLLM container with model: %s", model)
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode != 0:
        msg = f"Failed to start vLLM container: {result.stderr.strip()}"
        raise RuntimeError(msg)
    # `docker run -d` prints the container ID; log the short 12-char form.
    logger.info("vLLM container started: %s", result.stdout.strip()[:12])
|
|
|
|
|
|
def stop_vllm() -> None:
    """Stop and remove the vLLM benchmark container."""
    logger.info("Stopping vLLM container")
    # Best-effort teardown: run both subcommands regardless of outcome,
    # so a missing/already-stopped container does not abort cleanup.
    for subcommand in ("stop", "rm"):
        subprocess.run(
            ["docker", subcommand, CONTAINER_NAME],
            capture_output=True,
            check=False,
        )
    logger.info("vLLM container stopped and removed")
|
|
|
|
|
|
def check_gpu_free() -> None:
    """Warn if GPU-heavy processes (e.g. Ollama) are running."""
    query = subprocess.run(
        ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
        text=True,
        capture_output=True,
        check=False,
    )
    # nvidia-smi missing or erroring is non-fatal: warn and move on.
    if query.returncode != 0:
        logger.warning("Could not query GPU processes: %s", query.stderr.strip())
        return
    active = query.stdout.strip()
    if not active:
        return
    logger.warning("GPU processes detected:\n%s", active)
    logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")
|