moved renamed container.py to vllm_container.py

2026-04-17 21:18:18 -04:00 · 2026-04-10 13:16:18 -04:00
parent 721526022b
commit 03208a1ab2
3 changed files with 10 additions and 4 deletions
--- a/python/prompt_bench/vllm_container.py
+++ b/python/prompt_bench/vllm_container.py
@@ -0,0 +1,87 @@
+"""Docker container lifecycle management for vLLM."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+CONTAINER_NAME = "vllm-bench"
+VLLM_IMAGE = "vllm/vllm-openai:v0.19.0"
+
+
+def start_vllm(
+    *,
+    model: str,
+    port: int,
+    model_dir: str,
+    gpu_memory_utilization: float,
+) -> None:
+    """Start a vLLM container serving the given model.
+
+    Args:
+        model: HuggingFace model directory name (relative to model_dir).
+        port: Host port to bind.
+        model_dir: Host path containing HuggingFace model directories.
+        gpu_memory_utilization: Fraction of GPU memory to use (0-1).
+    """
+    command = [
+        "docker",
+        "run",
+        "-d",
+        "--name",
+        CONTAINER_NAME,
+        "--device=nvidia.com/gpu=all",
+        "--ipc=host",
+        "-v",
+        f"{model_dir}:/models",
+        "-p",
+        f"{port}:8000",
+        VLLM_IMAGE,
+        "--model",
+        f"/models/{model}",
+        "--served-model-name",
+        model,
+        "--gpu-memory-utilization",
+        str(gpu_memory_utilization),
+        "--max-model-len",
+        "4096",
+    ]
+    logger.info("Starting vLLM container with model: %s", model)
+    stop_vllm()
+    result = subprocess.run(command, capture_output=True, text=True, check=False)
+    if result.returncode != 0:
+        msg = f"Failed to start vLLM container: {result.stderr.strip()}"
+        raise RuntimeError(msg)
+    logger.info("vLLM container started: %s", result.stdout.strip()[:12])
+
+
+def stop_vllm() -> None:
+    """Stop and remove the vLLM benchmark container."""
+    logger.info("Stopping vLLM container")
+    subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
+    subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
+    subprocess.run(
+        ["docker", "network", "disconnect", "-f", "bridge", CONTAINER_NAME],
+        capture_output=True,
+        check=False,
+    )
+    logger.info("vLLM container stopped and removed")
+
+
+def check_gpu_free() -> None:
+    """Warn if GPU-heavy processes (e.g. Ollama) are running."""
+    result = subprocess.run(
+        ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        logger.warning("Could not query GPU processes: %s", result.stderr.strip())
+        return
+    processes = result.stdout.strip()
+    if processes:
+        logger.warning("GPU processes detected:\n%s", processes)
+        logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")