# pipelines/prompt_bench/vllm_client.py

"""OpenAI-compatible client for vLLM's API."""

from __future__ import annotations

import logging
import time
from typing import Self

import httpx

logger = logging.getLogger(__name__)

# Delay, in seconds, between consecutive readiness probes in VLLMClient.wait_ready.
READY_POLL_INTERVAL = 2.0


class VLLMClient:
    """Talk to a vLLM server via its OpenAI-compatible API.

    Args:
        host: vLLM host.
        port: vLLM port.
        timeout: Per-request timeout in seconds.
    """

    def __init__(self, *, host: str = "localhost", port: int = 8000, timeout: int = 300) -> None:
        """Create a client connected to a vLLM server."""
        self._client = httpx.Client(base_url=f"http://{host}:{port}", timeout=timeout)

    def wait_ready(self, max_wait: int) -> None:
        """Poll /v1/models until the server is ready or timeout."""
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            try:
                response = self._client.get("/v1/models")
                if response.is_success:
                    logger.info("vLLM server is ready")
                    return
            except httpx.TransportError:
                pass
            time.sleep(READY_POLL_INTERVAL)
        msg = f"vLLM server not ready after {max_wait}s"
        raise TimeoutError(msg)

    def complete(self, prompt: str, model: str, *, temperature: float = 0.0, max_tokens: int = 4096) -> str:
        """Send a prompt to /v1/completions and return the response text."""
        payload = {
            "model": model,
            "prompt": prompt,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        logger.info("Sending prompt to %s (%d chars)", model, len(prompt))
        response = self._client.post("/v1/completions", json=payload)
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["text"]

    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()

    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self

    def __exit__(self, *args: object) -> None:
        """Close the HTTP client on exit."""
        self.close()