69 lines
2.1 KiB
Python
69 lines
2.1 KiB
Python
"""OpenAI-compatible client for vLLM's API."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from typing import Self
|
|
|
|
import httpx
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
# Seconds slept between readiness probes in VLLMClient.wait_ready.
READY_POLL_INTERVAL = 2.0
|
|
|
|
|
|
class VLLMClient:
|
|
"""Talk to a vLLM server via its OpenAI-compatible API.
|
|
|
|
Args:
|
|
host: vLLM host.
|
|
port: vLLM port.
|
|
timeout: Per-request timeout in seconds.
|
|
"""
|
|
|
|
def __init__(self, *, host: str = "localhost", port: int = 8000, timeout: int = 300) -> None:
|
|
"""Create a client connected to a vLLM server."""
|
|
self._client = httpx.Client(base_url=f"http://{host}:{port}", timeout=timeout)
|
|
|
|
def wait_ready(self, max_wait: int) -> None:
|
|
"""Poll /v1/models until the server is ready or timeout."""
|
|
deadline = time.monotonic() + max_wait
|
|
while time.monotonic() < deadline:
|
|
try:
|
|
response = self._client.get("/v1/models")
|
|
if response.is_success:
|
|
logger.info("vLLM server is ready")
|
|
return
|
|
except httpx.TransportError:
|
|
pass
|
|
time.sleep(READY_POLL_INTERVAL)
|
|
msg = f"vLLM server not ready after {max_wait}s"
|
|
raise TimeoutError(msg)
|
|
|
|
def complete(self, prompt: str, model: str, *, temperature: float = 0.0, max_tokens: int = 4096) -> str:
|
|
"""Send a prompt to /v1/completions and return the response text."""
|
|
payload = {
|
|
"model": model,
|
|
"prompt": prompt,
|
|
"temperature": temperature,
|
|
"max_tokens": max_tokens,
|
|
}
|
|
logger.info("Sending prompt to %s (%d chars)", model, len(prompt))
|
|
response = self._client.post("/v1/completions", json=payload)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
return data["choices"][0]["text"]
|
|
|
|
def close(self) -> None:
|
|
"""Close the HTTP client."""
|
|
self._client.close()
|
|
|
|
def __enter__(self) -> Self:
|
|
"""Enter the context manager."""
|
|
return self
|
|
|
|
def __exit__(self, *args: object) -> None:
|
|
"""Close the HTTP client on exit."""
|
|
self.close()
|