feat(ebook-search): add load-test CLI for the search service

Add a Typer CLI script that drives POST /search on a running server at a
configurable concurrency and reports latency percentiles (p50/p90/p95/p99),
throughput, and HTTP status distribution. Queries are drawn from the shared
eval JSONL set so load testing and evaluation exercise the same questions.
This commit is contained in:
2026-06-18 12:39:55 -04:00
parent dbc6b5b53b
commit 6ae1ff1f5c
7 changed files with 436 additions and 0 deletions
+2
View File
@@ -96,6 +96,8 @@ def search(
timings=(*response.timings, runtime_step_from_start(answer_step_name, answer_start)), timings=(*response.timings, runtime_step_from_start(answer_step_name, answer_start)),
) )
for step in response.timings:
logger.info("ebook_search_timing step=%r runtime_ms=%.1f", step.name, step.duration_ms)
logger.info( logger.info(
"ebook_search_request_complete results=%s rank_label=%s runtime_ms=%.1f", "ebook_search_request_complete results=%s rank_label=%s runtime_ms=%.1f",
len(response.results), len(response.results),
+1
View File
@@ -0,0 +1 @@
"""Offline evaluation tooling for the ebook search pipeline."""
@@ -0,0 +1,71 @@
{"query": "Who is Damien Montgomery and how does he become a Jump Mage?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What is a Rune Wright and why is Damien so rare?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "How does jump magic let starships travel faster than light?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What is the role of the Mage-King of Mars in the Protectorate?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What happened aboard the Blue Jay in the first Starship's Mage book?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "Who is Captain David Rice?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "How are amplifiers and simulacrums used to power a ship's jump?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What duties does a Hand of the Mage-King carry out?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "Explain the structure of the Royal Martian Navy.", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "How do mages carve runes to enchant a starship?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What threat do the Legatan rebels pose to the Protectorate?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "How does Damien handle his first command?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What is the significance of the simulacrum on a jump ship?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "Describe a mage duel in the Starship's Mage series.", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What moral conflicts does Damien face as a Hand of the Mage-King?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "How does the Protectorate keep peace among its member worlds?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "Who is the Keeper of Oaths and how does Damien work with them?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
{"query": "What event is known as the Onset and how does it change the world?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "Who is the main character at the start of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How do survivors adapt after the Onset begins?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What new abilities emerge during the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "Describe the primary antagonist in the Onset series.", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How does society collapse and reorganize after the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What factions form in the aftermath of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How does the protagonist gain power throughout the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What is the cause or origin of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "Describe an early survival challenge faced after the Onset.", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How do the characters defend their stronghold during the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What relationships drive the protagonist's choices in the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How does the Onset escalate by the end of the first book?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What mysteries about the Onset remain unresolved?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How do the rules of the world change once the Onset takes hold?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "What weapons or tactics work best against the threats of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
{"query": "How does Bob Johansson become a von Neumann probe?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What is a replicant and why do Bob's copies have different personalities?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "Who are Riker, Homer, and Bill among the Bob clones?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What is GUPPI and how does Bob use it?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "Describe the threat posed by the Others.", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How does Bob protect and uplift the Deltans?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "Why do the replicants drift apart in personality over time?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What is the role of FAITH and the Brazilian Empire on Earth?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How does subspace communication work for the Bobs?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What happens to Bender after he goes missing?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How do the Bobs build self-replicating probes across the galaxy?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How does Bob evacuate humanity after Earth becomes uninhabitable?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "Describe the conflict between different factions of Bobs.", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What ethical dilemmas does Bob face when interfering with primitive species?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How does the original Bob differ from later generations of clones?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "How do the Bobs defeat the Others' system-harvesting fleets?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
{"query": "What role does Howard play in the human colonies?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
// queries not in the dataset
{"query": "How does Frodo destroy the One Ring in The Lord of the Rings?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "Who killed Dumbledore in Harry Potter and the Half-Blood Prince?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What house does Tyrion Lannister belong to in A Game of Thrones?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How does Paul Atreides control the spice on Arrakis in Dune?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What does the green light at the end of the dock mean in The Great Gatsby?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "Why does Hester Prynne wear a scarlet letter?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What does the white whale represent in Moby-Dick?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How does Elizabeth Bennet's view of Mr. Darcy change in Pride and Prejudice?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What crime does Raskolnikov commit in Crime and Punishment?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How does Katniss volunteer for the Hunger Games?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What is Winston Smith's job in Nineteen Eighty-Four?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "Who is Atticus Finch defending in To Kill a Mockingbird?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What is the capital of Australia?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How do I bake a sourdough loaf from scratch?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "Explain how photosynthesis converts sunlight into energy.", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What were the main causes of World War I?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How does compound interest work?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "How do I change a flat tire on a car?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What is the boiling point of water at sea level?", "answer": null, "answerable": false, "relevant_sources": []}
{"query": "What is the recommended daily intake of vitamin D?", "answer": null, "answerable": false, "relevant_sources": []}
+47
View File
@@ -0,0 +1,47 @@
"""Shared query set loading for evaluation and load testing.
Each JSONL record has a ``query`` and an optional reference ``answer``. ``answerable``
marks whether the query should be answerable from the library (false for out-of-corpus
"garbage" queries used to test the refusal path). Relevance for retrieval metrics is
labeled at source (book) granularity in ``relevant_sources``; source titles must match
``ebook_source.title`` values for the indexed corpus.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
DEFAULT_QUERIES_PATH = Path(__file__).parent / "data" / "queries.jsonl"
@dataclass(frozen=True)
class GoldQuery:
    """One labeled query shared by the eval and load-test tools.

    Instances are immutable (``frozen=True``); ``load_gold_queries`` builds one per
    JSONL record.
    """

    # The query text.
    query: str
    # Optional reference answer; ``None`` when the record carries no answer.
    answer: str | None
    # Whether the query should be answerable from the library (false for out-of-corpus queries).
    answerable: bool
    # Source (book) titles labeled relevant to this query; empty when none are labeled.
    relevant_sources: tuple[str, ...]
    # Values of the record's optional ``relevant_substrings`` key; empty when the key is absent.
    # NOTE(review): presumably text expected inside relevant passages — confirm with the eval code.
    relevant_substrings: tuple[str, ...]
def load_gold_queries(path: Path = DEFAULT_QUERIES_PATH) -> list[GoldQuery]:
    """Load labeled queries from a JSONL file.

    Blank lines and comment lines (those starting with ``//`` or ``#``) are skipped.
    JSON itself has no comment syntax, so without this a separator comment in the
    query file would abort the whole load with a ``JSONDecodeError``.

    Args:
        path: JSONL file to read; defaults to the shared query set.

    Returns:
        One ``GoldQuery`` per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank, non-comment line is not valid JSON.
        KeyError: If a record has no ``query`` key.
    """
    comment_prefixes = ("//", "#")
    queries: list[GoldQuery] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        # A JSON record can never start with "//" or "#", so skipping these is safe.
        if not stripped or stripped.startswith(comment_prefixes):
            continue
        record = json.loads(stripped)
        queries.append(
            GoldQuery(
                query=str(record["query"]),
                answer=record.get("answer"),
                # Records without the flag are treated as answerable.
                answerable=bool(record.get("answerable", True)),
                relevant_sources=tuple(record.get("relevant_sources", ())),
                relevant_substrings=tuple(record.get("relevant_substrings", ())),
            )
        )
    return queries
+218
View File
@@ -0,0 +1,218 @@
"""Load test for the EPUB search service.
Drives ``POST /search`` on a running server at a configurable concurrency and reports
latency percentiles, throughput, and HTTP status distribution. Queries are drawn from
the shared JSONL set (see ``eval/data/queries.jsonl``) that the eval also uses, so load
and evaluation exercise the same questions. Answer generation and reranking happen
server-side, so this exercises the full retrieval pipeline.
"""
from __future__ import annotations
import asyncio
import logging
import math
import random
import statistics
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated
import httpx
import typer
from python.common import configure_logger
from python.ebook_search.eval.dataset import DEFAULT_QUERIES_PATH, load_gold_queries
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class RequestResult:
    """Outcome of a single search request."""

    # HTTP status code of the response; 0 when the request raised ``httpx.HTTPError``.
    status_code: int
    # Elapsed time for the request in milliseconds, measured with ``time.perf_counter``.
    latency_ms: float
    # The response's ``is_success`` flag; False when the request raised ``httpx.HTTPError``.
    ok: bool
@dataclass(frozen=True)
class LoadSummary:
    """Aggregate results of a load test run."""

    # Number of measured requests.
    total: int
    # Requests whose result was marked ``ok``.
    successes: int
    # ``total - successes``.
    failures: int
    # Wall-clock duration of the measured phase, in seconds.
    wall_seconds: float
    # ``total / wall_seconds``; 0.0 when ``wall_seconds`` is not positive.
    throughput_rps: float
    # Linearly-interpolated latency percentiles over all results (successes and failures), in ms.
    latency_p50_ms: float
    latency_p90_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    # Mean and maximum latency in ms; 0.0 when there are no results.
    latency_mean_ms: float
    latency_max_ms: float
    # Count of results per status code (status 0 is used for requests that raised).
    status_counts: dict[int, int]
def load_queries(queries_file: str | None) -> list[str]:
    """Return only the query text of every record in the chosen JSONL file.

    Falls back to the shared default set when ``queries_file`` is empty or ``None``.

    Raises:
        typer.BadParameter: If the file yields no queries.
    """
    source = DEFAULT_QUERIES_PATH if not queries_file else Path(queries_file)
    texts = [entry.query for entry in load_gold_queries(source)]
    if texts:
        return texts
    raise typer.BadParameter(f"No queries found in {source}")
def pick_query(queries: list[str]) -> str:
    """Sample one query uniformly at random from ``queries``.

    The non-cryptographic ``random`` module is used deliberately: this only spreads
    load across the query pool.
    """
    chosen = random.choice(queries)  # noqa: S311 load-test query sampling is not security-sensitive
    return chosen
def percentile(values_sorted: list[float], pct: float) -> float:
    """Return the linearly-interpolated percentile of a sorted list.

    Args:
        values_sorted: Values in ascending order.
        pct: Percentile to compute, from 0 to 100 inclusive.

    Returns:
        The interpolated value at ``pct``, or 0.0 when ``values_sorted`` is empty.

    Raises:
        ValueError: If ``pct`` is outside ``[0, 100]``.
    """
    # Without this check a negative pct would silently index from the end of the
    # list and a pct above 100 would fail with an unhelpful IndexError.
    if not 0 <= pct <= 100:
        msg = f"pct must be between 0 and 100, got {pct}"
        raise ValueError(msg)
    if not values_sorted:
        return 0.0
    # Fractional position of the percentile within the index range [0, n - 1].
    rank = (pct / 100) * (len(values_sorted) - 1)
    low = math.floor(rank)
    high = math.ceil(rank)
    if low == high:
        return values_sorted[low]
    return values_sorted[low] + (values_sorted[high] - values_sorted[low]) * (rank - low)
def summarize(results: list[RequestResult], wall_seconds: float) -> LoadSummary:
    """Fold per-request results into a single ``LoadSummary``.

    Latency statistics cover every result, successful or not. An empty result list or
    a non-positive ``wall_seconds`` yields zeros rather than raising.
    """
    total = len(results)
    ordered = sorted(item.latency_ms for item in results)
    ok_count = len([item for item in results if item.ok])

    by_status: dict[int, int] = {}
    for item in results:
        by_status.setdefault(item.status_code, 0)
        by_status[item.status_code] += 1

    if wall_seconds > 0:
        rate = total / wall_seconds
    else:
        rate = 0.0

    p50, p90, p95, p99 = (percentile(ordered, pct) for pct in (50, 90, 95, 99))
    if ordered:
        mean_ms = statistics.fmean(ordered)
        max_ms = ordered[-1]
    else:
        mean_ms = 0.0
        max_ms = 0.0

    return LoadSummary(
        total=total,
        successes=ok_count,
        failures=total - ok_count,
        wall_seconds=wall_seconds,
        throughput_rps=rate,
        latency_p50_ms=p50,
        latency_p90_ms=p90,
        latency_p95_ms=p95,
        latency_p99_ms=p99,
        latency_mean_ms=mean_ms,
        latency_max_ms=max_ms,
        status_counts=by_status,
    )
async def send_search(client: httpx.AsyncClient, query: str, *, rerank: bool) -> RequestResult:
    """POST one query to ``/search`` and time it.

    The query is sent as form data, with ``rerank=true`` added only when reranking is
    requested. An ``httpx.HTTPError`` is logged and reported as status code 0 with
    ``ok=False`` instead of propagating.
    """
    payload = {"query": query}
    if rerank:
        payload["rerank"] = "true"

    began = time.perf_counter()
    try:
        reply = await client.post("/search", data=payload)
    except httpx.HTTPError as error:
        logger.warning("ebook_loadtest_request_failed error=%s", error)
        elapsed_ms = (time.perf_counter() - began) * 1000
        return RequestResult(status_code=0, latency_ms=elapsed_ms, ok=False)

    status = reply.status_code
    elapsed_ms = (time.perf_counter() - began) * 1000
    return RequestResult(status_code=status, latency_ms=elapsed_ms, ok=reply.is_success)
async def worker(
    client: httpx.AsyncClient,
    queue: asyncio.Queue[str],
    results: list[RequestResult],
    *,
    rerank: bool,
) -> None:
    """Drain the shared queue, sending one search request per queued query.

    Each outcome is appended to ``results``; the worker exits as soon as the queue
    has nothing left.
    """
    while True:
        try:
            next_query = queue.get_nowait()
        except asyncio.QueueEmpty:
            break
        outcome = await send_search(client, next_query, rerank=rerank)
        results.append(outcome)
async def run_load(
    *,
    base_url: str,
    queries: list[str],
    request_count: int,
    concurrency: int,
    rerank: bool,
    warmup: int,
    timeout_seconds: float,
) -> LoadSummary:
    """Execute the warmup and measured phases, then summarize the measured results.

    Warmup requests are sent one at a time and discarded. The measured phase
    pre-fills a queue with ``request_count`` random queries and lets ``concurrency``
    workers drain it; only that phase is timed.
    """
    pool_limits = httpx.Limits(max_connections=concurrency, max_keepalive_connections=concurrency)
    measured: list[RequestResult] = []
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout_seconds, limits=pool_limits) as client:
        # Warmup: results are intentionally thrown away.
        for _ in range(warmup):
            await send_search(client, pick_query(queries), rerank=rerank)

        pending: asyncio.Queue[str] = asyncio.Queue()
        for _ in range(request_count):
            pending.put_nowait(pick_query(queries))

        began = time.perf_counter()
        tasks = []
        for _ in range(concurrency):
            tasks.append(asyncio.create_task(worker(client, pending, measured, rerank=rerank)))
        await asyncio.gather(*tasks)
        wall_seconds = time.perf_counter() - began
    return summarize(measured, wall_seconds)
def print_summary(summary: LoadSummary) -> None:
    """Write the load summary to stdout as four ``key=value`` lines.

    The lines cover request counts, wall time and throughput, latency statistics, and
    the per-status-code counts in ascending code order.
    """
    status_parts = [f"{code}={count}" for code, count in sorted(summary.status_counts.items())]
    status_summary = " ".join(status_parts)
    report_lines = (
        f"requests={summary.total} successes={summary.successes} failures={summary.failures}",
        f"wall={summary.wall_seconds:.2f}s throughput={summary.throughput_rps:.1f} req/s",
        (
            f"latency_ms p50={summary.latency_p50_ms:.1f} p90={summary.latency_p90_ms:.1f} "
            f"p95={summary.latency_p95_ms:.1f} p99={summary.latency_p99_ms:.1f} "
            f"mean={summary.latency_mean_ms:.1f} max={summary.latency_max_ms:.1f}"
        ),
        f"status {status_summary}",
    )
    for report_line in report_lines:
        typer.echo(report_line)
def main(
    *,
    base_url: Annotated[str, typer.Option(help="Base URL of the running service")] = "http://127.0.0.1:8070",
    request_count: Annotated[int, typer.Option("--requests", min=0, help="Total requests to send")] = 200,
    concurrency: Annotated[int, typer.Option(min=1, help="Concurrent in-flight requests")] = 10,
    rerank: Annotated[bool, typer.Option(help="Request server-side reranking")] = False,
    warmup: Annotated[int, typer.Option(min=0, help="Warmup requests, not measured")] = 5,
    timeout_seconds: Annotated[float, typer.Option("--timeout", help="Per-request timeout seconds")] = 120.0,
    queries_file: Annotated[str | None, typer.Option(help="Query JSONL file (defaults to the shared set)")] = None,
    log_level: Annotated[str, typer.Option(help="Log level")] = "WARNING",
) -> None:
    """Load test the search endpoint and report latency and throughput.

    The CLI rejects ``--concurrency`` below 1 and negative ``--requests`` or
    ``--warmup``. A concurrency of 0 would start no workers, so the run would
    silently report zero requests instead of failing.
    """
    configure_logger(log_level)
    queries = load_queries(queries_file)
    logger.info(
        "ebook_loadtest_start base_url=%s requests=%s concurrency=%s rerank=%s queries=%s",
        base_url,
        request_count,
        concurrency,
        rerank,
        len(queries),
    )
    # The whole run is driven from one event loop created here.
    summary = asyncio.run(
        run_load(
            base_url=base_url,
            queries=queries,
            request_count=request_count,
            concurrency=concurrency,
            rerank=rerank,
            warmup=warmup,
            timeout_seconds=timeout_seconds,
        )
    )
    print_summary(summary)
# Expose ``main`` as a Typer CLI only when this module is executed as a script.
if __name__ == "__main__":
    typer.run(main)
+18
View File
@@ -0,0 +1,18 @@
"""Tests for the shared query/gold set loader."""
from __future__ import annotations
from python.ebook_search.eval.dataset import load_gold_queries
def test_default_query_set_counts() -> None:
    """The shipped set holds 70 queries: 50 answerable and 20 out-of-corpus."""
    gold = load_gold_queries()
    in_corpus = [item for item in gold if item.answerable]
    out_of_corpus = [item for item in gold if not item.answerable]

    assert len(gold) == 70
    assert len(in_corpus) == 50
    assert len(out_of_corpus) == 20
    assert all(item.query for item in gold)
    # Answerable queries are labeled with at least one source; out-of-corpus ones with none.
    assert all(item.relevant_sources for item in in_corpus)
    assert not any(item.relevant_sources for item in out_of_corpus)
+79
View File
@@ -0,0 +1,79 @@
"""Tests for the load-test runner and its statistics helpers."""
from __future__ import annotations
import asyncio
from typing import TYPE_CHECKING
import pytest
from python.ebook_search.loadtest import RequestResult, load_queries, percentile, run_load, summarize
if TYPE_CHECKING:
from pytest_mock import MockerFixture
def test_load_queries_reads_shared_set() -> None:
    """With no file given, the shared set is loaded as 70 non-empty strings."""
    loaded = load_queries(None)
    assert len(loaded) == 70
    for text in loaded:
        assert isinstance(text, str)
        assert text
def test_percentile_interpolates() -> None:
    """Percentiles interpolate between neighbors, hit the ends exactly, and tolerate empty input."""
    samples = [10.0, 20.0, 30.0, 40.0]
    interpolated = {50: 25.0, 90: 37.0}
    for pct, expected in interpolated.items():
        assert percentile(samples, pct) == pytest.approx(expected)
    endpoints = {0: 10.0, 100: 40.0}
    for pct, expected in endpoints.items():
        assert percentile(samples, pct) == expected
    assert percentile([], 95) == 0.0
def test_summarize_counts_and_throughput() -> None:
    """Three successes and one failure over two seconds give the expected totals."""
    outcomes = [(200, 10.0, True), (200, 20.0, True), (200, 30.0, True), (500, 40.0, False)]
    results = [RequestResult(status_code=code, latency_ms=latency, ok=ok) for code, latency, ok in outcomes]

    summary = summarize(results, wall_seconds=2.0)

    assert (summary.total, summary.successes, summary.failures) == (4, 3, 1)
    assert summary.throughput_rps == pytest.approx(2.0)
    assert summary.latency_max_ms == 40.0
    assert summary.status_counts == {200: 3, 500: 1}
def test_summarize_handles_empty() -> None:
    """No results and zero wall time produce zeros instead of raising."""
    empty_summary = summarize([], wall_seconds=0.0)
    observed = (empty_summary.total, empty_summary.throughput_rps, empty_summary.latency_p95_ms)
    assert observed[0] == 0
    assert observed[1] == 0.0
    assert observed[2] == 0.0
def test_run_load_aggregates_mocked_responses(mocker: MockerFixture) -> None:
    """``run_load`` counts only measured requests while still sending the warmup one."""
    fake_response = mocker.Mock(status_code=200, is_success=True)
    fake_client = mocker.MagicMock()
    # ``async with httpx.AsyncClient(...) as client`` must hand back the same mock.
    fake_client.__aenter__.return_value = fake_client
    fake_client.post = mocker.AsyncMock(return_value=fake_response)
    mocker.patch("python.ebook_search.loadtest.httpx.AsyncClient", return_value=fake_client)

    load_coroutine = run_load(
        base_url="http://test",
        queries=["q1", "q2"],
        request_count=4,
        concurrency=2,
        rerank=False,
        warmup=1,
        timeout_seconds=1.0,
    )
    summary = asyncio.run(load_coroutine)

    assert summary.total == 4
    assert summary.successes == 4
    assert summary.failures == 0
    assert summary.status_counts == {200: 4}
    # 1 warmup request (not measured) plus 4 measured requests.
    assert fake_client.post.await_count == 5