feat(ebook-search): add load-test CLI for the search service

Add a Typer CLI script that drives POST /search on a running server at a configurable concurrency and reports latency percentiles (p50/p90/p95/p99), throughput, and HTTP status distribution. Queries are drawn from the shared eval JSONL set so load testing and evaluation exercise the same questions.
2026-06-18 12:39:55 -04:00
parent dbc6b5b53b
commit 6ae1ff1f5c
7 changed files with 436 additions and 0 deletions
@@ -96,6 +96,8 @@ def search(
        timings=(*response.timings, runtime_step_from_start(answer_step_name, answer_start)),
    )
    for step in response.timings:
        logger.info("ebook_search_timing step=%r runtime_ms=%.1f", step.name, step.duration_ms)
    logger.info(
        "ebook_search_request_complete results=%s rank_label=%s runtime_ms=%.1f",
        len(response.results),
@@ -0,0 +1 @@
 """Offline evaluation tooling for the ebook search pipeline."""
@@ -0,0 +1,71 @@
 {"query": "Who is Damien Montgomery and how does he become a Jump Mage?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What is a Rune Wright and why is Damien so rare?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "How does jump magic let starships travel faster than light?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What is the role of the Mage-King of Mars in the Protectorate?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What happened aboard the Blue Jay in the first Starship's Mage book?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "Who is Captain David Rice?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "How are amplifiers and simulacrums used to power a ship's jump?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What duties does a Hand of the Mage-King carry out?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "Explain the structure of the Royal Martian Navy.", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "How do mages carve runes to enchant a starship?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What threat do the Legatan rebels pose to the Protectorate?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "How does Damien handle his first command?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What is the significance of the simulacrum on a jump ship?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "Describe a mage duel in the Starship's Mage series.", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What moral conflicts does Damien face as a Hand of the Mage-King?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "How does the Protectorate keep peace among its member worlds?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "Who is the Keeper of Oaths and how does Damien work with them?", "answer": null, "answerable": true, "relevant_sources": ["Starship's Mage"]}
 {"query": "What event is known as the Onset and how does it change the world?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "Who is the main character at the start of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How do survivors adapt after the Onset begins?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What new abilities emerge during the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "Describe the primary antagonist in the Onset series.", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How does society collapse and reorganize after the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What factions form in the aftermath of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How does the protagonist gain power throughout the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What is the cause or origin of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "Describe an early survival challenge faced after the Onset.", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How do the characters defend their stronghold during the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What relationships drive the protagonist's choices in the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How does the Onset escalate by the end of the first book?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What mysteries about the Onset remain unresolved?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How do the rules of the world change once the Onset takes hold?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "What weapons or tactics work best against the threats of the Onset?", "answer": null, "answerable": true, "relevant_sources": ["The Onset"]}
 {"query": "How does Bob Johansson become a von Neumann probe?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What is a replicant and why do Bob's copies have different personalities?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "Who are Riker, Homer, and Bill among the Bob clones?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What is GUPPI and how does Bob use it?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "Describe the threat posed by the Others.", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How does Bob protect and uplift the Deltans?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "Why do the replicants drift apart in personality over time?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What is the role of FAITH and the Brazilian Empire on Earth?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How does subspace communication work for the Bobs?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What happens to Bender after he goes missing?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How do the Bobs build self-replicating probes across the galaxy?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How does Bob evacuate humanity after Earth becomes uninhabitable?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "Describe the conflict between different factions of Bobs.", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What ethical dilemmas does Bob face when interfering with primitive species?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How does the original Bob differ from later generations of clones?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "How do the Bobs defeat the Others' system-harvesting fleets?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 {"query": "What role does Howard play in the human colonies?", "answer": null, "answerable": true, "relevant_sources": ["We Are Legion (We Are Bob)"]}
 // queries not in the dataset
 {"query": "How does Frodo destroy the One Ring in The Lord of the Rings?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "Who killed Dumbledore in Harry Potter and the Half-Blood Prince?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What house does Tyrion Lannister belong to in A Game of Thrones?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How does Paul Atreides control the spice on Arrakis in Dune?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What does the green light at the end of the dock mean in The Great Gatsby?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "Why does Hester Prynne wear a scarlet letter?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What does the white whale represent in Moby-Dick?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How does Elizabeth Bennet's view of Mr. Darcy change in Pride and Prejudice?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What crime does Raskolnikov commit in Crime and Punishment?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How does Katniss volunteer for the Hunger Games?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What is Winston Smith's job in Nineteen Eighty-Four?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "Who is Atticus Finch defending in To Kill a Mockingbird?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What is the capital of Australia?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How do I bake a sourdough loaf from scratch?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "Explain how photosynthesis converts sunlight into energy.", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What were the main causes of World War I?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How does compound interest work?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "How do I change a flat tire on a car?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What is the boiling point of water at sea level?", "answer": null, "answerable": false, "relevant_sources": []}
 {"query": "What is the recommended daily intake of vitamin D?", "answer": null, "answerable": false, "relevant_sources": []}
@@ -0,0 +1,47 @@
 """Shared query set loading for evaluation and load testing.
 Each JSONL record has a ``query`` and an optional reference ``answer``. ``answerable``
 marks whether the query should be answerable from the library (false for out-of-corpus
 "garbage" queries used to test the refusal path). Relevance for retrieval metrics is
 labeled at source (book) granularity in ``relevant_sources``; source titles must match
 ``ebook_source.title`` values for the indexed corpus.
 """
 from __future__ import annotations
 import json
 from dataclasses import dataclass
 from pathlib import Path
 # Default query set: ``data/queries.jsonl`` in the directory that contains this module.
 DEFAULT_QUERIES_PATH = Path(__file__).parent / "data" / "queries.jsonl"
@dataclass(frozen=True)
 class GoldQuery:
    """One labeled query shared by the eval and load-test tools.

    Instances are immutable (``frozen=True``); ``load_gold_queries`` builds one per
    JSONL record.
    """
    # The question text.
    query: str
    # Optional reference answer; ``None`` when the record carries no answer.
    answer: str | None
    # Whether the query should be answerable from the library.
    answerable: bool
    # Source (book) titles labeled as relevant to the query.
    relevant_sources: tuple[str, ...]
    # Values of the record's ``relevant_substrings`` key (empty when the key is absent).
    # NOTE(review): presumably text snippets expected in relevant passages - confirm.
    relevant_substrings: tuple[str, ...]
 def load_gold_queries(path: Path = DEFAULT_QUERIES_PATH) -> list[GoldQuery]:
    """Load labeled queries from a JSONL file.

    Each non-blank line is parsed as one JSON object. Lines whose first
    non-whitespace characters are ``//`` or ``#`` are treated as comments and
    skipped, so a query file may carry section markers between records.

    Args:
        path: JSONL file to read (UTF-8). Defaults to the shared query set.

    Returns:
        One ``GoldQuery`` per record, in file order.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If a non-comment line is not valid JSON.
        KeyError: If a record has no ``query`` key.
    """
    queries: list[GoldQuery] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        # No JSON value can start with "//" or "#", so skipping such lines can
        # never drop a real record; without this the comment line would make
        # json.loads raise and abort the whole load.
        if not stripped or stripped.startswith(("//", "#")):
            continue
        record = json.loads(stripped)
        queries.append(
            GoldQuery(
                query=str(record["query"]),
                answer=record.get("answer"),
                # Records without an explicit flag are treated as answerable.
                answerable=bool(record.get("answerable", True)),
                relevant_sources=tuple(record.get("relevant_sources", ())),
                relevant_substrings=tuple(record.get("relevant_substrings", ())),
            )
        )
    return queries
@@ -0,0 +1,218 @@
 """Load test for the EPUB search service.
 Drives ``POST /search`` on a running server at a configurable concurrency and reports
 latency percentiles, throughput, and HTTP status distribution. Queries are drawn from
 the shared JSONL set (see ``eval/data/queries.jsonl``) that the eval also uses, so load
 and evaluation exercise the same questions. Answer generation and reranking happen
 server-side, so this exercises the full retrieval pipeline.
 """
 from __future__ import annotations
 import asyncio
 import logging
 import math
 import random
 import statistics
 import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated
 import httpx
 import typer
 from python.common import configure_logger
 from python.ebook_search.eval.dataset import DEFAULT_QUERIES_PATH, load_gold_queries
 # Module-level logger; carries the start-of-run info line and per-request failure warnings.
 logger = logging.getLogger(__name__)
@dataclass(frozen=True)
 class RequestResult:
    """Outcome of a single search request.

    Instances are immutable (``frozen=True``); ``send_search`` returns one per request.
    """
    # HTTP status code of the response, or 0 when the request raised ``httpx.HTTPError``.
    status_code: int
    # Elapsed time of the request in milliseconds.
    latency_ms: float
    # ``response.is_success`` for completed requests; ``False`` when the request raised.
    ok: bool
@dataclass(frozen=True)
 class LoadSummary:
    """Aggregate results of a load test run.

    Built by ``summarize`` from the per-request results and the measured wall time.
    """
    # Number of measured requests.
    total: int
    # Requests whose result was marked ``ok``.
    successes: int
    # ``total`` minus ``successes``.
    failures: int
    # Wall-clock duration of the measured phase, in seconds.
    wall_seconds: float
    # ``total / wall_seconds``; 0.0 when ``wall_seconds`` is not positive.
    throughput_rps: float
    # Linearly-interpolated latency percentiles in milliseconds (0.0 when there are no results).
    latency_p50_ms: float
    latency_p90_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    # Arithmetic mean of the latencies in milliseconds (0.0 when there are no results).
    latency_mean_ms: float
    # Largest observed latency in milliseconds (0.0 when there are no results).
    latency_max_ms: float
    # Number of results per status code; code 0 stands for requests that raised.
    status_counts: dict[int, int]
 def load_queries(queries_file: str | None) -> list[str]:
    """Return the query texts to replay during the load test.

    Args:
        queries_file: Path to a custom JSONL query file. ``None`` or an empty
            string selects the shared default set.

    Returns:
        The ``query`` string of every record, in file order.

    Raises:
        typer.BadParameter: If the file yields no queries.
    """
    source = DEFAULT_QUERIES_PATH if not queries_file else Path(queries_file)
    texts = [entry.query for entry in load_gold_queries(source)]
    if texts:
        return texts
    # An empty pool would make random sampling fail later; report it as a CLI error.
    msg = f"No queries found in {source}"
    raise typer.BadParameter(msg)
 def pick_query(queries: list[str]) -> str:
    """Return a uniformly random query from the pool (not a security context).

    Sampling uses the shared module-level ``random`` generator, so results are
    reproducible only if the caller seeds it. ``random.choice`` raises
    ``IndexError`` for an empty pool.
    """
    return random.choice(queries)  # noqa: S311 load-test query sampling is not security-sensitive
 def percentile(values_sorted: list[float], pct: float) -> float:
    """Compute a percentile of pre-sorted data by linear interpolation.

    Args:
        values_sorted: Values in ascending order; the function does not sort them.
        pct: Percentile to compute, on a 0-100 scale.

    Returns:
        The interpolated value, or 0.0 when ``values_sorted`` is empty.
    """
    if not values_sorted:
        return 0.0
    # Fractional index into the sorted values: 0 maps to the first, 100 to the last.
    position = (pct / 100) * (len(values_sorted) - 1)
    lower_index = math.floor(position)
    upper_index = math.ceil(position)
    lower_value = values_sorted[lower_index]
    if lower_index == upper_index:
        # The position lands exactly on an element, so there is nothing to interpolate.
        return lower_value
    return lower_value + (values_sorted[upper_index] - lower_value) * (position - lower_index)
 def summarize(results: list[RequestResult], wall_seconds: float) -> LoadSummary:
    """Build the aggregate summary for a finished run.

    Args:
        results: Per-request outcomes, in any order.
        wall_seconds: Wall-clock duration of the measured phase.

    Returns:
        A ``LoadSummary``; every latency figure and the throughput fall back to
        0.0 when there is nothing to compute them from.
    """
    total = len(results)
    ordered_latencies = sorted(item.latency_ms for item in results)
    ok_count = len([item for item in results if item.ok])

    # Tally status codes in first-seen order.
    by_status: dict[int, int] = {}
    for item in results:
        by_status.setdefault(item.status_code, 0)
        by_status[item.status_code] += 1

    if ordered_latencies:
        mean_ms = statistics.fmean(ordered_latencies)
        max_ms = ordered_latencies[-1]
    else:
        mean_ms = 0.0
        max_ms = 0.0

    # Guard against a zero-length run so throughput never divides by zero.
    rps = total / wall_seconds if wall_seconds > 0 else 0.0

    return LoadSummary(
        total=total,
        successes=ok_count,
        failures=total - ok_count,
        wall_seconds=wall_seconds,
        throughput_rps=rps,
        latency_p50_ms=percentile(ordered_latencies, 50),
        latency_p90_ms=percentile(ordered_latencies, 90),
        latency_p95_ms=percentile(ordered_latencies, 95),
        latency_p99_ms=percentile(ordered_latencies, 99),
        latency_mean_ms=mean_ms,
        latency_max_ms=max_ms,
        status_counts=by_status,
    )
 async def send_search(client: httpx.AsyncClient, query: str, *, rerank: bool) -> RequestResult:
    """POST one query to ``/search`` and time it.

    Args:
        client: Client the request is sent through.
        query: Query text, sent as the ``query`` form field.
        rerank: When true, a ``rerank=true`` form field is added.

    Returns:
        The response status, elapsed milliseconds and success flag. A request
        that raises ``httpx.HTTPError`` is logged and reported with status 0.
    """
    form: dict[str, str] = {"query": query}
    if rerank:
        form["rerank"] = "true"

    started = time.perf_counter()
    try:
        response = await client.post("/search", data=form)
    except httpx.HTTPError as error:
        logger.warning("ebook_loadtest_request_failed error=%s", error)
        failed_ms = (time.perf_counter() - started) * 1000
        return RequestResult(status_code=0, latency_ms=failed_ms, ok=False)

    elapsed_ms = (time.perf_counter() - started) * 1000
    return RequestResult(status_code=response.status_code, latency_ms=elapsed_ms, ok=response.is_success)
 async def worker(
    client: httpx.AsyncClient,
    queue: asyncio.Queue[str],
    results: list[RequestResult],
    *,
    rerank: bool,
 ) -> None:
    """Drain the shared queue, sending one request per queued query.

    Each outcome is appended to ``results``; the coroutine returns once the
    queue has no items left.
    """
    # No await sits between the emptiness check and the get, so another worker
    # cannot take the item in between.
    while not queue.empty():
        next_query = queue.get_nowait()
        outcome = await send_search(client, next_query, rerank=rerank)
        results.append(outcome)
 async def run_load(
    *,
    base_url: str,
    queries: list[str],
    request_count: int,
    concurrency: int,
    rerank: bool,
    warmup: int,
    timeout_seconds: float,
 ) -> LoadSummary:
    """Execute one load-test run against the service.

    Warmup requests are sent one at a time and discarded. The measured phase
    then pre-fills a queue with ``request_count`` randomly chosen queries and
    lets ``concurrency`` workers drain it.

    Returns:
        The summary of the measured requests only.
    """
    # Size the connection pool to the worker count so each worker can hold a connection.
    pool_limits = httpx.Limits(max_connections=concurrency, max_keepalive_connections=concurrency)
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout_seconds, limits=pool_limits) as client:
        for _ in range(warmup):
            await send_search(client, pick_query(queries), rerank=rerank)

        pending: asyncio.Queue[str] = asyncio.Queue()
        for _ in range(request_count):
            pending.put_nowait(pick_query(queries))

        measured: list[RequestResult] = []
        started = time.perf_counter()
        tasks = []
        for _ in range(concurrency):
            tasks.append(asyncio.create_task(worker(client, pending, measured, rerank=rerank)))
        await asyncio.gather(*tasks)
        wall_seconds = time.perf_counter() - started

    return summarize(measured, wall_seconds)
 def print_summary(summary: LoadSummary) -> None:
    """Write the summary to stdout as four ``key=value`` lines.

    The lines cover request counts, wall time and throughput, latency
    statistics, and the per-status tally sorted by status code.
    """
    counts_line = f"requests={summary.total} successes={summary.successes} failures={summary.failures}"
    timing_line = f"wall={summary.wall_seconds:.2f}s throughput={summary.throughput_rps:.1f} req/s"
    latency_line = (
        f"latency_ms p50={summary.latency_p50_ms:.1f} p90={summary.latency_p90_ms:.1f} "
        f"p95={summary.latency_p95_ms:.1f} p99={summary.latency_p99_ms:.1f} "
        f"mean={summary.latency_mean_ms:.1f} max={summary.latency_max_ms:.1f}"
    )
    status_parts = [f"{code}={count}" for code, count in sorted(summary.status_counts.items())]
    status_summary = " ".join(status_parts)
    status_line = f"status {status_summary}"

    for line in (counts_line, timing_line, latency_line, status_line):
        typer.echo(line)
 # CLI entry point. The docstring below is what Typer shows as ``--help`` text,
 # so it is kept short and the implementation notes live in comments.
 #
 # The numeric options carry lower bounds: with a concurrency of zero no worker
 # is started, so every queued request would be dropped and an all-zero summary
 # printed as if the run had succeeded. Typer now rejects such values up front.
 def main(
    *,
    base_url: Annotated[str, typer.Option(help="Base URL of the running service")] = "http://127.0.0.1:8070",
    request_count: Annotated[int, typer.Option("--requests", min=1, help="Total requests to send")] = 200,
    concurrency: Annotated[int, typer.Option(min=1, help="Concurrent in-flight requests")] = 10,
    rerank: Annotated[bool, typer.Option(help="Request server-side reranking")] = False,
    warmup: Annotated[int, typer.Option(min=0, help="Warmup requests, not measured")] = 5,
    timeout_seconds: Annotated[float, typer.Option("--timeout", help="Per-request timeout seconds")] = 120.0,
    queries_file: Annotated[str | None, typer.Option(help="Query JSONL file (defaults to the shared set)")] = None,
    log_level: Annotated[str, typer.Option(help="Log level")] = "WARNING",
 ) -> None:
    """Load test the search endpoint and report latency and throughput."""
    configure_logger(log_level)
    # Load the query pool before any network activity so a bad file fails fast.
    queries = load_queries(queries_file)
    logger.info(
        "ebook_loadtest_start base_url=%s requests=%s concurrency=%s rerank=%s queries=%s",
        base_url,
        request_count,
        concurrency,
        rerank,
        len(queries),
    )
    # The whole run lives in a single event loop created for this call.
    summary = asyncio.run(
        run_load(
            base_url=base_url,
            queries=queries,
            request_count=request_count,
            concurrency=concurrency,
            rerank=rerank,
            warmup=warmup,
            timeout_seconds=timeout_seconds,
        )
    )
    print_summary(summary)
 # Script entry point: Typer builds the command-line interface from ``main``'s signature.
 if __name__ == "__main__":
    typer.run(main)
@@ -0,0 +1,18 @@
 """Tests for the shared query/gold set loader."""
 from __future__ import annotations
 from python.ebook_search.eval.dataset import load_gold_queries
 def test_default_query_set_counts() -> None:
    """The shared set holds 70 queries: 50 answerable and 20 out-of-corpus."""
    gold = load_gold_queries()
    in_corpus = [item for item in gold if item.answerable]
    out_of_corpus = [item for item in gold if not item.answerable]

    assert len(gold) == 70
    assert len(in_corpus) == 50
    assert len(gold) - len(in_corpus) == 20
    assert all(item.query for item in gold)
    # Answerable queries carry at least one source; garbage queries carry none.
    assert all(item.relevant_sources for item in in_corpus)
    assert not any(item.relevant_sources for item in out_of_corpus)
@@ -0,0 +1,79 @@
 """Tests for the load-test runner and its statistics helpers."""
 from __future__ import annotations
 import asyncio
 from typing import TYPE_CHECKING
 import pytest
 from python.ebook_search.loadtest import RequestResult, load_queries, percentile, run_load, summarize
 if TYPE_CHECKING:
    from pytest_mock import MockerFixture
 def test_load_queries_reads_shared_set() -> None:
    """With no custom file, every one of the 70 shared queries is a non-empty string."""
    texts = load_queries(None)
    assert len(texts) == 70
    for text in texts:
        assert isinstance(text, str)
        assert text
 def test_percentile_interpolates() -> None:
    """Percentiles interpolate between neighbors, hit the ends exactly, and default to 0.0."""
    samples = [10.0, 20.0, 30.0, 40.0]

    # Positions that fall between two elements are compared approximately.
    for pct, expected in ((50, 25.0), (90, 37.0)):
        assert percentile(samples, pct) == pytest.approx(expected)

    # The extremes land exactly on the first and last element.
    for pct, expected in ((0, 10.0), (100, 40.0)):
        assert percentile(samples, pct) == expected

    assert percentile([], 95) == 0.0
 def test_summarize_counts_and_throughput() -> None:
    """Three successes and one failure over two seconds give 2 req/s and a split tally."""
    outcomes = [(200, 10.0, True), (200, 20.0, True), (200, 30.0, True), (500, 40.0, False)]
    results = [RequestResult(status_code=code, latency_ms=ms, ok=ok) for code, ms, ok in outcomes]

    summary = summarize(results, wall_seconds=2.0)

    assert summary.total == 4
    assert summary.successes == 3
    assert summary.failures == 1
    assert summary.throughput_rps == pytest.approx(2.0)
    assert summary.latency_max_ms == 40.0
    assert summary.status_counts == {200: 3, 500: 1}
 def test_summarize_handles_empty() -> None:
    """An empty run with zero wall time summarizes to zeros instead of raising."""
    empty_summary = summarize([], wall_seconds=0.0)
    assert empty_summary.total == 0
    assert empty_summary.throughput_rps == 0.0
    assert empty_summary.latency_p95_ms == 0.0
 def test_run_load_aggregates_mocked_responses(mocker: MockerFixture) -> None:
    """``run_load`` measures only the post-warmup requests made through a mocked client."""
    # The mock acts as its own async context manager and answers every POST with a 200.
    fake_client = mocker.MagicMock()
    fake_client.__aenter__.return_value = fake_client
    fake_client.post = mocker.AsyncMock(return_value=mocker.Mock(status_code=200, is_success=True))
    mocker.patch("python.ebook_search.loadtest.httpx.AsyncClient", return_value=fake_client)

    load_coroutine = run_load(
        base_url="http://test",
        queries=["q1", "q2"],
        request_count=4,
        concurrency=2,
        rerank=False,
        warmup=1,
        timeout_seconds=1.0,
    )
    summary = asyncio.run(load_coroutine)

    assert summary.total == 4
    assert summary.successes == 4
    assert summary.failures == 0
    assert summary.status_counts == {200: 4}
    # 1 warmup request (not measured) plus 4 measured requests.
    assert fake_client.post.await_count == 5
		`@@ -0,0 +1 @@`
							`"""Offline evaluation tooling for the ebook search pipeline."""`