mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-17 21:18:18 -04:00
added bill_token_compression.py
Tested on a sample of 100 bills matching the distribution of our data. Compression saves ~11.5% on prompt tokens; completion/reasoning tokens are roughly equal across the two sets. prompt completion reasoning total compressed 349,460 157,110 112,128 506,570 uncompressed 394,948 154,710 110,080 549,658 delta −45,488 +2,400 +2,048 −43,088
This commit is contained in:
162
python/prompt_bench/bill_token_compression.py
Normal file
162
python/prompt_bench/bill_token_compression.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
"""Lossless-ish text compression for Congressional bill text."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Canonical title-case names for the 50 states plus DC and the inhabited
# territories, as they appear in Congressional bill text.
STATES = (
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
    "Puerto Rico",
    "Guam",
    "American Samoa",
    "District of Columbia",
    "US Virgin Islands",
)

# Case-insensitive pattern -> canonical-name pairs used by
# normalize_state_names().  The \b word boundaries prevent one state name
# from matching inside another word: without them the "Kansas" pattern
# matches the tail of "Arkansas" and rewrites "ARKANSAS" -> "ArKansas".
STATE_PATTERNS = [
    (re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES
]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_state_names(text: str) -> str:
    """Rewrite each recognized state/territory name into its canonical casing."""
    result = text
    for compiled, canonical in STATE_PATTERNS:
        result = compiled.sub(canonical, result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def strip_number_commas(text: str) -> str:
    """Remove thousands-separator commas from numbers (``1,234,567`` -> ``1234567``).

    The lookarounds require the whole number to be a well-formed thousands
    grouping: a candidate that is preceded or followed by another digit
    (e.g. ``1,2345``) is not a valid grouping and is left untouched, rather
    than having only part of it rewritten.
    """
    return re.sub(
        r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)",
        lambda match: match.group().replace(",", ""),
        text,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def strip_horizontal_rules(text: str) -> str:
    """Blank out decorative separator lines made of underscores, dashes, equals, or asterisks."""
    rule_line = re.compile(r"^\s*[_\-=\*]{3,}\s*$", flags=re.MULTILINE)
    return rule_line.sub("", text)
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_double_dashes(text: str) -> str:
    """Turn each ``--`` (the GPO em-dash stand-in) into a single space."""
    return " ".join(text.split("--"))
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_inline_whitespace(text: str) -> str:
    """Squeeze each run of horizontal whitespace to one space, keeping newlines."""
    horizontal_ws = re.compile(r"[^\S\n]+")
    return horizontal_ws.sub(" ", text)
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_blank_lines(text: str) -> str:
    """Reduce any run of three or more newlines to exactly two (one blank line)."""
    excess_newlines = re.compile(r"\n{3,}")
    return excess_newlines.sub("\n\n", text)
|
||||||
|
|
||||||
|
|
||||||
|
def trim_line_edges(text: str) -> str:
    """Drop spaces that sit immediately before or after a newline."""
    without_trailing = re.sub(r" +\n", "\n", text)
    without_leading = re.sub(r"\n +", "\n", without_trailing)
    return without_leading
|
||||||
|
|
||||||
|
|
||||||
|
def shorten_section_markers(text: str) -> str:
    """Rewrite headings like ``Sec. 12.`` into the shorter ``SEC 12`` form."""
    marker = re.compile(r"sec\.\s*(\d+[a-zA-Z]?)\.", flags=re.IGNORECASE)
    return marker.sub(r"SEC \1", text)
|
||||||
|
|
||||||
|
|
||||||
|
def unwrap_parens(text: str) -> str:
    """Drop the parentheses around alphanumeric labels such as ``(a)`` or ``(12)``."""
    label = re.compile(r"\(([a-zA-Z0-9]+)\)")
    return label.sub(r"\1", text)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_typeset_quotes(text: str) -> str:
    """Delete the double-backtick and double-apostrophe quote markers from GPO-typeset text."""
    for quote_marker in ("``", "''"):
        text = text.replace(quote_marker, "")
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_usc_acronym(text: str) -> str:
    """Shorten the dotted ``U.S.C.`` citation form to the bare ``USC``."""
    return "USC".join(text.split("U.S.C."))
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_us_acronym(text: str) -> str:
    """Rewrite the spaced/dotted ``U.S.`` spellings into the bare ``US`` form."""
    return (
        text.replace("U. S.", "US ")
        .replace("u. s.", "US ")
        .replace("U.S. ", "US ")
        .replace("u.s. ", "US ")
    )
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_ellipses(text: str) -> str:
    """Fold any run of two-or-more consecutive periods down to a single one."""
    dot_run = re.compile(r"\.{2,}")
    return dot_run.sub(".", text)
|
||||||
|
|
||||||
|
|
||||||
|
# Ordered pipeline of transforms applied by compress_bill_text().  Order is
# significant: normalize_usc_acronym runs before normalize_us_acronym, and
# inline-whitespace collapsing precedes blank-line collapsing and
# line-edge trimming.
COMPRESSION_STEPS = (
    strip_horizontal_rules,
    collapse_double_dashes,
    collapse_inline_whitespace,
    collapse_blank_lines,
    trim_line_edges,
    shorten_section_markers,
    unwrap_parens,
    strip_typeset_quotes,
    normalize_usc_acronym,
    normalize_us_acronym,
    strip_number_commas,
    collapse_ellipses,
    normalize_state_names,
)
|
||||||
|
|
||||||
|
|
||||||
|
def compress_bill_text(text: str) -> str:
    """Apply lossless-ish whitespace and boilerplate compression to bill text.

    Feeds the text through every transform in :data:`COMPRESSION_STEPS`, in
    order, then strips leading/trailing whitespace from the final result.
    """
    compressed = text
    for transform in COMPRESSION_STEPS:
        compressed = transform(compressed)
    return compressed.strip()
|
||||||
270
python/prompt_bench/compresion_test.py
Normal file
270
python/prompt_bench/compresion_test.py
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
"""Run two interactive OpenAI chat-completion sweeps over bill text.
|
||||||
|
|
||||||
|
Reads the first N bills from a CSV with a `text_content` column and sends two
|
||||||
|
sweeps through `/v1/chat/completions` concurrently — one with the raw bill
|
||||||
|
text, one with the compressed bill text. Each request's prompt is saved to
|
||||||
|
disk alongside the OpenAI response id so the prompts and responses can be
|
||||||
|
correlated later.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from os import getenv
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from python.prompt_bench.bill_token_compression import compress_bill_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Base URL for the OpenAI REST API.
OPENAI_API_BASE = "https://api.openai.com/v1"
# Model used when --model is not supplied on the command line.
DEFAULT_MODEL = "gpt-5.4-mini"
# Number of bills per sweep when --count is not supplied.
DEFAULT_COUNT = 100
# Seed included in every chat-completion request payload.
SEED = 42
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.
|
||||||
|
|
||||||
|
Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.
|
||||||
|
|
||||||
|
EXTRACTION RULES:
|
||||||
|
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
|
||||||
|
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
|
||||||
|
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
|
||||||
|
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
|
||||||
|
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.
|
||||||
|
|
||||||
|
OUTPUT FORMAT — plain structured text, not JSON:
|
||||||
|
|
||||||
|
OPERATIVE ACTIONS:
|
||||||
|
[Numbered list of what the bill actually does, one action per line, max 20 words each]
|
||||||
|
|
||||||
|
AFFECTED POPULATIONS:
|
||||||
|
[Who gains something, who loses something, or whose behavior is regulated]
|
||||||
|
|
||||||
|
MECHANISMS:
|
||||||
|
[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]
|
||||||
|
|
||||||
|
POLICY THREADS:
|
||||||
|
[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]
|
||||||
|
|
||||||
|
SYMBOLIC/PROCEDURAL ONLY:
|
||||||
|
[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]
|
||||||
|
|
||||||
|
LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""
|
||||||
|
|
||||||
|
USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.
|
||||||
|
|
||||||
|
BILL TEXT:
|
||||||
|
{text_content}"""
|
||||||
|
|
||||||
|
|
||||||
|
def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
    """Return up to `count` (bill_id, text_content) tuples with non-empty text.

    Rows whose `text_content` column is missing or blank are skipped.  The
    identifier is `bill_id` (falling back to `id`, then a positional
    ``row-N`` name) and is suffixed with `version_code` when present.

    Args:
        csv_path: CSV file containing at least a `text_content` column.
        count: Maximum number of bills to return; non-positive yields [].
    """
    if count <= 0:
        return []
    # Bill text routinely exceeds csv's default field-size limit.
    # csv.field_size_limit(sys.maxsize) raises OverflowError on platforms
    # where the C long is narrower than sys.maxsize, so back off until a
    # value is accepted.
    field_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            field_limit //= 2
    bills: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            text_content = (row.get("text_content") or "").strip()
            if not text_content:
                continue
            bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
            version_code = row.get("version_code") or ""
            unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
            bills.append((unique_id, text_content))
            if len(bills) >= count:
                break
    return bills
|
||||||
|
|
||||||
|
|
||||||
|
def build_messages(bill_text: str) -> list[dict]:
    """Assemble the (system, user) message pair for a single bill."""
    user_content = USER_TEMPLATE.format(text_content=bill_text)
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_filename(value: str) -> str:
    """Collapse filesystem-unfriendly characters to underscores."""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
    cleaned = cleaned.strip("_")
    return cleaned if cleaned else "unnamed"
|
||||||
|
|
||||||
|
|
||||||
|
def run_one_request(
    client: httpx.Client,
    *,
    bill_id: str,
    label: str,
    bill_text: str,
    model: str,
    output_path: Path,
) -> tuple[bool, float, str | None]:
    """Send one chat-completion request and persist prompt + response.

    The full request record (messages, model, seed) plus either the response
    body or the error details is written to `output_path` as JSON, so prompts
    and responses can be correlated later via the OpenAI response id.

    Args:
        client: Shared httpx client (carries auth headers and timeout).
        bill_id: Identifier used for logging and the persisted record.
        label: Sweep name ("compressed"/"uncompressed") for logging.
        bill_text: Bill text inserted into the user message.
        model: OpenAI model id for the request payload.
        output_path: Destination file for the JSON record.

    Returns:
        (success, elapsed_seconds, response_id); response_id is None on failure.
    """
    messages = build_messages(bill_text)
    payload = {"model": model, "messages": messages, "seed": SEED}
    record: dict = {
        "bill_id": bill_id,
        "label": label,
        "model": model,
        "seed": SEED,
        "input_chars": len(bill_text),
        "messages": messages,
    }

    def _persist() -> None:
        # ensure_ascii=False keeps non-ASCII bill text readable, so the file
        # must be written as UTF-8 explicitly; the write_text default is the
        # platform locale encoding and can raise UnicodeEncodeError.
        output_path.write_text(
            json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    start = time.monotonic()
    try:
        response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as error:
        # Non-2xx from the API: keep the status and body for post-mortem.
        elapsed = time.monotonic() - start
        record["error"] = {
            "status_code": error.response.status_code,
            "body": error.response.text,
            "elapsed_seconds": elapsed,
        }
        _persist()
        logger.exception("HTTP error for %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None
    except Exception as error:
        # Broad catch is deliberate: a worker must never kill the sweep.
        elapsed = time.monotonic() - start
        record["error"] = {"message": str(error), "elapsed_seconds": elapsed}
        _persist()
        logger.exception("Failed: %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None

    elapsed = time.monotonic() - start
    response_id = body.get("id")
    record["response_id"] = response_id
    record["elapsed_seconds"] = elapsed
    record["usage"] = body.get("usage")
    record["response"] = body
    _persist()
    logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
    return True, elapsed, response_id
|
||||||
|
|
||||||
|
|
||||||
|
def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text.

    Loads up to `count` bills, builds one "compressed" and one "uncompressed"
    request per bill, fans them out over a thread pool, writes a per-request
    JSON record under `output_dir`, and finishes with an overall
    ``summary.json``.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

    # Either env var name is accepted for the API key; fail fast if neither
    # is set or the input CSV is missing.
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)

    # Two parallel output trees, one per sweep.
    compressed_dir = output_dir / "compressed"
    uncompressed_dir = output_dir / "uncompressed"
    compressed_dir.mkdir(parents=True, exist_ok=True)
    uncompressed_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Loading %d bills from %s", count, csv_path)
    bills = load_bills(csv_path, count)
    if len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)

    # Each bill contributes one task per sweep; both tasks share a filename
    # so records can be paired across the compressed/ and uncompressed/ trees.
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
        tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
        tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))

    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)

    headers = {"Authorization": f"Bearer {api_key}"}
    completed = 0
    failed = 0
    index: list[dict] = []
    wall_start = time.monotonic()
    # One shared HTTP client (auth header + generous timeout) is used by all
    # worker threads.
    with (
        httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client,
        ThreadPoolExecutor(
            max_workers=concurrency,
        ) as executor,
    ):
        future_to_task = {
            executor.submit(
                run_one_request,
                client,
                bill_id=bill_id,
                label=label,
                bill_text=bill_text,
                model=model,
                output_path=output_path,
            ): (bill_id, label, output_path)
            for bill_id, label, bill_text, output_path in tasks
        }
        # run_one_request catches its own exceptions and reports
        # (success, elapsed, response_id), so future.result() is not
        # expected to raise here.
        for future in as_completed(future_to_task):
            bill_id, label, output_path = future_to_task[future]
            success, elapsed, response_id = future.result()
            if success:
                completed += 1
            else:
                failed += 1
            index.append(
                {
                    "bill_id": bill_id,
                    "label": label,
                    "response_id": response_id,
                    "elapsed_seconds": elapsed,
                    "success": success,
                    "path": str(output_path),
                },
            )
    wall_elapsed = time.monotonic() - wall_start

    # Machine-readable index of every request plus aggregate counters.
    summary = {
        "model": model,
        "count": len(bills),
        "completed": completed,
        "failed": failed,
        "wall_seconds": wall_elapsed,
        "concurrency": concurrency,
        "results": index,
    }
    summary_path = output_dir / "summary.json"
    # json.dumps' default ensure_ascii keeps this file pure ASCII, so the
    # platform-default encoding of write_text is safe here.
    summary_path.write_text(json.dumps(summary, indent=2))
    logger.info(
        "Done: completed=%d failed=%d wall=%.1fs summary=%s",
        completed,
        failed,
        wall_elapsed,
        summary_path,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def cli() -> None:
    """Typer entry point: wraps main() so it is usable as a console script."""
    typer.run(main)


if __name__ == "__main__":
    cli()
|
||||||
Reference in New Issue
Block a user