# Names whose capitalisation is normalised by ``normalize_state_names``.
STATES = (
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
    "Puerto Rico",
    "Guam",
    "American Samoa",
    "District of Columbia",
    "US Virgin Islands",
)

# One case-insensitive pattern per name. The ``\b`` anchors matter: without
# them "Kansas" rewrites "Arkansas" to "ArKansas" and "Maine" rewrites
# "remained" to "reMained".
STATE_PATTERNS = [
    (re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES
]

# A thousands-separated number that is not embedded in a longer digit run, so
# "12345,678" and "1,2345" are left untouched.
_GROUPED_NUMBER = re.compile(r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)")

# "Sec. 12." / "SEC. 3A." at a word start ("subsec. 3." must not match).
_SECTION_MARKER = re.compile(r"\bsec\.\s*(\d+[a-zA-Z]?)\.", re.IGNORECASE)

# "U.S." / "U. S." (all upper or all lower case) standing alone: not preceded by
# a word character or dot ("E.U.S."), and followed by whitespace or end of text
# so "U.S.C." is left for ``normalize_usc_acronym``.
_US_ACRONYM = re.compile(r"(?<![\w.])(?:U\. ?S\.|u\. ?s\.)(?=\s|$)")


def normalize_state_names(text: str) -> str:
    """Rewrite any casing of the names in ``STATES`` to their canonical casing.

    Only whole-word occurrences are rewritten, so a name that happens to be a
    substring of another word is left alone.
    """
    for pattern, replacement in STATE_PATTERNS:
        text = pattern.sub(replacement, text)
    return text


def strip_number_commas(text: str) -> str:
    """Remove thousands-separator commas from numbers such as ``1,234,567``.

    Digit groups embedded in a longer run of digits are not treated as a
    grouped number and are returned unchanged.
    """
    return _GROUPED_NUMBER.sub(lambda match: match.group().replace(",", ""), text)


def strip_horizontal_rules(text: str) -> str:
    """Blank out lines made only of three or more ``_``, ``-``, ``=`` or ``*`` characters."""
    return re.sub(r"^\s*[_\-=\*]{3,}\s*$", "", text, flags=re.MULTILINE)


def collapse_double_dashes(text: str) -> str:
    """Replace each ``--`` em-dash stand-in with a single space."""
    return text.replace("--", " ")


def collapse_inline_whitespace(text: str) -> str:
    """Collapse each run of non-newline whitespace into one space; newlines are kept."""
    return re.sub(r"[^\S\n]+", " ", text)


def collapse_blank_lines(text: str) -> str:
    """Collapse three or more consecutive newlines into exactly two."""
    return re.sub(r"\n{3,}", "\n\n", text)


def trim_line_edges(text: str) -> str:
    """Remove spaces directly before and directly after every newline."""
    text = re.sub(r" +\n", "\n", text)
    return re.sub(r"\n +", "\n", text)


def shorten_section_markers(text: str) -> str:
    """Rewrite ``Sec. 12.`` style section headings as the more compact ``SEC 12``.

    The match is case-insensitive and must start at a word boundary.
    """
    return _SECTION_MARKER.sub(r"SEC \1", text)


def unwrap_parens(text: str) -> str:
    """Drop the parentheses around a single alphanumeric run, e.g. ``(a)`` or ``(12)``.

    There is no length limit: any parenthesised token made only of ASCII
    letters and digits is unwrapped. Adjacent labels therefore merge, e.g.
    ``2(a)(1)`` becomes ``2a1``.
    """
    return re.sub(r"\(([a-zA-Z0-9]+)\)", r"\1", text)


def strip_typeset_quotes(text: str) -> str:
    """Remove the `` and '' typeset quote markers used in the GPO bill format."""
    return text.replace("``", "").replace("''", "")


def normalize_usc_acronym(text: str) -> str:
    """Rewrite every ``U.S.C.`` as ``USC``."""
    return text.replace("U.S.C.", "USC")


def normalize_us_acronym(text: str) -> str:
    """Rewrite stand-alone ``U.S.`` / ``U. S.`` (upper or lower case) as ``US``.

    Whitespace after the acronym is left as it was, so no double space is
    introduced. ``U.S.C.`` is not touched by this function.
    """
    return _US_ACRONYM.sub("US", text)


def collapse_ellipses(text: str) -> str:
    """Collapse every run of two or more periods into a single period."""
    return re.sub(r"\.{2,}", ".", text)


# Order matters: edges are trimmed before blank lines are collapsed so that
# whitespace-only lines count as blank, and whitespace is normalised before
# the pattern-based rewrites that expect single spaces.
COMPRESSION_STEPS = (
    strip_horizontal_rules,
    collapse_double_dashes,
    collapse_inline_whitespace,
    trim_line_edges,
    collapse_blank_lines,
    shorten_section_markers,
    unwrap_parens,
    strip_typeset_quotes,
    normalize_usc_acronym,
    normalize_us_acronym,
    strip_number_commas,
    collapse_ellipses,
    normalize_state_names,
)


def compress_bill_text(text: str) -> str:
    """Apply lossless-ish whitespace and boilerplate compression to bill text.

    Runs every transform in :data:`COMPRESSION_STEPS` in order, then strips
    leading/trailing whitespace from the final result.
    """
    for step in COMPRESSION_STEPS:
        text = step(text)
    return text.strip()
logger = logging.getLogger(__name__)

OPENAI_API_BASE = "https://api.openai.com/v1"
DEFAULT_MODEL = "gpt-5.4-mini"
DEFAULT_COUNT = 100
SEED = 42

SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.

Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.

EXTRACTION RULES:
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.

OUTPUT FORMAT — plain structured text, not JSON:

OPERATIVE ACTIONS:
[Numbered list of what the bill actually does, one action per line, max 20 words each]

AFFECTED POPULATIONS:
[Who gains something, who loses something, or whose behavior is regulated]

MECHANISMS:
[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]

POLICY THREADS:
[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]

SYMBOLIC/PROCEDURAL ONLY:
[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]

LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""

USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.

BILL TEXT:
{text_content}"""


def _raise_csv_field_limit() -> None:
    """Set the csv field size limit as high as the platform accepts.

    ``csv.field_size_limit`` raises ``OverflowError`` when the value does not
    fit in a C ``long``, so the limit is halved until it is accepted.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
        except OverflowError:
            limit //= 2
        else:
            return


def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
    """Return up to `count` (bill_id, text_content) tuples with non-empty text.

    Rows whose ``text_content`` is missing or blank are skipped. The id is taken
    from the ``bill_id`` column, then ``id``, then a ``row-N`` fallback, and is
    suffixed with ``-<version_code>`` when that column is non-empty. A
    non-positive `count` yields an empty list.
    """
    if count <= 0:
        return []
    _raise_csv_field_limit()
    bills: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            text_content = (row.get("text_content") or "").strip()
            if not text_content:
                continue
            bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
            version_code = row.get("version_code") or ""
            unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
            bills.append((unique_id, text_content))
            if len(bills) >= count:
                break
    return bills


def build_messages(bill_text: str) -> list[dict]:
    """Return the system + user message pair for a bill."""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_TEMPLATE.format(text_content=bill_text)},
    ]


def safe_filename(value: str) -> str:
    """Make a string safe for use as a filename.

    Every run of characters outside ``A-Za-z0-9._-`` becomes one underscore,
    leading/trailing underscores are stripped, and an empty result is replaced
    by ``"unnamed"``.
    """
    return re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_") or "unnamed"


def _write_record(output_path: Path, record: dict) -> None:
    """Write `record` to `output_path` as indented JSON, always UTF-8 encoded.

    The JSON is dumped with ``ensure_ascii=False``, so the encoding is pinned
    rather than left to the locale default.
    """
    output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")


def run_one_request(
    client: httpx.Client,
    *,
    bill_id: str,
    label: str,
    bill_text: str,
    model: str,
    output_path: Path,
) -> tuple[bool, float, str | None]:
    """Send one chat-completion request and persist prompt + response.

    The prompt, and either the response body or the error details, are written
    to `output_path` as JSON. Exceptions raised by the request itself are
    logged and reported through the return value rather than re-raised.

    Returns (success, elapsed_seconds, response_id).
    """
    messages = build_messages(bill_text)
    payload = {
        "model": model,
        "messages": messages,
        "seed": SEED,
    }
    start = time.monotonic()
    record: dict = {
        "bill_id": bill_id,
        "label": label,
        "model": model,
        "seed": SEED,
        "input_chars": len(bill_text),
        "messages": messages,
    }
    try:
        response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as error:
        elapsed = time.monotonic() - start
        record["error"] = {
            "status_code": error.response.status_code,
            "body": error.response.text,
            "elapsed_seconds": elapsed,
        }
        _write_record(output_path, record)
        logger.exception("HTTP error for %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None
    except Exception as error:
        # Deliberately broad: one failed request must not abort the sweep.
        elapsed = time.monotonic() - start
        record["error"] = {"message": str(error), "elapsed_seconds": elapsed}
        _write_record(output_path, record)
        logger.exception("Failed: %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None

    elapsed = time.monotonic() - start
    response_id = body.get("id")
    record["response_id"] = response_id
    record["elapsed_seconds"] = elapsed
    record["usage"] = body.get("usage")
    record["response"] = body
    _write_record(output_path, record)
    logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
    return True, elapsed, response_id


def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text.

    Each loaded bill is submitted twice, once compressed with
    ``compress_bill_text`` and once raw. Per-request JSON goes to the
    ``compressed`` / ``uncompressed`` sub-directories of `output_dir`, and an
    overall ``summary.json`` is written to `output_dir`.

    Raises ``typer.BadParameter`` when no API key is set, the CSV is missing,
    or `count` / `concurrency` is below 1.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)
    if count < 1:
        message = f"count must be at least 1, got {count}"
        raise typer.BadParameter(message)
    if concurrency < 1:
        # ThreadPoolExecutor rejects max_workers <= 0 with a bare ValueError.
        message = f"concurrency must be at least 1, got {concurrency}"
        raise typer.BadParameter(message)

    compressed_dir = output_dir / "compressed"
    uncompressed_dir = output_dir / "uncompressed"
    compressed_dir.mkdir(parents=True, exist_ok=True)
    uncompressed_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Loading %d bills from %s", count, csv_path)
    bills = load_bills(csv_path, count)
    if len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)

    # Two tasks per bill, sharing a file name but written to different directories.
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
        tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
        tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))

    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)

    headers = {"Authorization": f"Bearer {api_key}"}
    completed = 0
    failed = 0
    index: list[dict] = []
    wall_start = time.monotonic()
    with (
        httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client,
        ThreadPoolExecutor(
            max_workers=concurrency,
        ) as executor,
    ):
        future_to_task = {
            executor.submit(
                run_one_request,
                client,
                bill_id=bill_id,
                label=label,
                bill_text=bill_text,
                model=model,
                output_path=output_path,
            ): (bill_id, label, output_path)
            for bill_id, label, bill_text, output_path in tasks
        }
        for future in as_completed(future_to_task):
            bill_id, label, output_path = future_to_task[future]
            success, elapsed, response_id = future.result()
            if success:
                completed += 1
            else:
                failed += 1
            index.append(
                {
                    "bill_id": bill_id,
                    "label": label,
                    "response_id": response_id,
                    "elapsed_seconds": elapsed,
                    "success": success,
                    "path": str(output_path),
                },
            )
    wall_elapsed = time.monotonic() - wall_start

    summary = {
        "model": model,
        "count": len(bills),
        "completed": completed,
        "failed": failed,
        "wall_seconds": wall_elapsed,
        "concurrency": concurrency,
        "results": index,
    }
    summary_path = output_dir / "summary.json"
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    logger.info(
        "Done: completed=%d failed=%d wall=%.1fs summary=%s",
        completed,
        failed,
        wall_elapsed,
        summary_path,
    )


def cli() -> None:
    """Typer entry point."""
    typer.run(main)


if __name__ == "__main__":
    cli()