mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-17 21:18:18 -04:00
added bill_token_compression.py
Tested on a sample of 100 bills matching the distribution of our data. Compression saves ~11.5% on prompt tokens; completion/reasoning tokens are roughly equal across the two sets. prompt completion reasoning total compressed 349,460 157,110 112,128 506,570 uncompressed 394,948 154,710 110,080 549,658 delta −45,488 +2,400 +2,048 −43,088
This commit is contained in:
162
python/prompt_bench/bill_token_compression.py
Normal file
162
python/prompt_bench/bill_token_compression.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
"""Lossless-ish text compression for Congressional bill text."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Canonical title-case names for the 50 states plus DC and the inhabited
# territories, as they appear in Congressional bill text.
STATES = (
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
    "Puerto Rico",
    "Guam",
    "American Samoa",
    "District of Columbia",
    "US Virgin Islands",
)

# Case-insensitive pattern -> canonical-name pairs used by
# normalize_state_names().  The \b word boundaries prevent one state name
# from matching inside another word: without them the "Kansas" pattern
# matches the tail of "Arkansas" and rewrites "ARKANSAS" -> "ArKansas".
STATE_PATTERNS = [
    (re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES
]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_state_names(text: str) -> str:
    """Rewrite each recognized state/territory name into its canonical casing."""
    result = text
    for compiled, canonical in STATE_PATTERNS:
        result = compiled.sub(canonical, result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def strip_number_commas(text: str) -> str:
    """Remove thousands-separator commas from numbers (``1,234,567`` -> ``1234567``).

    The lookarounds require the whole number to be a well-formed thousands
    grouping: a candidate that is preceded or followed by another digit
    (e.g. ``1,2345``) is not a valid grouping and is left untouched, rather
    than having only part of it rewritten.
    """
    return re.sub(
        r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)",
        lambda match: match.group().replace(",", ""),
        text,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def strip_horizontal_rules(text: str) -> str:
    """Blank out decorative separator lines made of underscores, dashes, equals, or asterisks."""
    rule_line = re.compile(r"^\s*[_\-=\*]{3,}\s*$", flags=re.MULTILINE)
    return rule_line.sub("", text)
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_double_dashes(text: str) -> str:
    """Turn each ``--`` (the GPO em-dash stand-in) into a single space."""
    return " ".join(text.split("--"))
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_inline_whitespace(text: str) -> str:
    """Squeeze each run of horizontal whitespace to one space, keeping newlines."""
    horizontal_ws = re.compile(r"[^\S\n]+")
    return horizontal_ws.sub(" ", text)
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_blank_lines(text: str) -> str:
    """Reduce any run of three or more newlines to exactly two (one blank line)."""
    excess_newlines = re.compile(r"\n{3,}")
    return excess_newlines.sub("\n\n", text)
|
||||||
|
|
||||||
|
|
||||||
|
def trim_line_edges(text: str) -> str:
    """Drop spaces that sit immediately before or after a newline."""
    without_trailing = re.sub(r" +\n", "\n", text)
    without_leading = re.sub(r"\n +", "\n", without_trailing)
    return without_leading
|
||||||
|
|
||||||
|
|
||||||
|
def shorten_section_markers(text: str) -> str:
    """Rewrite headings like ``Sec. 12.`` into the shorter ``SEC 12`` form."""
    marker = re.compile(r"sec\.\s*(\d+[a-zA-Z]?)\.", flags=re.IGNORECASE)
    return marker.sub(r"SEC \1", text)
|
||||||
|
|
||||||
|
|
||||||
|
def unwrap_parens(text: str) -> str:
    """Drop the parentheses around alphanumeric labels such as ``(a)`` or ``(12)``."""
    label = re.compile(r"\(([a-zA-Z0-9]+)\)")
    return label.sub(r"\1", text)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_typeset_quotes(text: str) -> str:
    """Delete the double-backtick and double-apostrophe quote markers from GPO-typeset text."""
    for quote_marker in ("``", "''"):
        text = text.replace(quote_marker, "")
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_usc_acronym(text: str) -> str:
    """Shorten the dotted ``U.S.C.`` citation form to the bare ``USC``."""
    return "USC".join(text.split("U.S.C."))
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_us_acronym(text: str) -> str:
    """Rewrite the spaced/dotted ``U.S.`` spellings into the bare ``US`` form."""
    return (
        text.replace("U. S.", "US ")
        .replace("u. s.", "US ")
        .replace("U.S. ", "US ")
        .replace("u.s. ", "US ")
    )
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_ellipses(text: str) -> str:
    """Fold any run of two-or-more consecutive periods down to a single one."""
    dot_run = re.compile(r"\.{2,}")
    return dot_run.sub(".", text)
|
||||||
|
|
||||||
|
|
||||||
|
# Ordered pipeline of transforms applied by compress_bill_text().  Order is
# significant: normalize_usc_acronym runs before normalize_us_acronym, and
# inline-whitespace collapsing precedes blank-line collapsing and
# line-edge trimming.
COMPRESSION_STEPS = (
    strip_horizontal_rules,
    collapse_double_dashes,
    collapse_inline_whitespace,
    collapse_blank_lines,
    trim_line_edges,
    shorten_section_markers,
    unwrap_parens,
    strip_typeset_quotes,
    normalize_usc_acronym,
    normalize_us_acronym,
    strip_number_commas,
    collapse_ellipses,
    normalize_state_names,
)
|
||||||
|
|
||||||
|
|
||||||
|
def compress_bill_text(text: str) -> str:
    """Apply lossless-ish whitespace and boilerplate compression to bill text.

    Feeds the text through every transform in :data:`COMPRESSION_STEPS`, in
    order, then strips leading/trailing whitespace from the final result.
    """
    compressed = text
    for transform in COMPRESSION_STEPS:
        compressed = transform(compressed)
    return compressed.strip()
|
||||||
270
python/prompt_bench/compresion_test.py
Normal file
270
python/prompt_bench/compresion_test.py
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
"""Run two interactive OpenAI chat-completion sweeps over bill text.
|
||||||
|
|
||||||
|
Reads the first N bills from a CSV with a `text_content` column and sends two
|
||||||
|
sweeps through `/v1/chat/completions` concurrently — one with the raw bill
|
||||||
|
text, one with the compressed bill text. Each request's prompt is saved to
|
||||||
|
disk alongside the OpenAI response id so the prompts and responses can be
|
||||||
|
correlated later.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from os import getenv
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from python.prompt_bench.bill_token_compression import compress_bill_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Base URL for the OpenAI REST API.
OPENAI_API_BASE = "https://api.openai.com/v1"
# Model used when --model is not supplied on the command line.
DEFAULT_MODEL = "gpt-5.4-mini"
# Number of bills per sweep when --count is not supplied.
DEFAULT_COUNT = 100
# Seed included in every chat-completion request payload.
SEED = 42
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.
|
||||||
|
|
||||||
|
Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.
|
||||||
|
|
||||||
|
EXTRACTION RULES:
|
||||||
|
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
|
||||||
|
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
|
||||||
|
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
|
||||||
|
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
|
||||||
|
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.
|
||||||
|
|
||||||
|
OUTPUT FORMAT — plain structured text, not JSON:
|
||||||
|
|
||||||
|
OPERATIVE ACTIONS:
|
||||||
|
[Numbered list of what the bill actually does, one action per line, max 20 words each]
|
||||||
|
|
||||||
|
AFFECTED POPULATIONS:
|
||||||
|
[Who gains something, who loses something, or whose behavior is regulated]
|
||||||
|
|
||||||
|
MECHANISMS:
|
||||||
|
[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]
|
||||||
|
|
||||||
|
POLICY THREADS:
|
||||||
|
[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]
|
||||||
|
|
||||||
|
SYMBOLIC/PROCEDURAL ONLY:
|
||||||
|
[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]
|
||||||
|
|
||||||
|
LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""
|
||||||
|
|
||||||
|
USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.
|
||||||
|
|
||||||
|
BILL TEXT:
|
||||||
|
{text_content}"""
|
||||||
|
|
||||||
|
|
||||||
|
def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
    """Return up to `count` (bill_id, text_content) tuples with non-empty text.

    Rows whose `text_content` column is missing or blank are skipped.  The
    identifier is `bill_id` (falling back to `id`, then a positional
    ``row-N`` name) and is suffixed with `version_code` when present.

    Args:
        csv_path: CSV file containing at least a `text_content` column.
        count: Maximum number of bills to return; non-positive yields [].
    """
    if count <= 0:
        return []
    # Bill text routinely exceeds csv's default field-size limit.
    # csv.field_size_limit(sys.maxsize) raises OverflowError on platforms
    # where the C long is narrower than sys.maxsize, so back off until a
    # value is accepted.
    field_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            field_limit //= 2
    bills: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            text_content = (row.get("text_content") or "").strip()
            if not text_content:
                continue
            bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
            version_code = row.get("version_code") or ""
            unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
            bills.append((unique_id, text_content))
            if len(bills) >= count:
                break
    return bills
|
||||||
|
|
||||||
|
|
||||||
|
def build_messages(bill_text: str) -> list[dict]:
    """Assemble the (system, user) message pair for a single bill."""
    user_content = USER_TEMPLATE.format(text_content=bill_text)
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_filename(value: str) -> str:
    """Collapse filesystem-unfriendly characters to underscores."""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
    cleaned = cleaned.strip("_")
    return cleaned if cleaned else "unnamed"
|
||||||
|
|
||||||
|
|
||||||
|
def run_one_request(
    client: httpx.Client,
    *,
    bill_id: str,
    label: str,
    bill_text: str,
    model: str,
    output_path: Path,
) -> tuple[bool, float, str | None]:
    """Send one chat-completion request and persist prompt + response.

    The full request record (messages, model, seed) plus either the response
    body or the error details is written to `output_path` as JSON, so prompts
    and responses can be correlated later via the OpenAI response id.

    Args:
        client: Shared httpx client (carries auth headers and timeout).
        bill_id: Identifier used for logging and the persisted record.
        label: Sweep name ("compressed"/"uncompressed") for logging.
        bill_text: Bill text inserted into the user message.
        model: OpenAI model id for the request payload.
        output_path: Destination file for the JSON record.

    Returns:
        (success, elapsed_seconds, response_id); response_id is None on failure.
    """
    messages = build_messages(bill_text)
    payload = {"model": model, "messages": messages, "seed": SEED}
    record: dict = {
        "bill_id": bill_id,
        "label": label,
        "model": model,
        "seed": SEED,
        "input_chars": len(bill_text),
        "messages": messages,
    }

    def _persist() -> None:
        # ensure_ascii=False keeps non-ASCII bill text readable, so the file
        # must be written as UTF-8 explicitly; the write_text default is the
        # platform locale encoding and can raise UnicodeEncodeError.
        output_path.write_text(
            json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    start = time.monotonic()
    try:
        response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as error:
        # Non-2xx from the API: keep the status and body for post-mortem.
        elapsed = time.monotonic() - start
        record["error"] = {
            "status_code": error.response.status_code,
            "body": error.response.text,
            "elapsed_seconds": elapsed,
        }
        _persist()
        logger.exception("HTTP error for %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None
    except Exception as error:
        # Broad catch is deliberate: a worker must never kill the sweep.
        elapsed = time.monotonic() - start
        record["error"] = {"message": str(error), "elapsed_seconds": elapsed}
        _persist()
        logger.exception("Failed: %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None

    elapsed = time.monotonic() - start
    response_id = body.get("id")
    record["response_id"] = response_id
    record["elapsed_seconds"] = elapsed
    record["usage"] = body.get("usage")
    record["response"] = body
    _persist()
    logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
    return True, elapsed, response_id
|
||||||
|
|
||||||
|
|
||||||
|
def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text.

    Loads up to `count` bills, builds one "compressed" and one "uncompressed"
    request per bill, fans them out over a thread pool, writes a per-request
    JSON record under `output_dir`, and finishes with an overall
    ``summary.json``.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

    # Either env var name is accepted for the API key; fail fast if neither
    # is set or the input CSV is missing.
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)

    # Two parallel output trees, one per sweep.
    compressed_dir = output_dir / "compressed"
    uncompressed_dir = output_dir / "uncompressed"
    compressed_dir.mkdir(parents=True, exist_ok=True)
    uncompressed_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Loading %d bills from %s", count, csv_path)
    bills = load_bills(csv_path, count)
    if len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)

    # Each bill contributes one task per sweep; both tasks share a filename
    # so records can be paired across the compressed/ and uncompressed/ trees.
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
        tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
        tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))

    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)

    headers = {"Authorization": f"Bearer {api_key}"}
    completed = 0
    failed = 0
    index: list[dict] = []
    wall_start = time.monotonic()
    # One shared HTTP client (auth header + generous timeout) is used by all
    # worker threads.
    with (
        httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client,
        ThreadPoolExecutor(
            max_workers=concurrency,
        ) as executor,
    ):
        future_to_task = {
            executor.submit(
                run_one_request,
                client,
                bill_id=bill_id,
                label=label,
                bill_text=bill_text,
                model=model,
                output_path=output_path,
            ): (bill_id, label, output_path)
            for bill_id, label, bill_text, output_path in tasks
        }
        # run_one_request catches its own exceptions and reports
        # (success, elapsed, response_id), so future.result() is not
        # expected to raise here.
        for future in as_completed(future_to_task):
            bill_id, label, output_path = future_to_task[future]
            success, elapsed, response_id = future.result()
            if success:
                completed += 1
            else:
                failed += 1
            index.append(
                {
                    "bill_id": bill_id,
                    "label": label,
                    "response_id": response_id,
                    "elapsed_seconds": elapsed,
                    "success": success,
                    "path": str(output_path),
                },
            )
    wall_elapsed = time.monotonic() - wall_start

    # Machine-readable index of every request plus aggregate counters.
    summary = {
        "model": model,
        "count": len(bills),
        "completed": completed,
        "failed": failed,
        "wall_seconds": wall_elapsed,
        "concurrency": concurrency,
        "results": index,
    }
    summary_path = output_dir / "summary.json"
    # json.dumps' default ensure_ascii keeps this file pure ASCII, so the
    # platform-default encoding of write_text is safe here.
    summary_path.write_text(json.dumps(summary, indent=2))
    logger.info(
        "Done: completed=%d failed=%d wall=%.1fs summary=%s",
        completed,
        failed,
        wall_elapsed,
        summary_path,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def cli() -> None:
    """Typer entry point: wraps main() so it is usable as a console script."""
    typer.run(main)


if __name__ == "__main__":
    cli()
|
||||||
Reference in New Issue
Block a user