mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-17 13:08:19 -04:00
added bill_token_compression.py
Tested on a sample of 100 bills matching the distribution of our data. Compression saves ~11.5% on prompt tokens; completion/reasoning tokens are roughly equal across the two sets. prompt completion reasoning total compressed 349,460 157,110 112,128 506,570 uncompressed 394,948 154,710 110,080 549,658 delta −45,488 +2,400 +2,048 −43,088
This commit is contained in:
162
python/prompt_bench/bill_token_compression.py
Normal file
162
python/prompt_bench/bill_token_compression.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Lossless-ish text compression for Congressional bill text."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# US states plus the territories that appear in Congressional bill text.
STATES = (
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
    "Puerto Rico",
    "Guam",
    "American Samoa",
    "District of Columbia",
    "US Virgin Islands",
)

# Case-insensitive pattern per state, paired with its canonical title-case form.
# The \b word boundaries prevent one state's pattern from rewriting a substring
# of another word: without them the "Kansas" pattern matches the "kansas" inside
# "Arkansas" and corrupts it to "ArKansas".
STATE_PATTERNS = [
    (re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES
]
||||
def normalize_state_names(text: str) -> str:
    """Rewrite every state/territory name found in *text* into title case."""
    for compiled, canonical in STATE_PATTERNS:
        text = compiled.sub(canonical, text)
    return text
|
||||
|
||||
|
||||
def strip_number_commas(text: str) -> str:
    """Drop thousands-separator commas from numbers (``1,234,567`` -> ``1234567``)."""

    def _join_digits(match: re.Match) -> str:
        # The matched span is the whole comma-grouped number; remove its commas.
        return match.group(0).replace(",", "")

    return re.sub(r"\d{1,3}(?:,\d{3})+", _join_digits, text)
|
||||
|
||||
|
||||
def strip_horizontal_rules(text: str) -> str:
    """Delete whole lines made of three-plus underscores, dashes, equals, or asterisks."""
    rule_line = re.compile(r"^\s*[_\-=\*]{3,}\s*$", flags=re.MULTILINE)
    return rule_line.sub("", text)
|
||||
|
||||
|
||||
def collapse_double_dashes(text: str) -> str:
    """Turn the ``--`` em-dash stand-in into a single space so it tokenizes cleanly."""
    # Split-and-join is equivalent to a non-overlapping replace of "--".
    return " ".join(text.split("--"))
|
||||
|
||||
|
||||
def collapse_inline_whitespace(text: str) -> str:
    """Squeeze runs of horizontal whitespace (spaces, tabs) into one space.

    Newlines are deliberately excluded from the character class so line
    structure survives.
    """
    horizontal_ws = re.compile(r"[^\S\n]+")
    return horizontal_ws.sub(" ", text)
|
||||
|
||||
|
||||
def collapse_blank_lines(text: str) -> str:
    """Reduce any run of three-or-more newlines to exactly two (one blank line)."""
    return re.sub(r"\n\n\n+", "\n\n", text)
|
||||
|
||||
|
||||
def trim_line_edges(text: str) -> str:
    """Strip horizontal whitespace touching newline characters on every line.

    Uses the ``[^\\S\\n]`` class so tabs are trimmed as well as spaces, making
    the function correct even when it runs before (or without)
    ``collapse_inline_whitespace``.
    """
    # Trailing whitespace before each newline, then leading whitespace after.
    text = re.sub(r"[^\S\n]+\n", "\n", text)
    return re.sub(r"\n[^\S\n]+", "\n", text)
|
||||
|
||||
|
||||
def shorten_section_markers(text: str) -> str:
    """Rewrite ``Sec. 12.`` style section headings as the more compact ``SEC 12``.

    The leading ``\\b`` keeps the pattern from firing on word-final "sec."
    inside longer words (e.g. "parsec. 4.").
    """
    return re.sub(r"(?i)\bsec\.\s*(\d+[a-zA-Z]?)\.", r"SEC \1", text)
|
||||
|
||||
|
||||
def unwrap_parens(text: str) -> str:
    """Drop the parentheses around short alphanumeric labels such as ``(a)`` or ``(12)``."""
    label = re.compile(r"\(([a-zA-Z0-9]+)\)")
    return label.sub(r"\1", text)
|
||||
|
||||
|
||||
def strip_typeset_quotes(text: str) -> str:
    """Remove the double-backtick and double-apostrophe quote markers of the GPO bill format."""
    for marker in ("``", "''"):
        text = text.replace(marker, "")
    return text
|
||||
|
||||
|
||||
def normalize_usc_acronym(text: str) -> str:
    """Shorten the frequent ``U.S.C.`` citation to ``USC`` to save tokens."""
    return "USC".join(text.split("U.S.C."))
|
||||
|
||||
|
||||
def normalize_us_acronym(text: str) -> str:
    """Normalize the spelled-out ``U.S.``/``U. S.`` variants to the bare ``US`` form."""
    variants = ("U. S.", "u. s.", "U.S. ", "u.s. ")
    result = text
    for variant in variants:
        result = result.replace(variant, "US ")
    return result
|
||||
|
||||
|
||||
def collapse_ellipses(text: str) -> str:
    """Shrink any run of two-or-more periods (``...``, ``....``) to a single period."""
    return re.sub(r"\.\.+", ".", text)
|
||||
|
||||
|
||||
# Transforms applied in sequence by compress_bill_text(): layout cleanup first
# (rules, dashes, whitespace, blank lines, line edges), then token rewrites
# (section markers, parens, quotes, acronyms, commas, ellipses, state names).
COMPRESSION_STEPS = (
    strip_horizontal_rules,
    collapse_double_dashes,
    collapse_inline_whitespace,
    collapse_blank_lines,
    trim_line_edges,
    shorten_section_markers,
    unwrap_parens,
    strip_typeset_quotes,
    normalize_usc_acronym,
    normalize_us_acronym,
    strip_number_commas,
    collapse_ellipses,
    normalize_state_names,
)
|
||||
|
||||
|
||||
def compress_bill_text(text: str) -> str:
    """Apply lossless-ish whitespace and boilerplate compression to bill text.

    Each transform in :data:`COMPRESSION_STEPS` is run in order, and the final
    result has leading/trailing whitespace stripped.
    """
    result = text
    for transform in COMPRESSION_STEPS:
        result = transform(result)
    return result.strip()
|
||||
270
python/prompt_bench/compresion_test.py
Normal file
270
python/prompt_bench/compresion_test.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Run two interactive OpenAI chat-completion sweeps over bill text.
|
||||
|
||||
Reads the first N bills from a CSV with a `text_content` column and sends two
|
||||
sweeps through `/v1/chat/completions` concurrently — one with the raw bill
|
||||
text, one with the compressed bill text. Each request's prompt is saved to
|
||||
disk alongside the OpenAI response id so the prompts and responses can be
|
||||
correlated later.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from os import getenv
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import httpx
|
||||
import typer
|
||||
|
||||
from python.prompt_bench.bill_token_compression import compress_bill_text
|
||||
|
||||
# Module-level logger; configured by logging.basicConfig() in main().
logger = logging.getLogger(__name__)

# OpenAI-compatible REST base; requests go to {base}/chat/completions.
OPENAI_API_BASE = "https://api.openai.com/v1"
# Defaults for the corresponding CLI options in main().
DEFAULT_MODEL = "gpt-5.4-mini"
DEFAULT_COUNT = 100
# Sampling seed included in every request payload (see run_one_request).
SEED = 42

# System message sent verbatim with every request in both sweeps.
SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.

Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.

EXTRACTION RULES:
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.

OUTPUT FORMAT — plain structured text, not JSON:

OPERATIVE ACTIONS:
[Numbered list of what the bill actually does, one action per line, max 20 words each]

AFFECTED POPULATIONS:
[Who gains something, who loses something, or whose behavior is regulated]

MECHANISMS:
[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]

POLICY THREADS:
[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]

SYMBOLIC/PROCEDURAL ONLY:
[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]

LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""

# User message template; {text_content} is filled with raw or compressed bill text.
USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.

BILL TEXT:
{text_content}"""
|
||||
|
||||
|
||||
def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
    """Return up to `count` (bill_id, text_content) tuples with non-empty text.

    Rows with an empty or missing `text_content` are skipped.  The id comes
    from `bill_id` (falling back to `id`, then a positional `row-N` label) and
    is suffixed with `version_code` when present so bill versions stay distinct.
    """
    # Guard: the early-exit check below runs only *after* an append, so without
    # this a count of 0 (or negative) would incorrectly return one bill.
    if count <= 0:
        return []
    # Bill text columns routinely exceed csv's default 128 KiB field limit.
    csv.field_size_limit(sys.maxsize)
    bills: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        for row in reader:
            text_content = (row.get("text_content") or "").strip()
            if not text_content:
                continue
            bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
            version_code = row.get("version_code") or ""
            unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
            bills.append((unique_id, text_content))
            if len(bills) >= count:
                break
    return bills
|
||||
|
||||
|
||||
def build_messages(bill_text: str) -> list[dict]:
    """Build the (system, user) chat-message pair for one bill."""
    user_content = USER_TEMPLATE.format(text_content=bill_text)
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
|
||||
|
||||
|
||||
def safe_filename(value: str) -> str:
    """Sanitize *value* for use as a filename (alnum, dot, dash, underscore only)."""
    sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_")
    # An all-junk input sanitizes to the empty string; give it a stable name.
    return sanitized if sanitized else "unnamed"
|
||||
|
||||
|
||||
def run_one_request(
    client: httpx.Client,
    *,
    bill_id: str,
    label: str,
    bill_text: str,
    model: str,
    output_path: Path,
) -> tuple[bool, float, str | None]:
    """Send one chat-completion request and persist prompt + response.

    The full request context (messages, model, seed) is written to
    `output_path` whether the call succeeds or fails, so prompts can later be
    correlated with responses via the OpenAI response id.

    Returns (success, elapsed_seconds, response_id).
    """
    messages = build_messages(bill_text)
    payload = {
        "model": model,
        "messages": messages,
        "seed": SEED,
    }
    start = time.monotonic()
    record: dict = {
        "bill_id": bill_id,
        "label": label,
        "model": model,
        "seed": SEED,
        "input_chars": len(bill_text),
        "messages": messages,
    }

    def _persist_failure(error_info: dict, log_format: str) -> tuple[bool, float, None]:
        # Shared failure path: stamp elapsed time, write the record, log with traceback.
        elapsed = time.monotonic() - start
        error_info["elapsed_seconds"] = elapsed
        record["error"] = error_info
        output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2))
        logger.exception(log_format, label, bill_id, elapsed)
        return False, elapsed, None

    try:
        response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as error:
        # 4xx/5xx: keep the status code and raw body for debugging.
        return _persist_failure(
            {"status_code": error.response.status_code, "body": error.response.text},
            "HTTP error for %s/%s after %.2fs",
        )
    except Exception as error:  # noqa: BLE001 - boundary: record failure, let the sweep continue
        # Transport errors, JSON decode failures, etc.
        return _persist_failure(
            {"message": str(error)},
            "Failed: %s/%s after %.2fs",
        )

    elapsed = time.monotonic() - start
    response_id = body.get("id")
    record["response_id"] = response_id
    record["elapsed_seconds"] = elapsed
    record["usage"] = body.get("usage")
    record["response"] = body
    output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2))
    logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
    return True, elapsed, response_id
|
||||
|
||||
|
||||
def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text.

    Each bill yields two requests — raw text and compress_bill_text() output —
    written to output_dir/<label>/<bill_id>.json, with a summary.json of
    per-request outcomes written at the end.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")

    # Fail fast on missing credentials or input before touching the filesystem.
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)

    compressed_dir = output_dir / "compressed"
    uncompressed_dir = output_dir / "uncompressed"
    compressed_dir.mkdir(parents=True, exist_ok=True)
    uncompressed_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Loading %d bills from %s", count, csv_path)
    bills = load_bills(csv_path, count)
    if len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)

    # Two tasks per bill: (bill_id, label, text to send, output path).
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
        tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
        tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))

    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)

    headers = {"Authorization": f"Bearer {api_key}"}
    completed = 0
    failed = 0
    index: list[dict] = []
    wall_start = time.monotonic()
    with (
        httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client,
        ThreadPoolExecutor(
            max_workers=concurrency,
        ) as executor,
    ):
        future_to_task = {
            executor.submit(
                run_one_request,
                client,
                bill_id=bill_id,
                label=label,
                bill_text=bill_text,
                model=model,
                output_path=output_path,
            ): (bill_id, label, output_path)
            for bill_id, label, bill_text, output_path in tasks
        }
        for future in as_completed(future_to_task):
            bill_id, label, output_path = future_to_task[future]
            # run_one_request handles its own exceptions, so result() will not raise.
            success, elapsed, response_id = future.result()
            if success:
                completed += 1
            else:
                failed += 1
            index.append(
                {
                    "bill_id": bill_id,
                    "label": label,
                    "response_id": response_id,
                    "elapsed_seconds": elapsed,
                    "success": success,
                    "path": str(output_path),
                },
            )
    wall_elapsed = time.monotonic() - wall_start

    summary = {
        "model": model,
        "count": len(bills),
        "completed": completed,
        "failed": failed,
        "wall_seconds": wall_elapsed,
        "concurrency": concurrency,
        "results": index,
    }
    summary_path = output_dir / "summary.json"
    # ensure_ascii=False for consistency with the per-request record writes.
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2))
    logger.info(
        "Done: completed=%d failed=%d wall=%.1fs summary=%s",
        completed,
        failed,
        wall_elapsed,
        summary_path,
    )
|
||||
|
||||
|
||||
def cli() -> None:
    """Typer entry point: wraps main() so its parameters become CLI options."""
    typer.run(main)
|
||||
|
||||
|
||||
# Allow direct execution as a script (python compresion_test.py --csv ...).
if __name__ == "__main__":
    cli()
|
||||
Reference in New Issue
Block a user