added bill_token_compression.py

Tested on a sample of 100 bills matching the distribution of our data. Compression saves ~11.5% on prompt tokens; completion and reasoning tokens are roughly equal across the two sets.

|              | prompt  | completion | reasoning | total   |
|--------------|---------|------------|-----------|---------|
| compressed   | 349,460 | 157,110    | 112,128   | 506,570 |
| uncompressed | 394,948 | 154,710    | 110,080   | 549,658 |
| delta        | −45,488 | +2,400     | +2,048    | −43,088 |
2026-04-17 21:18:18 -04:00 · 2026-04-09 18:41:13 -04:00
parent d17c883476
commit 251da6c14a
2 changed files with 432 additions and 0 deletions
--- a/python/prompt_bench/bill_token_compression.py
+++ b/python/prompt_bench/bill_token_compression.py
@@ -0,0 +1,162 @@
+"""Lossless-ish text compression for Congressional bill text."""
+
+from __future__ import annotations
+
+import re
+
+# Canonical title-case spellings of the state and territory names to normalize.
+STATES = (
+    "Alabama",
+    "Alaska",
+    "Arizona",
+    "Arkansas",
+    "California",
+    "Colorado",
+    "Connecticut",
+    "Delaware",
+    "Florida",
+    "Georgia",
+    "Hawaii",
+    "Idaho",
+    "Illinois",
+    "Indiana",
+    "Iowa",
+    "Kansas",
+    "Kentucky",
+    "Louisiana",
+    "Maine",
+    "Maryland",
+    "Massachusetts",
+    "Michigan",
+    "Minnesota",
+    "Mississippi",
+    "Missouri",
+    "Montana",
+    "Nebraska",
+    "Nevada",
+    "New Hampshire",
+    "New Jersey",
+    "New Mexico",
+    "New York",
+    "North Carolina",
+    "North Dakota",
+    "Ohio",
+    "Oklahoma",
+    "Oregon",
+    "Pennsylvania",
+    "Rhode Island",
+    "South Carolina",
+    "South Dakota",
+    "Tennessee",
+    "Texas",
+    "Utah",
+    "Vermont",
+    "Virginia",
+    "Washington",
+    "West Virginia",
+    "Wisconsin",
+    "Wyoming",
+    "Puerto Rico",
+    "Guam",
+    "American Samoa",
+    "District of Columbia",
+    "US Virgin Islands",
+)
+# (compiled pattern, canonical spelling) pairs, one per entry in STATES.
+# The \b anchors keep a name from matching inside a longer word: without them
+# the case-insensitive "Kansas" pattern rewrites "Arkansas" to "ArKansas" and
+# "Maine" rewrites "remained" to "reMained".
+STATE_PATTERNS = [
+    (re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES
+]
+
+
+def normalize_state_names(text: str) -> str:
+    """Rewrite every match of each ``STATE_PATTERNS`` entry to its canonical spelling.
+
+    The patterns are applied one after another, in list order, each on the
+    output of the previous substitution.
+    """
+    normalized = text
+    for state_regex, canonical in STATE_PATTERNS:
+        normalized = state_regex.sub(canonical, normalized)
+    return normalized
+
+
+def strip_number_commas(text: str) -> str:
+    """Remove thousands-separator commas from well-formed grouped numbers.
+
+    Only digit runs that are correctly grouped as a whole (``1,234`` or
+    ``1,234,567``) are rewritten. The lookarounds stop the pattern from matching
+    a slice of a longer digit run, so ``12,3456`` and ``1234,567`` are left
+    untouched instead of being fused into a different number.
+
+    Args:
+        text: Text that may contain comma-grouped numbers.
+
+    Returns:
+        The text with the commas removed from each grouped number.
+    """
+    return re.sub(
+        r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)",
+        lambda match: match.group().replace(",", ""),
+        text,
+    )
+
+
+def strip_horizontal_rules(text: str) -> str:
+    """Blank out lines made only of three or more ``_``, ``-``, ``=`` or ``*`` characters.
+
+    Whitespace around the rule is removed with it; the line break itself is kept.
+    """
+    rule_line = re.compile(r"^\s*[_\-=\*]{3,}\s*$", re.MULTILINE)
+    return rule_line.sub("", text)
+
+
+def collapse_double_dashes(text: str) -> str:
+    """Swap each ``--`` (an em-dash stand-in) for one space so it doesn't tokenize oddly."""
+    pieces = text.split("--")
+    return " ".join(pieces)
+
+
+def collapse_inline_whitespace(text: str) -> str:
+    """Squeeze each run of non-newline whitespace into one space; newlines are untouched."""
+    inline_gap = re.compile(r"[^\S\n]+")
+    return inline_gap.sub(" ", text)
+
+
+def collapse_blank_lines(text: str) -> str:
+    """Reduce any run of three or more newlines to exactly two (one blank line)."""
+    newline_run = re.compile(r"\n{3,}")
+    return newline_run.sub("\n\n", text)
+
+
+def trim_line_edges(text: str) -> str:
+    """Drop runs of spaces that sit directly before or directly after a newline.
+
+    Only the space character is removed, and only next to a newline: spaces at
+    the very start or very end of the text are left alone.
+    """
+    without_trailing = re.sub(r" +\n", "\n", text)
+    without_leading = re.sub(r"\n +", "\n", without_trailing)
+    return without_leading
+
+
+def shorten_section_markers(text: str) -> str:
+    """Turn ``Sec. 12.``-style markers (any casing) into the shorter ``SEC 12``.
+
+    The number may carry one trailing letter (``Sec. 3a.``); both periods are dropped.
+    """
+    return re.sub(r"sec\.\s*(\d+[a-zA-Z]?)\.", r"SEC \1", text, flags=re.IGNORECASE)
+
+
+def unwrap_parens(text: str) -> str:
+    """Drop the parentheses from purely alphanumeric labels such as ``(a)`` or ``(12)``.
+
+    Parenthesized text containing anything other than ASCII letters and digits
+    (spaces, punctuation) is left as is.
+    """
+    label = re.compile(r"\(([a-zA-Z0-9]+)\)")
+    return label.sub(lambda found: found.group(1), text)
+
+
+def strip_typeset_quotes(text: str) -> str:
+    """Delete the doubled-backtick and doubled-apostrophe quote markers of the GPO bill format."""
+    for marker in ("``", "''"):
+        text = text.replace(marker, "")
+    return text
+
+
+def normalize_usc_acronym(text: str) -> str:
+    """Shorten every exact ``U.S.C.`` citation marker to ``USC`` to save tokens."""
+    segments = text.split("U.S.C.")
+    return "USC".join(segments)
+
+
+def normalize_us_acronym(text: str) -> str:
+    """Normalize ``U.S. `` and ``U. S.`` spellings to ``US`` followed by one space.
+
+    Handles the all-upper and all-lower variants of both spellings. ``U.S.`` is
+    only rewritten when a space follows it, so ``U.S.C.`` is not touched here.
+
+    Args:
+        text: Text that may contain the acronym.
+
+    Returns:
+        The text with each recognized spelling replaced by ``US `` (a single
+        trailing space).
+    """
+    # The spaced spelling carries no trailing space of its own, so swallow one
+    # if it is there; otherwise "U. S. Code" would become "US  Code".
+    text = re.sub(r"(?:U\. S\.|u\. s\.) ?", "US ", text)
+    for acronym in ("U.S. ", "u.s. "):
+        text = text.replace(acronym, "US ")
+    return text
+
+
+def collapse_ellipses(text: str) -> str:
+    """Shrink any run of two or more periods (``..``, ``...``, ``....``) to a single period."""
+    dot_run = re.compile(r"\.{2,}")
+    return dot_run.sub(".", text)
+
+
+# Transforms applied by compress_bill_text, in this order.
+COMPRESSION_STEPS = (
+    strip_horizontal_rules,
+    collapse_double_dashes,
+    collapse_inline_whitespace,
+    # Trim before collapsing blank lines: a line holding only spaces becomes
+    # empty only once it is trimmed, so collapsing first would leave runs of
+    # three or more newlines in the output.
+    trim_line_edges,
+    collapse_blank_lines,
+    shorten_section_markers,
+    unwrap_parens,
+    strip_typeset_quotes,
+    # U.S.C. is rewritten before the shorter U.S. spellings are handled.
+    normalize_usc_acronym,
+    normalize_us_acronym,
+    strip_number_commas,
+    collapse_ellipses,
+    normalize_state_names,
+)
+
+
+def compress_bill_text(text: str) -> str:
+    """Run the full compression pipeline over a bill's text.
+
+    Each transform in :data:`COMPRESSION_STEPS` is applied in sequence to the
+    output of the one before it. Leading and trailing whitespace is stripped
+    from the final result before it is returned.
+    """
+    compressed = text
+    for transform in COMPRESSION_STEPS:
+        compressed = transform(compressed)
+    return compressed.strip()