Compare commits

..

18 Commits

Author SHA1 Message Date
0d0ed5445a moved models 2026-04-19 21:05:56 -04:00
9e4c6f6f56 adding qwen3.6 2026-04-19 21:05:56 -04:00
1cf4b99d18 updating signing.format for programs.git 2026-04-19 10:35:48 -04:00
b536fb9f09 removed fallbackToPassword = true; 2026-04-19 08:08:33 -04:00
github-actions[bot]
c41a2ce3bd flake.lock: Update
Flake lock file updates:

• Updated input 'firefox-addons':
    'gitlab:rycee/nur-expressions/81e28f4?dir=pkgs/firefox-addons' (2026-03-20)
  → 'gitlab:rycee/nur-expressions/0581568?dir=pkgs/firefox-addons' (2026-04-17)
• Updated input 'home-manager':
    'github:nix-community/home-manager/9670de2' (2026-03-20)
  → 'github:nix-community/home-manager/565e534' (2026-04-17)
• Updated input 'nixos-hardware':
    'github:nixos/nixos-hardware/2d4b471' (2026-03-20)
  → 'github:nixos/nixos-hardware/c775c27' (2026-04-06)
• Updated input 'nixpkgs':
    'github:nixos/nixpkgs/b40629e' (2026-03-18)
  → 'github:nixos/nixpkgs/4bd9165' (2026-04-14)
• Updated input 'nixpkgs-master':
    'github:nixos/nixpkgs/8620c0b' (2026-03-21)
  → 'github:nixos/nixpkgs/025c852' (2026-04-17)
• Updated input 'sops-nix':
    'github:Mic92/sops-nix/29b6519' (2026-03-19)
  → 'github:Mic92/sops-nix/d4971dd' (2026-04-13)
2026-04-19 08:08:33 -04:00
8ef776f859 updating download-buffer-size 2026-04-18 22:12:47 -04:00
d350c2d074 adding codex 2026-04-18 20:14:11 -04:00
93d6914e9d enabling appimages 2026-04-18 20:14:11 -04:00
7db063a240 setting up whisper transcriber 2026-04-18 19:09:02 -04:00
dfe5997e0b removing brain_substituter.nix from bob 2026-04-18 12:00:59 -04:00
68671a1e84 adding steve 2026-04-18 11:56:56 -04:00
bcc2227cfd updating syncthing phone id 2026-04-18 11:53:20 -04:00
d6eec926e7 made web_services dir 2026-04-13 19:12:32 -04:00
5ddf1c4cab ran tree fmt 2026-04-13 19:12:32 -04:00
5a2171b9c7 updated gitea ssh settings 2026-04-13 19:12:32 -04:00
95c6ade154 moving off cloudflare tunnel 2026-04-13 19:12:32 -04:00
a0bbc2896a added math the bob 2026-04-13 19:08:42 -04:00
736596c387 made bob a server 2026-04-13 19:08:42 -04:00
65 changed files with 711 additions and 1873 deletions

4
.gitignore vendored
View File

@@ -169,7 +169,3 @@ test.*
# Frontend build output # Frontend build output
frontend/dist/ frontend/dist/
frontend/node_modules/ frontend/node_modules/
# data dir for training, validation, and testing
data/
config.toml

View File

@@ -40,7 +40,6 @@
"cgroupdriver", "cgroupdriver",
"charliermarsh", "charliermarsh",
"Checkpointing", "Checkpointing",
"cloudflared",
"codellama", "codellama",
"codezombiech", "codezombiech",
"compactmode", "compactmode",

View File

@@ -34,6 +34,7 @@ in
warn-dirty = false; warn-dirty = false;
flake-registry = ""; # disable global flake registries flake-registry = ""; # disable global flake registries
connect-timeout = 10; connect-timeout = 10;
download-buffer-size = 536870912;
fallback = true; fallback = true;
}; };

View File

@@ -12,7 +12,7 @@
brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line
ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line
jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line
phone.id = "TBRULKD-7DZPGGZ-F6LLB7J-MSO54AY-7KLPBIN-QOFK6PX-W2HBEWI-PHM2CQI"; # cspell:disable-line phone.id = "JPVQKQW-CFXOJXT-Q5G5F3H-QIDHDRE-GKHPTQB-GXZUQSP-U7FR7F7-INP3AAH"; # cspell:disable-line
rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line
}; };
}; };

36
flake.lock generated
View File

@@ -8,11 +8,11 @@
}, },
"locked": { "locked": {
"dir": "pkgs/firefox-addons", "dir": "pkgs/firefox-addons",
"lastModified": 1773979456, "lastModified": 1776398575,
"narHash": "sha256-9kBMJ5IvxqNlkkj/swmE8uK1Sc7TL/LIRUI958m7uBM=", "narHash": "sha256-WArU6WOdWxzbzGqYk4w1Mucg+bw/SCl6MoSp+/cZMio=",
"owner": "rycee", "owner": "rycee",
"repo": "nur-expressions", "repo": "nur-expressions",
"rev": "81e28f47ac18d9e89513929c77e711e657b64851", "rev": "05815686caf4e3678f5aeb5fd36e567886ab0d30",
"type": "gitlab" "type": "gitlab"
}, },
"original": { "original": {
@@ -29,11 +29,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1774007980, "lastModified": 1776454077,
"narHash": "sha256-FOnZjElEI8pqqCvB6K/1JRHTE8o4rer8driivTpq2uo=", "narHash": "sha256-7zSUFWsU0+jlD7WB3YAxQ84Z/iJurA5hKPm8EfEyGJk=",
"owner": "nix-community", "owner": "nix-community",
"repo": "home-manager", "repo": "home-manager",
"rev": "9670de2921812bc4e0452f6e3efd8c859696c183", "rev": "565e5349208fe7d0831ef959103c9bafbeac0681",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -44,11 +44,11 @@
}, },
"nixos-hardware": { "nixos-hardware": {
"locked": { "locked": {
"lastModified": 1774018263, "lastModified": 1775490113,
"narHash": "sha256-HHYEwK1A22aSaxv2ibhMMkKvrDGKGlA/qObG4smrSqc=", "narHash": "sha256-2ZBhDNZZwYkRmefK5XLOusCJHnoeKkoN95hoSGgMxWM=",
"owner": "nixos", "owner": "nixos",
"repo": "nixos-hardware", "repo": "nixos-hardware",
"rev": "2d4b4717b2534fad5c715968c1cece04a172b365", "rev": "c775c2772ba56e906cbeb4e0b2db19079ef11ff7",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -60,11 +60,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1773821835, "lastModified": 1776169885,
"narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=", "narHash": "sha256-l/iNYDZ4bGOAFQY2q8y5OAfBBtrDAaPuRQqWaFHVRXM=",
"owner": "nixos", "owner": "nixos",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0", "rev": "4bd9165a9165d7b5e33ae57f3eecbcb28fb231c9",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -76,11 +76,11 @@
}, },
"nixpkgs-master": { "nixpkgs-master": {
"locked": { "locked": {
"lastModified": 1774051532, "lastModified": 1776469842,
"narHash": "sha256-d3CGMweyYIcPuTj5BKq+1Lx4zwlgL31nVtN647tOZKo=", "narHash": "sha256-sqzM6PKMQoGk8Sl+uv2sbP1qiS2SPQhA2yn5zgZINMc=",
"owner": "nixos", "owner": "nixos",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "8620c0b5cc8fbe76502442181be1d0514bc3a1b7", "rev": "025c852a89be820b3117f604c8ace42e9b4caa08",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -125,11 +125,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1773889674, "lastModified": 1776119890,
"narHash": "sha256-+ycaiVAk3MEshJTg35cBTUa0MizGiS+bgpYw/f8ohkg=", "narHash": "sha256-Zm6bxLNnEOYuS/SzrAGsYuXSwk3cbkRQZY0fJnk8a5M=",
"owner": "Mic92", "owner": "Mic92",
"repo": "sops-nix", "repo": "sops-nix",
"rev": "29b6519f3e0780452bca0ac0be4584f04ac16cc5", "rev": "d4971dd58c6627bfee52a1ad4237637c0a2fb0cd",
"type": "github" "type": "github"
}, },
"original": { "original": {

View File

@@ -23,8 +23,8 @@
apscheduler apscheduler
fastapi fastapi
fastapi-cli fastapi-cli
faster-whisper
httpx httpx
huggingface-hub
mypy mypy
orjson orjson
polars polars
@@ -42,7 +42,6 @@
sqlalchemy sqlalchemy
tenacity tenacity
textual textual
tiktoken
tinytuya tinytuya
typer typer
websockets websockets

View File

@@ -12,7 +12,6 @@ dependencies = [
"alembic", "alembic",
"apprise", "apprise",
"apscheduler", "apscheduler",
"huggingface-hub",
"httpx", "httpx",
"python-multipart", "python-multipart",
"polars", "polars",
@@ -27,11 +26,7 @@ dependencies = [
[project.scripts] [project.scripts]
database = "python.database_cli:app" database = "python.database_cli:app"
van-inventory = "python.van_inventory.main:serve" van-inventory = "python.van_inventory.main:serve"
prompt-bench = "python.prompt_bench.main:cli" whisper-transcribe = "python.tools.whisper.transcribe:main"
prompt-bench-download = "python.prompt_bench.downloader:cli"
finetune = "python.prompt_bench.finetune:cli"
finetune-container = "python.prompt_bench.finetune_container:cli"
build-finetune-dataset = "python.prompt_bench.build_finetune_dataset:cli"
[dependency-groups] [dependency-groups]
dev = [ dev = [
@@ -56,6 +51,7 @@ lint.ignore = [
"COM812", # (TEMP) conflicts when used with the formatter "COM812", # (TEMP) conflicts when used with the formatter
"ISC001", # (TEMP) conflicts when used with the formatter "ISC001", # (TEMP) conflicts when used with the formatter
"S603", # (PERM) This is known to cause a false positive "S603", # (PERM) This is known to cause a false positive
"S607", # (PERM) This is becoming a consistent annoyance
] ]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
@@ -84,14 +80,7 @@ lint.ignore = [
"python/congress_tracker/**" = [ "python/congress_tracker/**" = [
"TC003", # (perm) this creates issues because sqlalchemy uses these at runtime "TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
] ]
"python/eval_warnings/**" = [
"S607", # (perm) gh and git are expected on PATH in the runner environment
]
"python/prompt_bench/**" = [
"FBT002", # (perm) typer requires boolean defaults for --flag/--no-flag options
"PLR0913", # (perm) typer CLIs naturally have many parameters
"S607", # (perm) docker and nvidia-smi are expected on PATH
]
"python/alembic/**" = [ "python/alembic/**" = [
"INP001", # (perm) this creates LSP issues for alembic "INP001", # (perm) this creates LSP issues for alembic
] ]

View File

@@ -1,25 +0,0 @@
# Unsloth fine-tuning container for Qwen 3.5 4B on RTX 3090.
#
# Build:
# docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
#
# Run:
# docker run --rm --device=nvidia.com/gpu=all --ipc=host \
# -v $(pwd)/output:/workspace/output \
# -v $(pwd)/output/finetune_dataset.jsonl:/workspace/dataset.jsonl:ro \
# -v /zfs/models/hf:/models \
# bill-finetune \
# --dataset /workspace/dataset.jsonl \
# --output-dir /workspace/output/qwen-bill-summarizer
FROM ghcr.io/unslothai/unsloth:latest
RUN pip install --no-cache-dir typer
WORKDIR /workspace
COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
COPY python/__init__.py python/__init__.py
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]

View File

@@ -1 +0,0 @@
"""Prompt benchmarking system for evaluating LLMs via vLLM."""

View File

@@ -1,233 +0,0 @@
"""Submit an OpenAI Batch API bill-summarization job over compressed text.
Reads the first N bills from a CSV with a `text_content` column, compresses
each via `bill_token_compression.compress_bill_text`, builds a JSONL file of
summarization requests, and submits it as an asynchronous Batch API job
against `/v1/chat/completions`. Also writes a CSV of per-bill pre/post-
compression token counts.
"""
from __future__ import annotations
import csv
import json
import logging
import re
import sys
from os import getenv
from pathlib import Path
from typing import Annotated
import httpx
import typer
from tiktoken import Encoding, get_encoding
from python.prompt_bench.bill_token_compression import compress_bill_text
from python.prompt_bench.summarization_prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_TEMPLATE
logger = logging.getLogger(__name__)
OPENAI_API_BASE = "https://api.openai.com/v1"
def load_bills(csv_path: Path, count: int = 0) -> list[tuple[str, str]]:
"""Return (bill_id, text_content) tuples with non-empty text.
If `count` is 0 or negative, all rows are returned.
"""
csv.field_size_limit(sys.maxsize)
bills: list[tuple[str, str]] = []
with csv_path.open(newline="", encoding="utf-8") as handle:
reader = csv.DictReader(handle)
for row in reader:
text_content = (row.get("text_content") or "").strip()
if not text_content:
continue
bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
version_code = row.get("version_code") or ""
unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
bills.append((unique_id, text_content))
if count > 0 and len(bills) >= count:
break
return bills
def safe_filename(value: str) -> str:
"""Make a string safe for use as a filename or batch custom_id."""
return re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_") or "unnamed"
def build_request(custom_id: str, model: str, bill_text: str) -> dict:
"""Build one OpenAI batch request line."""
return {
"custom_id": custom_id,
"method": "POST",
"url": "/v1/chat/completions",
"body": {
"model": model,
"messages": [
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
],
},
}
def write_jsonl(path: Path, lines: list[dict]) -> None:
"""Write a list of dicts as JSONL."""
with path.open("w", encoding="utf-8") as handle:
for line in lines:
handle.write(json.dumps(line, ensure_ascii=False))
handle.write("\n")
def upload_file(client: httpx.Client, path: Path) -> str:
"""Upload a JSONL file to the OpenAI Files API and return its file id."""
with path.open("rb") as handle:
response = client.post(
f"{OPENAI_API_BASE}/files",
files={"file": (path.name, handle, "application/jsonl")},
data={"purpose": "batch"},
)
response.raise_for_status()
return response.json()["id"]
def prepare_requests(
bills: list[tuple[str, str]],
*,
model: str,
encoder: Encoding,
) -> tuple[list[dict], list[dict]]:
"""Build (request_lines, token_rows) from bills.
Each bill is compressed before being turned into a request line.
Each `token_rows` entry has chars + token counts for one bill so the caller
can write a per-bill CSV.
"""
request_lines: list[dict] = []
token_rows: list[dict] = []
for bill_id, text_content in bills:
raw_token_count = len(encoder.encode(text_content))
compressed_text = compress_bill_text(text_content)
compressed_token_count = len(encoder.encode(compressed_text))
token_rows.append(
{
"bill_id": bill_id,
"raw_chars": len(text_content),
"compressed_chars": len(compressed_text),
"raw_tokens": raw_token_count,
"compressed_tokens": compressed_token_count,
"token_ratio": (compressed_token_count / raw_token_count) if raw_token_count else None,
},
)
safe_id = safe_filename(bill_id)
request_lines.append(build_request(safe_id, model, compressed_text))
return request_lines, token_rows
def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
"""Write per-bill token counts to CSV. Returns (raw_total, compressed_total)."""
with path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(
handle,
fieldnames=["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"],
)
writer.writeheader()
writer.writerows(token_rows)
raw_total = sum(row["raw_tokens"] for row in token_rows)
compressed_total = sum(row["compressed_tokens"] for row in token_rows)
return raw_total, compressed_total
def create_batch(client: httpx.Client, input_file_id: str, description: str) -> dict:
"""Create a batch job and return its full response payload."""
response = client.post(
f"{OPENAI_API_BASE}/batches",
json={
"input_file_id": input_file_id,
"endpoint": "/v1/chat/completions",
"completion_window": "24h",
"metadata": {"description": description},
},
)
response.raise_for_status()
return response.json()
def main(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
"output/openai_batch",
),
model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
count: Annotated[int, typer.Option(help="Max bills to process, 0 = all")] = 0,
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Submit an OpenAI Batch job of compressed bill summaries."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
if not api_key:
message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
raise typer.BadParameter(message)
if not csv_path.is_file():
message = f"CSV not found: {csv_path}"
raise typer.BadParameter(message)
output_dir.mkdir(parents=True, exist_ok=True)
logger.info("Loading %d bills from %s", count, csv_path)
bills = load_bills(csv_path, count)
if len(bills) < count:
logger.warning("Only %d bills available (requested %d)", len(bills), count)
encoder = get_encoding("o200k_base")
request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)
token_csv_path = output_dir / "token_counts.csv"
raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
logger.info(
"Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
raw_tokens_total,
compressed_tokens_total,
(compressed_tokens_total / raw_tokens_total) if raw_tokens_total else 0.0,
token_csv_path,
)
jsonl_path = output_dir / "requests.jsonl"
write_jsonl(jsonl_path, request_lines)
logger.info("Wrote %s (%d bills)", jsonl_path, len(request_lines))
headers = {"Authorization": f"Bearer {api_key}"}
with httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client:
logger.info("Uploading JSONL")
file_id = upload_file(client, jsonl_path)
logger.info("Uploaded: %s", file_id)
logger.info("Creating batch")
batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
logger.info("Batch created: %s", batch["id"])
metadata = {
"model": model,
"count": len(bills),
"jsonl": str(jsonl_path),
"input_file_id": file_id,
"batch_id": batch["id"],
"raw_tokens_total": raw_tokens_total,
"compressed_tokens_total": compressed_tokens_total,
"batch": batch,
}
metadata_path = output_dir / "batch.json"
metadata_path.write_text(json.dumps(metadata, indent=2))
logger.info("Wrote metadata to %s", metadata_path)
def cli() -> None:
"""Typer entry point."""
typer.run(main)
if __name__ == "__main__":
cli()

View File

@@ -1,162 +0,0 @@
"""Lossless-ish text compression for Congressional bill text."""
from __future__ import annotations
import re
STATES = (
"Alabama",
"Alaska",
"Arizona",
"Arkansas",
"California",
"Colorado",
"Connecticut",
"Delaware",
"Florida",
"Georgia",
"Hawaii",
"Idaho",
"Illinois",
"Indiana",
"Iowa",
"Kansas",
"Kentucky",
"Louisiana",
"Maine",
"Maryland",
"Massachusetts",
"Michigan",
"Minnesota",
"Mississippi",
"Missouri",
"Montana",
"Nebraska",
"Nevada",
"New Hampshire",
"New Jersey",
"New Mexico",
"New York",
"North Carolina",
"North Dakota",
"Ohio",
"Oklahoma",
"Oregon",
"Pennsylvania",
"Rhode Island",
"South Carolina",
"South Dakota",
"Tennessee",
"Texas",
"Utah",
"Vermont",
"Virginia",
"Washington",
"West Virginia",
"Wisconsin",
"Wyoming",
"Puerto Rico",
"Guam",
"American Samoa",
"District of Columbia",
"US Virgin Islands",
)
STATE_PATTERNS = [(re.compile(re.escape(state), re.IGNORECASE), state) for state in STATES]
def normalize_state_names(text: str) -> str:
"""Replace any casing of state names with title case."""
for pattern, replacement in STATE_PATTERNS:
text = pattern.sub(replacement, text)
return text
def strip_number_commas(text: str) -> str:
"""Remove commas from numeric thousands separators."""
return re.sub(r"(\d{1,3}(?:,\d{3})+)", lambda match: match.group().replace(",", ""), text)
def strip_horizontal_rules(text: str) -> str:
"""Remove ASCII horizontal-rule lines built from underscores, dashes, equals, or asterisks."""
return re.sub(r"^\s*[_\-=\*]{3,}\s*$", "", text, flags=re.MULTILINE)
def collapse_double_dashes(text: str) -> str:
"""Replace ``--`` em-dash stand-ins with a single space so they don't tokenize oddly."""
return text.replace("--", " ")
def collapse_inline_whitespace(text: str) -> str:
"""Collapse runs of horizontal whitespace (spaces, tabs) into a single space, leaving newlines intact."""
return re.sub(r"[^\S\n]+", " ", text)
def collapse_blank_lines(text: str) -> str:
"""Collapse three-or-more consecutive newlines down to a blank-line separator."""
return re.sub(r"\n{3,}", "\n\n", text)
def trim_line_edges(text: str) -> str:
"""Strip spaces immediately before and after newline characters on every line."""
text = re.sub(r" +\n", "\n", text)
return re.sub(r"\n +", "\n", text)
def shorten_section_markers(text: str) -> str:
"""Rewrite ``Sec. 12.`` style section headings as the more compact ``SEC 12``."""
return re.sub(r"(?i)sec\.\s*(\d+[a-zA-Z]?)\.", r"SEC \1", text)
def unwrap_parens(text: str) -> str:
"""Strip parentheses around short alphanumeric labels like ``(a)`` or ``(12)``."""
return re.sub(r"\(([a-zA-Z0-9]+)\)", r"\1", text)
def strip_typeset_quotes(text: str) -> str:
"""Remove the `` and '' typeset quote markers used in the GPO bill format."""
return text.replace("``", "").replace("''", "")
def normalize_usc_acronym(text: str) -> str:
"""Collapse ``U.S.C.`` to ``USC`` to save tokens on the common citation."""
return text.replace("U.S.C.", "USC")
def normalize_us_acronym(text: str) -> str:
"""Normalize the various ``U.S.``/``U. S.`` spellings to the bare ``US`` form."""
for acronym in ("U. S.", "u. s.", "U.S. ", "u.s. "):
text = text.replace(acronym, "US ")
return text
def collapse_ellipses(text: str) -> str:
"""Collapse runs of two-or-more periods (``...``, ``....``) down to a single period."""
return re.sub(r"\.{2,}", ".", text)
COMPRESSION_STEPS = (
strip_horizontal_rules,
collapse_double_dashes,
collapse_inline_whitespace,
collapse_blank_lines,
trim_line_edges,
shorten_section_markers,
unwrap_parens,
strip_typeset_quotes,
normalize_usc_acronym,
normalize_us_acronym,
strip_number_commas,
collapse_ellipses,
normalize_state_names,
)
def compress_bill_text(text: str) -> str:
"""Apply lossless-ish whitespace and boilerplate compression to bill text.
Runs every transform in :data:`COMPRESSION_STEPS` in order, then strips
leading/trailing whitespace from the final result.
"""
for step in COMPRESSION_STEPS:
text = step(text)
return text.strip()

View File

@@ -1,236 +0,0 @@
"""Run two interactive OpenAI chat-completion sweeps over bill text.
Reads the first N bills from a CSV with a `text_content` column and sends two
sweeps through `/v1/chat/completions` concurrently — one with the raw bill
text, one with the compressed bill text. Each request's prompt is saved to
disk alongside the OpenAI response id so the prompts and responses can be
correlated later.
"""
from __future__ import annotations
import csv
import json
import logging
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import getenv
from pathlib import Path
from typing import Annotated
import httpx
import typer
from python.prompt_bench.bill_token_compression import compress_bill_text
from python.prompt_bench.summarization_prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_TEMPLATE
logger = logging.getLogger(__name__)
OPENAI_API_BASE = "https://api.openai.com/v1"
DEFAULT_MODEL = "gpt-5.4-mini"
DEFAULT_COUNT = 100
SEED = 42
def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
"""Return up to `count` (bill_id, text_content) tuples with non-empty text."""
csv.field_size_limit(sys.maxsize)
bills: list[tuple[str, str]] = []
with csv_path.open(newline="", encoding="utf-8") as handle:
reader = csv.DictReader(handle)
for row in reader:
text_content = (row.get("text_content") or "").strip()
if not text_content:
continue
bill_id = row.get("bill_id") or row.get("id") or f"row-{len(bills)}"
version_code = row.get("version_code") or ""
unique_id = f"{bill_id}-{version_code}" if version_code else bill_id
bills.append((unique_id, text_content))
if len(bills) >= count:
break
return bills
def build_messages(bill_text: str) -> list[dict]:
"""Return the system + user message pair for a bill."""
return [
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
]
def safe_filename(value: str) -> str:
"""Make a string safe for use as a filename."""
return re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_") or "unnamed"
def run_one_request(
client: httpx.Client,
*,
bill_id: str,
label: str,
bill_text: str,
model: str,
output_path: Path,
) -> tuple[bool, float, str | None]:
"""Send one chat-completion request and persist prompt + response.
Returns (success, elapsed_seconds, response_id).
"""
messages = build_messages(bill_text)
payload = {
"model": model,
"messages": messages,
"seed": SEED,
}
start = time.monotonic()
record: dict = {
"bill_id": bill_id,
"label": label,
"model": model,
"seed": SEED,
"input_chars": len(bill_text),
"messages": messages,
}
try:
response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
response.raise_for_status()
body = response.json()
except httpx.HTTPStatusError as error:
elapsed = time.monotonic() - start
record["error"] = {
"status_code": error.response.status_code,
"body": error.response.text,
"elapsed_seconds": elapsed,
}
output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2))
logger.exception("HTTP error for %s/%s after %.2fs", label, bill_id, elapsed)
return False, elapsed, None
except Exception as error:
elapsed = time.monotonic() - start
record["error"] = {"message": str(error), "elapsed_seconds": elapsed}
output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2))
logger.exception("Failed: %s/%s after %.2fs", label, bill_id, elapsed)
return False, elapsed, None
elapsed = time.monotonic() - start
response_id = body.get("id")
record["response_id"] = response_id
record["elapsed_seconds"] = elapsed
record["usage"] = body.get("usage")
record["response"] = body
output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2))
logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
return True, elapsed, response_id
def main(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
"output/openai_runs",
),
model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
if not api_key:
message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
raise typer.BadParameter(message)
if not csv_path.is_file():
message = f"CSV not found: {csv_path}"
raise typer.BadParameter(message)
compressed_dir = output_dir / "compressed"
uncompressed_dir = output_dir / "uncompressed"
compressed_dir.mkdir(parents=True, exist_ok=True)
uncompressed_dir.mkdir(parents=True, exist_ok=True)
logger.info("Loading %d bills from %s", count, csv_path)
bills = load_bills(csv_path, count)
if len(bills) < count:
logger.warning("Only %d bills available (requested %d)", len(bills), count)
tasks: list[tuple[str, str, str, Path]] = []
for bill_id, text_content in bills:
filename = f"{safe_filename(bill_id)}.json"
tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))
logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)
headers = {"Authorization": f"Bearer {api_key}"}
completed = 0
failed = 0
index: list[dict] = []
wall_start = time.monotonic()
with (
httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client,
ThreadPoolExecutor(
max_workers=concurrency,
) as executor,
):
future_to_task = {
executor.submit(
run_one_request,
client,
bill_id=bill_id,
label=label,
bill_text=bill_text,
model=model,
output_path=output_path,
): (bill_id, label, output_path)
for bill_id, label, bill_text, output_path in tasks
}
for future in as_completed(future_to_task):
bill_id, label, output_path = future_to_task[future]
success, elapsed, response_id = future.result()
if success:
completed += 1
else:
failed += 1
index.append(
{
"bill_id": bill_id,
"label": label,
"response_id": response_id,
"elapsed_seconds": elapsed,
"success": success,
"path": str(output_path),
},
)
wall_elapsed = time.monotonic() - wall_start
summary = {
"model": model,
"count": len(bills),
"completed": completed,
"failed": failed,
"wall_seconds": wall_elapsed,
"concurrency": concurrency,
"results": index,
}
summary_path = output_dir / "summary.json"
summary_path.write_text(json.dumps(summary, indent=2))
logger.info(
"Done: completed=%d failed=%d wall=%.1fs summary=%s",
completed,
failed,
wall_elapsed,
summary_path,
)
def cli() -> None:
"""Typer entry point."""
typer.run(main)
if __name__ == "__main__":
cli()

View File

@@ -1 +0,0 @@
"""Prompt benchmarking system for evaluating LLMs via vLLM."""

View File

@@ -1,165 +0,0 @@
"""Docker container lifecycle management for Unsloth fine-tuning."""
from __future__ import annotations
import logging
import subprocess
from pathlib import Path
from typing import Annotated
import typer
from python.prompt_bench.containers.lib import check_gpu_free
logger = logging.getLogger(__name__)
CONTAINER_NAME = "bill-finetune"
FINETUNE_IMAGE = "bill-finetune:latest"
DOCKERFILE_PATH = "/home/richie/dotfiles/python/prompt_bench/Dockerfile.finetune"
DEFAULT_HF_CACHE = Path("/zfs/models/hf")
def build_image() -> None:
"""Build the fine-tuning Docker image."""
logger.info("Building fine-tuning image: %s", FINETUNE_IMAGE)
result = subprocess.run(
["docker", "build", "-f", DOCKERFILE_PATH, "-t", FINETUNE_IMAGE, "."],
text=True,
check=False,
)
if result.returncode != 0:
message = "Failed to build fine-tuning image"
raise RuntimeError(message)
logger.info("Image built: %s", FINETUNE_IMAGE)
def start_finetune(
*,
dataset_path: Path,
output_dir: Path,
hf_cache: Path = DEFAULT_HF_CACHE,
) -> None:
"""Run the fine-tuning container.
Args:
dataset_path: Host path to the fine-tuning JSONL dataset.
output_dir: Host path where the trained model will be saved.
hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading).
validation_split: Fraction of data held out for validation.
"""
dataset_path = dataset_path.resolve()
output_dir = output_dir.resolve()
if not dataset_path.is_file():
message = f"Dataset not found: {dataset_path}"
raise FileNotFoundError(message)
output_dir.mkdir(parents=True, exist_ok=True)
stop_finetune()
hf_cache = hf_cache.resolve()
hf_cache.mkdir(parents=True, exist_ok=True)
command = [
"docker",
"run",
"--name",
CONTAINER_NAME,
"--device=nvidia.com/gpu=all",
"--ipc=host",
"-v",
f"{hf_cache}:/root/.cache/huggingface",
"-v",
f"{output_dir}:/workspace/output/qwen-bill-summarizer",
"-v",
f"{dataset_path}:/workspace/dataset.jsonl:ro",
FINETUNE_IMAGE,
"--dataset",
"/workspace/dataset.jsonl",
"--output-dir",
"/workspace/output/qwen-bill-summarizer",
]
logger.info("Starting fine-tuning container")
logger.info(" Dataset: %s", dataset_path)
logger.info(" Output: %s", output_dir)
result = subprocess.run(command, text=True, check=False)
if result.returncode != 0:
message = f"Fine-tuning container exited with code {result.returncode}"
raise RuntimeError(message)
logger.info("Fine-tuning complete. Model saved to %s", output_dir)
def stop_finetune() -> None:
"""Stop and remove the fine-tuning container."""
logger.info("Stopping fine-tuning container")
subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
def logs_finetune() -> str | None:
    """Fetch the last 50 log lines from the fine-tuning container.

    Returns:
        Combined stdout and stderr of ``docker logs``, or ``None`` when the
        container does not exist.
    """
    completed = subprocess.run(
        ["docker", "logs", "--tail", "50", CONTAINER_NAME],
        capture_output=True,
        text=True,
        check=False,
    )
    if completed.returncode == 0:
        return completed.stdout + completed.stderr
    return None
# Typer application exposing the container lifecycle commands below.
app = typer.Typer(help="Fine-tuning container management.")


@app.command()
def build() -> None:
    """Build the fine-tuning Docker image."""
    build_image()
@app.command()
def run(
    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
        "/home/richie/dotfiles/data/finetune_dataset.jsonl"
    ),
    output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
        "/home/richie/dotfiles/data/output/qwen-bill-summarizer",
    ),
    hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run fine-tuning inside a Docker container.

    Warns (without failing) if other processes already hold the GPU,
    then blocks until the training container exits.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    check_gpu_free()
    start_finetune(
        dataset_path=dataset,
        output_dir=output_dir,
        hf_cache=hf_cache,
    )
@app.command()
def stop() -> None:
    """Stop and remove the fine-tuning container."""
    stop_finetune()


@app.command()
def logs() -> None:
    """Show recent logs from the fine-tuning container."""
    recent = logs_finetune()
    if recent is not None:
        typer.echo(recent)
        return
    typer.echo("No running fine-tuning container found.")
    raise typer.Exit(code=1)
def cli() -> None:
    """Console-script entry point; dispatches to the Typer app."""
    app()


if __name__ == "__main__":
    cli()

View File

@@ -1,23 +0,0 @@
from __future__ import annotations
import logging
import subprocess
logger = logging.getLogger(__name__)
def check_gpu_free() -> None:
    """Warn if GPU-heavy processes (e.g. Ollama) are running."""
    query = subprocess.run(
        ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
        capture_output=True,
        text=True,
        check=False,
    )
    if query.returncode != 0:
        logger.warning("Could not query GPU processes: %s", query.stderr.strip())
        return
    compute_apps = query.stdout.strip()
    if not compute_apps:
        return
    logger.warning("GPU processes detected:\n%s", compute_apps)
    logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")

View File

@@ -1,70 +0,0 @@
"""Docker container lifecycle management for vLLM."""
from __future__ import annotations
import logging
import subprocess
logger = logging.getLogger(__name__)
CONTAINER_NAME = "vllm-bench"
VLLM_IMAGE = "vllm/vllm-openai:v0.19.0"
def start_vllm(
    *,
    model: str,
    port: int,
    model_dir: str,
    gpu_memory_utilization: float,
) -> None:
    """Start a detached vLLM container serving the given model.

    Args:
        model: HuggingFace model directory name (relative to model_dir).
        port: Host port to bind.
        model_dir: Host path containing HuggingFace model directories.
        gpu_memory_utilization: Fraction of GPU memory to use (0-1).

    Raises:
        RuntimeError: If ``docker run`` exits with a non-zero status.
    """
    # Docker-level flags (detached, named, GPU + model mount, port mapping).
    docker_args = [
        "docker",
        "run",
        "-d",
        "--name",
        CONTAINER_NAME,
        "--device=nvidia.com/gpu=all",
        "--ipc=host",
        "-v",
        f"{model_dir}:/models",
        "-p",
        f"{port}:8000",
    ]
    # Arguments passed through to the vLLM server inside the image.
    vllm_args = [
        VLLM_IMAGE,
        "--model",
        f"/models/{model}",
        "--served-model-name",
        model,
        "--gpu-memory-utilization",
        str(gpu_memory_utilization),
        "--max-model-len",
        "4096",
    ]
    logger.info("Starting vLLM container with model: %s", model)
    # Clear out any stale container with the same name first.
    stop_vllm()
    launch = subprocess.run([*docker_args, *vllm_args], capture_output=True, text=True, check=False)
    if launch.returncode != 0:
        msg = f"Failed to start vLLM container: {launch.stderr.strip()}"
        raise RuntimeError(msg)
    logger.info("vLLM container started: %s", launch.stdout.strip()[:12])
def stop_vllm() -> None:
    """Stop and remove the vLLM benchmark container (best-effort)."""
    logger.info("Stopping vLLM container")
    cleanup_commands = (
        ["docker", "stop", CONTAINER_NAME],
        ["docker", "rm", "-f", CONTAINER_NAME],
        ["docker", "network", "disconnect", "-f", "bridge", CONTAINER_NAME],
    )
    for cleanup in cleanup_commands:
        subprocess.run(cleanup, capture_output=True, check=False)
    logger.info("vLLM container stopped and removed")

View File

@@ -1,75 +0,0 @@
"""HuggingFace model downloader."""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Annotated
import typer
from huggingface_hub import snapshot_download
from python.prompt_bench.models import BenchmarkConfig
logger = logging.getLogger(__name__)
def local_model_path(repo: str, model_dir: str) -> Path:
    """Return the on-disk directory where ``repo`` is (or would be) stored."""
    base = Path(model_dir)
    return base / repo
def is_model_present(repo: str, model_dir: str) -> bool:
    """Check if a model has already been downloaded.

    Returns:
        True only when the model path is an existing, non-empty directory.
    """
    path = local_model_path(repo, model_dir)
    # is_dir (not exists): a stray plain file at this path would otherwise
    # make Path.iterdir raise NotADirectoryError.
    return path.is_dir() and any(path.iterdir())
def download_model(repo: str, model_dir: str) -> Path:
    """Download a HuggingFace model to the local model directory.

    Skips the download if the model directory already exists and contains files.
    """
    destination = local_model_path(repo, model_dir)
    if is_model_present(repo, model_dir):
        logger.info("Model already exists: %s", destination)
        return destination
    logger.info("Downloading model: %s -> %s", repo, destination)
    snapshot_download(repo_id=repo, local_dir=str(destination))
    logger.info("Download complete: %s", repo)
    return destination
def download_all(config: BenchmarkConfig) -> None:
    """Download every model listed in the config, top to bottom.

    Args:
        config: Benchmark configuration providing ``models`` and ``model_dir``.
    """
    for repo in config.models:
        download_model(repo, config.model_dir)
def main(
    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Download all models listed in the benchmark config.

    Raises:
        typer.BadParameter: If ``config`` is not an existing file.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    if not config.is_file():
        message = f"Config file does not exist: {config}"
        raise typer.BadParameter(message)
    benchmark_config = BenchmarkConfig.from_toml(config)
    download_all(benchmark_config)
def cli() -> None:
    """Console-script entry point wrapping ``main`` with Typer."""
    typer.run(main)


if __name__ == "__main__":
    cli()

View File

@@ -1,214 +0,0 @@
"""Fine-tune Qwen 3.5 4B on bill summarization data using Unsloth.
Loads a ChatML-style JSONL dataset (system/user/assistant messages),
applies QLoRA with 4-bit quantization, and saves the merged model
in HuggingFace format. Designed for a single RTX 3090 (24GB).
Usage:
python -m python.prompt_bench.finetune \
--dataset output/finetune_dataset.jsonl \
--output-dir output/qwen-bill-summarizer
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated
import tomllib
import typer
from unsloth import FastLanguageModel
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
logger = logging.getLogger(__name__)
@dataclass
class LoraConfig:
    """LoRA adapter hyperparameters."""

    rank: int  # LoRA rank (r): dimensionality of the low-rank update matrices.
    alpha: int  # LoRA scaling factor (lora_alpha).
    dropout: float  # Dropout probability applied to the adapter layers.
    targets: list[str]  # Names of the modules the adapters are attached to.
@dataclass
class TrainingConfig:
    """Training loop hyperparameters."""

    learning_rate: float  # Peak optimizer learning rate.
    epochs: int  # Number of passes over the training set.
    batch_size: int  # Per-device training batch size.
    gradient_accumulation: int  # Steps accumulated before each optimizer update.
    max_seq_length: int  # Maximum token length per training sequence.
    warmup_ratio: float  # Fraction of total steps spent warming up the LR.
    weight_decay: float  # Weight decay coefficient.
    logging_steps: int  # Interval (in steps) between training log lines.
    save_steps: int  # Checkpoint interval (also used as the eval interval in main).
@dataclass
class FinetuneConfig:
    """Top-level finetune configuration."""

    base_model: str  # HuggingFace model id (or local path) of the base model.
    lora: LoraConfig  # LoRA adapter settings.
    training: TrainingConfig  # Training loop settings.

    @classmethod
    def from_toml(cls, config_path: Path) -> FinetuneConfig:
        """Load finetune config from the ``[finetune]`` table of a TOML file.

        Raises:
            KeyError: If the file lacks a ``finetune`` table or required keys.
            TypeError: If ``lora``/``training`` tables contain unexpected keys.
        """
        raw = tomllib.loads(config_path.read_text())["finetune"]
        return cls(
            base_model=raw["base_model"],
            lora=LoraConfig(**raw["lora"]),
            training=TrainingConfig(**raw["training"]),
        )
def _messages_to_chatml(messages: list[dict]) -> str:
r"""Convert a message list to Qwen ChatML format.
Produces:
<|im_start|>system\n...\n<|im_end|>
<|im_start|>user\n...\n<|im_end|>
<|im_start|>assistant\n...\n<|im_end|>
"""
parts = []
for message in messages:
role = message["role"]
content = message["content"]
parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
return "\n".join(parts)
def load_dataset_from_jsonl(path: Path) -> Dataset:
    """Load a ChatML JSONL file into a HuggingFace Dataset.

    Each line must have {"messages": [{"role": ..., "content": ...}, ...]}.
    Pre-formats into a `text` column with the Qwen ChatML template applied,
    which SFTTrainer consumes directly.
    """
    records = []
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # tolerate blank lines
            entry = json.loads(stripped)
            records.append({"text": _messages_to_chatml(entry["messages"])})
    logger.info("Loaded %d examples from %s", len(records), path)
    return Dataset.from_list(records)
def main(
    dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
        "output/finetune_dataset.jsonl",
    ),
    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
        "output/qwen-bill-summarizer",
    ),
    config_path: Annotated[
        Path,
        typer.Option("--config", help="TOML config file"),
    ] = Path(__file__).parent / "config.toml",
    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
) -> None:
    """Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA.

    Raises:
        typer.BadParameter: If the dataset file does not exist.
    """
    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
        raise typer.BadParameter(message)
    config = FinetuneConfig.from_toml(config_path)
    logger.info("Loading base model: %s", config.base_model)
    # 4-bit base weights (QLoRA); dtype=None lets Unsloth pick the compute dtype.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config.base_model,
        max_seq_length=config.training.max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )
    logger.info("Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha)
    model = FastLanguageModel.get_peft_model(
        model,
        r=config.lora.rank,
        lora_alpha=config.lora.alpha,
        lora_dropout=config.lora.dropout,
        target_modules=config.lora.targets,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    full_dataset = load_dataset_from_jsonl(dataset_path)
    # Fixed seed so the train/validation split is reproducible across runs.
    split = full_dataset.train_test_split(test_size=validation_split, seed=42)
    train_dataset = split["train"]
    validation_dataset = split["test"]
    logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
    training_args = TrainingArguments(
        output_dir=str(output_dir / "checkpoints"),
        num_train_epochs=config.training.epochs,
        per_device_train_batch_size=config.training.batch_size,
        gradient_accumulation_steps=config.training.gradient_accumulation,
        learning_rate=config.training.learning_rate,
        warmup_ratio=config.training.warmup_ratio,
        weight_decay=config.training.weight_decay,
        lr_scheduler_type="cosine",
        logging_steps=config.training.logging_steps,
        save_steps=config.training.save_steps,
        save_total_limit=3,
        # Evaluate on the same cadence as checkpointing so
        # load_best_model_at_end can compare saved checkpoints.
        eval_strategy="steps",
        eval_steps=config.training.save_steps,
        load_best_model_at_end=True,
        bf16=True,
        optim="adamw_8bit",
        seed=42,
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        args=training_args,
        max_seq_length=config.training.max_seq_length,
        packing=True,
    )
    logger.info(
        "Starting training: %d train, %d val, %d epochs",
        len(train_dataset),
        len(validation_dataset),
        config.training.epochs,
    )
    trainer.train()
    # Merge the LoRA adapters into the base weights for standalone serving.
    merged_path = str(output_dir / "merged")
    logger.info("Saving merged model to %s", merged_path)
    model.save_pretrained_merged(merged_path, tokenizer, save_method="merged_16bit")
    if save_gguf:
        gguf_path = str(output_dir / "gguf")
        logger.info("Saving GGUF to %s", gguf_path)
        model.save_pretrained_gguf(gguf_path, tokenizer, quantization_method="q4_k_m")
    logger.info("Done! Model saved to %s", output_dir)
def cli() -> None:
    """Console-script entry point wrapping ``main`` with Typer."""
    typer.run(main)


if __name__ == "__main__":
    cli()

View File

@@ -1,215 +0,0 @@
"""CLI entry point for the prompt benchmarking system."""
from __future__ import annotations
import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Annotated
import typer
from python.prompt_bench.containers.lib import check_gpu_free
from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
from python.prompt_bench.downloader import is_model_present
from python.prompt_bench.models import BenchmarkConfig
from python.prompt_bench.vllm_client import VLLMClient
logger = logging.getLogger(__name__)
def discover_prompts(input_dir: Path) -> list[Path]:
    """Find all .txt files in the input directory.

    Raises:
        FileNotFoundError: If no ``.txt`` files are present.
    """
    discovered = [prompt for prompt in input_dir.glob("*.txt")]
    if discovered:
        return discovered
    message = f"No .txt files found in {input_dir}"
    raise FileNotFoundError(message)
def _run_prompt(
    client: VLLMClient,
    prompt_path: Path,
    *,
    repo: str,
    model_dir_name: str,
    model_output: Path,
    temperature: float,
) -> tuple[bool, float]:
    """Run a single prompt. Returns (success, elapsed_seconds).

    On failure, writes a ``<prompt>.error`` marker next to the would-be
    output so failed prompts can be told apart from never-attempted ones.
    """
    filename = prompt_path.name
    output_path = model_output / filename
    start = time.monotonic()
    try:
        prompt_text = prompt_path.read_text()
        response = client.complete(prompt_text, model_dir_name, temperature=temperature)
        output_path.write_text(response)
        elapsed = time.monotonic() - start
        logger.info("Completed: %s / %s in %.2fs", repo, filename, elapsed)
    except Exception as error:
        elapsed = time.monotonic() - start
        # Name the marker after the prompt: the original f-strings had no
        # placeholders, so every failure wrote the same "(unknown).error" file.
        error_path = model_output / f"{filename}.error"
        logger.exception("Failed: %s / %s after %.2fs", repo, filename, elapsed)
        error_path.write_text(f"Error processing {filename}: {error}")
        return False, elapsed
    return True, elapsed
def benchmark_model(
    client: VLLMClient,
    prompts: list[Path],
    *,
    repo: str,
    model_dir_name: str,
    model_output: Path,
    temperature: float,
    concurrency: int,
) -> tuple[int, int]:
    """Run all prompts against a single model in parallel.

    vLLM batches concurrent requests internally, so submitting many at once is
    significantly faster than running them serially.

    Args:
        client: Ready vLLM client.
        prompts: Candidate prompt files; ones with existing output are skipped.
        repo: Model repo name (used for logging and the timing record).
        model_dir_name: Model name passed to the completion API.
        model_output: Directory receiving one output file per prompt.
        temperature: Sampling temperature.
        concurrency: Number of worker threads submitting prompts.

    Returns:
        ``(completed, failed)`` counts for this model.
    """
    # Resume support: any prompt that already has an output file is skipped.
    pending = [prompt for prompt in prompts if not (model_output / prompt.name).exists()]
    skipped = len(prompts) - len(pending)
    if skipped:
        logger.info("Skipping %d prompts with existing output for %s", skipped, repo)
    if not pending:
        logger.info("Nothing to do for %s", repo)
        return 0, 0
    completed = 0
    failed = 0
    latencies: list[float] = []
    wall_start = time.monotonic()
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [
            executor.submit(
                _run_prompt,
                client,
                prompt_path,
                repo=repo,
                model_dir_name=model_dir_name,
                model_output=model_output,
                temperature=temperature,
            )
            for prompt_path in pending
        ]
        # Tally results as they finish, in completion order.
        for future in as_completed(futures):
            success, elapsed = future.result()
            latencies.append(elapsed)
            if success:
                completed += 1
            else:
                failed += 1
    wall_elapsed = time.monotonic() - wall_start
    # pending is non-empty here, so attempted > 0 and the average is safe.
    attempted = completed + failed
    avg_latency = sum(latencies) / attempted
    throughput = attempted / wall_elapsed if wall_elapsed > 0 else 0.0
    timing = {
        "repo": repo,
        "wall_seconds": wall_elapsed,
        "attempted": attempted,
        "completed": completed,
        "failed": failed,
        "avg_latency_seconds": avg_latency,
        "throughput_prompts_per_second": throughput,
        "concurrency": concurrency,
    }
    # Persist per-model timing alongside the outputs for later comparison.
    timing_path = model_output / "_timing.json"
    timing_path.write_text(json.dumps(timing, indent=2))
    return completed, failed
def run_benchmark(
    config: BenchmarkConfig,
    input_dir: Path,
    output_dir: Path,
) -> None:
    """Execute the benchmark across all models and prompts.

    Args:
        config: Benchmark settings (models, port, limits).
        input_dir: Directory of ``.txt`` prompt files.
        output_dir: Root directory for per-model result folders.

    Raises:
        FileNotFoundError: If ``input_dir`` contains no ``.txt`` prompts.
    """
    prompts = discover_prompts(input_dir)
    logger.info("Found %d prompts in %s", len(prompts), input_dir)
    # Warn (not fail) if other GPU workloads are running.
    check_gpu_free()
    total_completed = 0
    total_failed = 0
    for repo in config.models:
        if not is_model_present(repo, config.model_dir):
            logger.warning("Skipping (not downloaded): %s", repo)
            continue
        model_output = output_dir / repo
        model_output.mkdir(parents=True, exist_ok=True)
        logger.info("=== Benchmarking model: %s ===", repo)
        # One model at a time: tear down any previous server before starting.
        stop_vllm()
        try:
            start_vllm(
                model=repo,
                port=config.port,
                model_dir=config.model_dir,
                gpu_memory_utilization=config.gpu_memory_utilization,
            )
        except RuntimeError:
            logger.exception("Failed to start vLLM for %s, skipping", repo)
            continue
        logger.info("vLLM started for %s", repo)
        try:
            with VLLMClient(port=config.port, timeout=config.timeout) as client:
                client.wait_ready(max_wait=config.vllm_startup_timeout)
                completed, failed = benchmark_model(
                    client,
                    prompts,
                    repo=repo,
                    model_dir_name=repo,
                    model_output=model_output,
                    temperature=config.temperature,
                    concurrency=config.concurrency,
                )
                total_completed += completed
                total_failed += failed
        finally:
            # Always free the GPU, even if the benchmark raised.
            stop_vllm()
    logger.info("=== Benchmark complete ===")
    logger.info("Completed: %d | Failed: %d", total_completed, total_failed)
def main(
    input_dir: Annotated[Path, typer.Argument(help="Directory containing input .txt prompt files")],
    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
    output_dir: Annotated[Path, typer.Option(help="Output directory for results")] = Path("output"),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Run prompts through multiple LLMs via vLLM and save results.

    Raises:
        typer.BadParameter: If ``input_dir`` or ``config`` does not exist.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    if not input_dir.is_dir():
        message = f"Input directory does not exist: {input_dir}"
        raise typer.BadParameter(message)
    if not config.is_file():
        message = f"Config file does not exist: {config}"
        raise typer.BadParameter(message)
    benchmark_config = BenchmarkConfig.from_toml(config)
    output_dir.mkdir(parents=True, exist_ok=True)
    run_benchmark(benchmark_config, input_dir, output_dir)
def cli() -> None:
    """Console-script entry point wrapping ``main`` with Typer."""
    typer.run(main)


if __name__ == "__main__":
    cli()

View File

@@ -1,30 +0,0 @@
"""Pydantic models for benchmark configuration."""
from __future__ import annotations
import tomllib
from typing import TYPE_CHECKING
from pydantic import BaseModel
if TYPE_CHECKING:
from pathlib import Path
class BenchmarkConfig(BaseModel):
    """Top-level benchmark configuration loaded from TOML."""

    models: list[str]  # HuggingFace repo names to benchmark, in run order.
    model_dir: str = "/zfs/models/hf"  # Host directory holding downloaded models.
    port: int = 8000  # Host port the vLLM container binds to.
    gpu_memory_utilization: float = 0.90  # Fraction of GPU memory vLLM may use.
    temperature: float = 0.0  # Sampling temperature for completions.
    timeout: int = 300  # Per-request timeout in seconds.
    concurrency: int = 4  # Parallel prompt requests per model.
    vllm_startup_timeout: int = 900  # Max seconds to wait for vLLM readiness.

    @classmethod
    def from_toml(cls, config_path: Path) -> BenchmarkConfig:
        """Load benchmark config from the ``[bench]`` table of a TOML file."""
        raw = tomllib.loads(config_path.read_text())["bench"]
        return cls(**raw)

View File

@@ -1,34 +0,0 @@
# System prompt for bill summarization: extraction rules plus a fixed
# plain-text output format (explicitly not JSON) with a 150-250 word target.
SUMMARIZATION_SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.
Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.
EXTRACTION RULES:
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.
OUTPUT FORMAT — plain structured text, not JSON:
OPERATIVE ACTIONS:
[Numbered list of what the bill actually does, one action per line, max 20 words each]
AFFECTED POPULATIONS:
[Who gains something, who loses something, or whose behavior is regulated]
MECHANISMS:
[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]
POLICY THREADS:
[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]
SYMBOLIC/PROCEDURAL ONLY:
[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]
LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""

# User-turn template; format with ``text_content`` set to the raw bill text.
SUMMARIZATION_USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.
BILL TEXT:
{text_content}"""

View File

@@ -1,114 +0,0 @@
"""Build a fine-tuning JSONL dataset from batch request + output files.
Joins the original request JSONL (system + user messages) with the batch
output JSONL (assistant completions) by custom_id to produce a ChatML-style
messages JSONL suitable for fine-tuning.
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Annotated
import typer
logger = logging.getLogger(__name__)
HTTP_OK = 200
def load_requests(path: Path) -> dict[str, list[dict]]:
    """Parse request JSONL into {custom_id: messages}."""
    requests_by_id: dict[str, list[dict]] = {}
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # tolerate blank lines
            record = json.loads(stripped)
            requests_by_id[record["custom_id"]] = record["body"]["messages"]
    return requests_by_id
def load_completions(path: Path) -> dict[str, str]:
    """Parse batch output JSONL into {custom_id: assistant_content}.

    Records that did not complete with HTTP 200, have no choices, or have an
    empty message body are logged and skipped.
    """
    completions: dict[str, str] = {}
    with path.open(encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, 1):
            stripped = raw_line.strip()
            if not stripped:
                continue
            record = json.loads(stripped)
            custom_id = record["custom_id"]
            response = record.get("response", {})
            if response.get("status_code") != HTTP_OK:
                logger.warning("Skipping %s (line %d): status %s", custom_id, line_number, response.get("status_code"))
                continue
            choices = response.get("body", {}).get("choices", [])
            if not choices:
                logger.warning("Skipping %s (line %d): no choices", custom_id, line_number)
                continue
            content = choices[0].get("message", {}).get("content", "")
            if not content:
                logger.warning("Skipping %s (line %d): empty content", custom_id, line_number)
                continue
            completions[custom_id] = content
    return completions
def main(
    requests_path: Annotated[Path, typer.Option("--requests", help="Batch request JSONL")] = Path(
        "output/openai_batch/requests.jsonl",
    ),
    batch_output: Annotated[Path, typer.Option("--batch-output", help="Batch output JSONL")] = Path(
        "batch_69d84558d91c819091d53f08d78f9fd6_output.jsonl",
    ),
    output_path: Annotated[Path, typer.Option("--output", help="Fine-tuning JSONL output")] = Path(
        "output/finetune_dataset.jsonl",
    ),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
    """Build fine-tuning dataset by joining request and output JSONL files.

    Requests and completions are joined on ``custom_id``; requests with no
    successful completion are counted and skipped.
    """
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    logger.info("Loading requests from %s", requests_path)
    requests = load_requests(requests_path)
    logger.info("Loaded %d requests", len(requests))
    logger.info("Loading completions from %s", batch_output)
    completions = load_completions(batch_output)
    logger.info("Loaded %d completions", len(completions))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    matched = 0
    skipped = 0
    with output_path.open("w", encoding="utf-8") as handle:
        for custom_id, messages in requests.items():
            assistant_content = completions.get(custom_id)
            if assistant_content is None:
                skipped += 1
                continue
            # Append the assistant turn to the original system/user messages.
            example = {
                "messages": [*messages, {"role": "assistant", "content": assistant_content}],
            }
            # ensure_ascii=False keeps non-ASCII bill text readable in the JSONL.
            handle.write(json.dumps(example, ensure_ascii=False))
            handle.write("\n")
            matched += 1
    logger.info("Wrote %d examples to %s (skipped %d unmatched)", matched, output_path, skipped)
def cli() -> None:
    """Console-script entry point wrapping ``main`` with Typer."""
    typer.run(main)


if __name__ == "__main__":
    cli()

View File

@@ -1,97 +0,0 @@
"""Sum token usage across compressed and uncompressed run directories."""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Annotated
import typer
logger = logging.getLogger(__name__)
@dataclass
class UsageTotals:
    """Aggregate usage counters for a directory of run records."""

    files: int = 0  # JSON files seen.
    errors: int = 0  # Files missing a "usage" object.
    prompt_tokens: int = 0  # Sum of prompt tokens.
    cached_tokens: int = 0  # Sum of cached prompt tokens.
    completion_tokens: int = 0  # Sum of completion tokens.
    reasoning_tokens: int = 0  # Sum of reasoning tokens.
    total_tokens: int = 0  # Sum of total tokens.
    # One (name, prompt, completion, total) tuple per file with usage data.
    per_file: list[tuple[str, int, int, int]] = field(default_factory=list)
def tally_directory(directory: Path) -> UsageTotals:
    """Return aggregated usage stats for every JSON record in a directory."""
    totals = UsageTotals()
    decoder = json.JSONDecoder()
    for path in sorted(directory.glob("*.json")):
        # raw_decode reads the first JSON object and tolerates trailing data.
        record, _ = decoder.raw_decode(path.read_text().lstrip())
        totals.files += 1
        usage = record.get("usage")
        if not usage:
            totals.errors += 1
            continue
        prompt_tokens = usage.get("prompt_tokens", 0)
        completion_tokens = usage.get("completion_tokens", 0)
        total_tokens = usage.get("total_tokens", 0)
        prompt_details = usage.get("prompt_tokens_details") or {}
        cached_tokens = prompt_details.get("cached_tokens", 0)
        completion_details = usage.get("completion_tokens_details") or {}
        reasoning_tokens = completion_details.get("reasoning_tokens", 0)
        totals.prompt_tokens += prompt_tokens
        totals.completion_tokens += completion_tokens
        totals.total_tokens += total_tokens
        totals.cached_tokens += cached_tokens
        totals.reasoning_tokens += reasoning_tokens
        totals.per_file.append((path.name, prompt_tokens, completion_tokens, total_tokens))
    return totals
def log_totals(label: str, totals: UsageTotals) -> None:
    """Log a one-block summary for a directory.

    Args:
        label: Heading printed in brackets above the numbers.
        totals: Aggregated counters to report.
    """
    counted = totals.files - totals.errors
    # Guard the average against empty or all-error directories.
    average_total = totals.total_tokens / counted if counted else 0
    logger.info("[%s]", label)
    logger.info(" files : %d (with usage: %d, errors: %d)", totals.files, counted, totals.errors)
    logger.info(" prompt tokens : %d", totals.prompt_tokens)
    logger.info(" cached tokens : %d", totals.cached_tokens)
    logger.info(" completion tok : %d", totals.completion_tokens)
    logger.info(" reasoning tok : %d", totals.reasoning_tokens)
    logger.info(" total tokens : %d", totals.total_tokens)
    logger.info(" avg total/file : %.1f", average_total)
def main(
    runs_dir: Annotated[Path, typer.Option("--runs-dir")] = Path("output/openai_runs_temp_1"),
    log_level: Annotated[str, typer.Option("--log-level")] = "INFO",
) -> None:
    """Print token usage totals for the compressed and uncompressed run directories."""
    logging.basicConfig(level=log_level, format="%(message)s")
    grand = UsageTotals()
    for label in ("compressed", "uncompressed"):
        directory = runs_dir / label
        if not directory.is_dir():
            logger.warning("%s: directory not found at %s", label, directory)
            continue
        totals = tally_directory(directory)
        log_totals(label, totals)
        # Fold this directory's counters into the grand total
        # (per_file lists are intentionally not merged).
        grand.files += totals.files
        grand.errors += totals.errors
        grand.prompt_tokens += totals.prompt_tokens
        grand.cached_tokens += totals.cached_tokens
        grand.completion_tokens += totals.completion_tokens
        grand.reasoning_tokens += totals.reasoning_tokens
        grand.total_tokens += totals.total_tokens
    log_totals("grand total", grand)


if __name__ == "__main__":
    typer.run(main)

View File

@@ -1,68 +0,0 @@
"""OpenAI-compatible client for vLLM's API."""
from __future__ import annotations
import logging
import time
from typing import Self
import httpx
logger = logging.getLogger(__name__)
READY_POLL_INTERVAL = 2.0
class VLLMClient:
    """Talk to a vLLM server via its OpenAI-compatible API.

    Args:
        host: vLLM host.
        port: vLLM port.
        timeout: Per-request timeout in seconds.
    """

    def __init__(self, *, host: str = "localhost", port: int = 8000, timeout: int = 300) -> None:
        """Create a client connected to a vLLM server."""
        base_url = f"http://{host}:{port}"
        self._client = httpx.Client(base_url=base_url, timeout=timeout)

    def wait_ready(self, max_wait: int) -> None:
        """Poll /v1/models until the server is ready or timeout.

        Raises:
            TimeoutError: If the server is not ready within ``max_wait`` seconds.
        """
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            try:
                if self._client.get("/v1/models").is_success:
                    logger.info("vLLM server is ready")
                    return
            except httpx.TransportError:
                # Server not accepting connections yet; retry after a pause.
                pass
            time.sleep(READY_POLL_INTERVAL)
        msg = f"vLLM server not ready after {max_wait}s"
        raise TimeoutError(msg)

    def complete(self, prompt: str, model: str, *, temperature: float = 0.0, max_tokens: int = 4096) -> str:
        """Send a prompt to /v1/completions and return the response text."""
        body = {
            "model": model,
            "prompt": prompt,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        logger.info("Sending prompt to %s (%d chars)", model, len(prompt))
        reply = self._client.post("/v1/completions", json=body)
        reply.raise_for_status()
        return reply.json()["choices"][0]["text"]

    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()

    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self

    def __exit__(self, *args: object) -> None:
        """Close the HTTP client on exit."""
        self.close()

View File

@@ -0,0 +1,17 @@
# CUDA + cuDNN runtime base so faster-whisper can run on the GPU.
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

# Non-interactive apt installs; no .pyc files; unbuffered output for docker logs.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Python runtime plus ffmpeg for audio decoding; clean apt lists to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Transcription dependencies.
RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir faster-whisper requests

WORKDIR /app
# Only the single entrypoint script is copied (see .dockerignore).
COPY python/tools/whisper/inference.py /app/inference.py
ENTRYPOINT ["python3", "/app/inference.py"]

View File

@@ -0,0 +1,2 @@
*
!python/tools/whisper/inference.py

View File

@@ -0,0 +1 @@
"""Whisper transcription tools (host orchestrator and container entrypoint)."""

View File

@@ -0,0 +1,136 @@
"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
Run inside the whisper-transcribe docker image; segment timestamps are grouped
into one-minute buckets so the output reads as ``[HH:MM:00] text``.
"""
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from faster_whisper import WhisperModel
logger = logging.getLogger(__name__)
# Recognised audio/video file extensions (matched against the lower-cased suffix).
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
BUCKET_SECONDS = 60  # Transcript segments are grouped into one-minute buckets.
BEAM_SIZE = 5  # Whisper decoding beam width.
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60


def format_timestamp(total_seconds: float) -> str:
    """Render a whole-minute timestamp as ``HH:MM:00``.

    Args:
        total_seconds: Offset in seconds from the start of the audio.

    Returns:
        A zero-padded ``HH:MM:00`` string.
    """
    whole_minutes = int(total_seconds // SECONDS_PER_MINUTE)
    hours, minutes = divmod(whole_minutes, 60)  # 60 minutes per hour
    return f"{hours:02d}:{minutes:02d}:00"
def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe one audio file and write the bucketed transcript to disk.

    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(
        str(audio_path),
        language="en",
        beam_size=BEAM_SIZE,
        vad_filter=True,
    )
    logger.info("Duration %.1fs", info.duration)
    # Group segment texts by the minute in which each segment starts.
    minute_buckets: dict[int, list[str]] = {}
    for segment in segments:
        minute = int(segment.start // BUCKET_SECONDS)
        minute_buckets.setdefault(minute, []).append(segment.text.strip())
    transcript_lines = []
    for minute in sorted(minute_buckets):
        stamp = format_timestamp(minute * BUCKET_SECONDS)
        transcript_lines.append(f"[{stamp}] {' '.join(minute_buckets[minute])}")
    output_path.write_text("\n\n".join(transcript_lines) + "\n", encoding="utf-8")
    logger.info("Wrote %s", output_path)
def find_audio_files(input_directory: Path) -> list[Path]:
    """Collect every audio file under ``input_directory``.

    Args:
        input_directory: Directory to walk recursively.

    Returns:
        Sorted list of audio file paths.
    """
    matches = [
        candidate
        for candidate in input_directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
    ]
    return sorted(matches)
def configure_container_logger() -> None:
    """Configure root logging for the container: INFO level with timestamps.

    Intended to be called once at process startup, before any log lines.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
def parse_arguments() -> argparse.Namespace:
    """Parse CLI arguments for the container entrypoint.

    Returns:
        Namespace with ``input``, ``output``, ``model`` and ``download_only``
        attributes; the path options default to the container mount points.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Both directory options share the same shape: a Path with a mount default.
    for flag, default in (("--input", Path("/audio")), ("--output", Path("/output"))):
        parser.add_argument(flag, type=Path, default=default)
    parser.add_argument("--model", default="large-v3")
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download the model into the cache volume and exit without transcribing.",
    )
    return parser.parse_args()
def _transcribe_one(model: WhisperModel, audio_path: Path, input_root: Path, output_root: Path) -> None:
    """Transcribe one file, mirroring its relative path under ``output_root``.

    Existing transcripts are left untouched so reruns only process new files.
    """
    relative = audio_path.relative_to(input_root)
    output_path = output_root / relative.with_suffix(".txt")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if output_path.exists():
        logger.info("Skip %s (already transcribed)", relative)
        return
    transcribe_file(model, audio_path, output_path)


def main() -> None:
    """Load the model, then either exit (download-only) or transcribe the directory."""
    configure_container_logger()
    arguments = parse_arguments()
    logger.info("Loading model %s on CUDA", arguments.model)
    model = WhisperModel(arguments.model, device="cuda", compute_type="float16")
    if arguments.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return
    arguments.output.mkdir(parents=True, exist_ok=True)
    audio_files = find_audio_files(arguments.input)
    if not audio_files:
        logger.warning("No audio files found in %s", arguments.input)
        return
    logger.info("Found %d audio file(s)", len(audio_files))
    for audio_path in audio_files:
        _transcribe_one(model, audio_path, arguments.input, arguments.output)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,167 @@
"""Build and run the whisper transcription docker container on demand.
The container is started fresh for each invocation and removed on exit
(``docker run --rm``). The model is cached in a named docker volume so
only the first run pays the download cost.
"""
from __future__ import annotations
import logging
import subprocess
from pathlib import Path
from typing import Annotated
import typer
from python.common import configure_logger
logger = logging.getLogger(__name__)
class Config:
    """Paths and names for the whisper-transcribe Docker workflow."""
    # Tag used for both `docker build` and `docker run`.
    image_tag = "whisper-transcribe:latest"
    # Named docker volume that persists downloaded model weights across runs.
    model_volume = "whisper-models"
    # NOTE(review): assumes this file sits three directories below the repo
    # root (used as the docker build context) — confirm against the tree layout.
    repo_root = Path(__file__).resolve().parents[3]
    # The Dockerfile lives next to this script.
    dockerfile = Path(__file__).resolve().parent / "Dockerfile"
    # Mount point for the HuggingFace model cache inside the container.
    huggingface_cache = "/root/.cache/huggingface"
def run_docker(arguments: list[str]) -> None:
    """Run a docker subcommand, streaming output and raising on failure.

    Args:
        arguments: Arguments to pass to the ``docker`` binary.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    logger.info("docker %s", " ".join(arguments))
    command = ["docker"]
    command.extend(arguments)
    subprocess.run(command, check=True)
def build_image() -> None:
    """Build the whisper-transcribe image using the repo root as build context."""
    logger.info("Building image %s", Config.image_tag)
    build_arguments = [
        "build",
        "--tag",
        Config.image_tag,
        "--file",
        str(Config.dockerfile),
        str(Config.repo_root),
    ]
    run_docker(build_arguments)
def model_cache_present(model: str) -> bool:
    """Check whether the given model is already downloaded in the cache volume.

    Args:
        model: faster-whisper model name (e.g. ``large-v3``).

    Returns:
        True if the HuggingFace cache directory for the model exists in the volume.
    """
    cache_directory = f"hub/models--Systran--faster-whisper-{model}"
    # Probe the volume with a throwaway alpine container; `test -d` encodes the
    # answer in the exit code, so a non-zero status is expected and not raised.
    probe_command = [
        "docker",
        "run",
        "--rm",
        "--volume",
        f"{Config.model_volume}:/cache",
        "alpine",
        "test",
        "-d",
        f"/cache/{cache_directory}",
    ]
    completed = subprocess.run(probe_command, check=False)
    return completed.returncode == 0
def download_model(model: str) -> None:
    """Download the model into the cache volume and exit.

    Args:
        model: faster-whisper model name.
    """
    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
    download_arguments = [
        "run",
        "--rm",
        "--device=nvidia.com/gpu=all",
        "--ipc=host",
        "--volume",
        f"{Config.model_volume}:{Config.huggingface_cache}",
        Config.image_tag,
        "--model",
        model,
        "--download-only",
    ]
    run_docker(download_arguments)
def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
    """Run transcription on every audio file under ``input_directory``.

    Args:
        input_directory: Host path containing audio files (mounted read-only).
        output_directory: Host path for ``.txt`` transcripts.
        model: faster-whisper model name.
    """
    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
    # Assemble the three bind/volume mounts in the same order as before.
    volume_flags: list[str] = []
    for mount in (
        f"{input_directory}:/audio:ro",
        f"{output_directory}:/output",
        f"{Config.model_volume}:{Config.huggingface_cache}",
    ):
        volume_flags += ["--volume", mount]
    run_docker(
        [
            "run",
            "--rm",
            "--device=nvidia.com/gpu=all",
            "--ipc=host",
            *volume_flags,
            Config.image_tag,
            "--model",
            model,
        ]
    )
def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
    *,
    force_download: Annotated[
        bool,
        typer.Option("--force-download", help="Re-download the model even if already cached."),
    ] = False,
) -> None:
    """Build the image, ensure the model is cached, then transcribe and stop."""
    configure_logger()
    # Fail fast if the input directory does not exist; create the output dir.
    resolved_input = input_directory.resolve(strict=True)
    output_directory.mkdir(parents=True, exist_ok=True)
    resolved_output = output_directory.resolve()
    build_image()
    # Short-circuit keeps the cache probe from running under --force-download.
    needs_download = force_download or not model_cache_present(model)
    if needs_download:
        download_model(model)
    else:
        logger.info("Model %s already cached in volume %s", model, Config.model_volume)
    transcribe(resolved_input, resolved_output, model)
    logger.info("Done. Container stopped.")


if __name__ == "__main__":
    typer.run(main)

View File

@@ -1,8 +1,9 @@
{ inputs, pkgs, ... }: { inputs, pkgs, ... }:
{ {
imports = [ imports = [
"${inputs.self}/users/richie"
"${inputs.self}/users/math" "${inputs.self}/users/math"
"${inputs.self}/users/richie"
"${inputs.self}/users/steve"
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/scanner.nix" "${inputs.self}/common/optional/scanner.nix"

View File

@@ -28,7 +28,6 @@
allowDiscards = true; allowDiscards = true;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-Samsung_Flash_Drive_FIT_0374620080067131-0:0"; keyFile = "/dev/disk/by-id/usb-Samsung_Flash_Drive_FIT_0374620080067131-0:0";
fallbackToPassword = true;
}; };
}; };
kernelModules = [ "kvm-amd" ]; kernelModules = [ "kvm-amd" ];

View File

@@ -42,11 +42,13 @@
"qwen3:8b" "qwen3:8b"
"qwen3.5:27b" "qwen3.5:27b"
"qwen3.5:35b" "qwen3.5:35b"
"qwen3.6:35b"
"translategemma:12b" "translategemma:12b"
"translategemma:27b" "translategemma:27b"
"translategemma:4b" "translategemma:4b"
"rinex20/translategemma3:12b"
]; ];
models = "/zfs/models"; models = "/zfs/storage/models";
openFirewall = true; openFirewall = true;
}; };
} }

View File

@@ -1,11 +0,0 @@
#!/bin/bash
# zpools
# storage
sudo zpool create -f -o ashift=12 -O acltype=posixacl -O atime=off -O dnodesize=auto -O xattr=sa -O compression=zstd -m /zfs/storage storage mirror
sudo zpool create -o ashift=12 -O acltype=posixacl -O atime=off -O dnodesize=auto -O xattr=sa -O compression=zstd -m /zfs/storage storage
# storage datasets
sudo zfs create storage/models -o recordsize=1M

View File

@@ -24,6 +24,6 @@ monthly = 0
["root_pool/models"] ["root_pool/models"]
15_min = 4 15_min = 4
hourly = 2 hourly = 24
daily = 0 daily = 0
monthly = 0 monthly = 0

View File

@@ -31,5 +31,15 @@
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
"recordings" = {
path = "/home/richie/recordings";
devices = [
"jeeves"
"phone"
"rhapsody-in-green"
];
fsWatcherEnabled = true;
};
}; };
} }

View File

@@ -26,7 +26,6 @@
allowDiscards = true; allowDiscards = true;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-USB_SanDisk_3.2Gen1_03021630090925173333-0:0"; keyFile = "/dev/disk/by-id/usb-USB_SanDisk_3.2Gen1_03021630090925173333-0:0";
fallbackToPassword = true;
}; };
}; };
kernelModules = [ "kvm-intel" ]; kernelModules = [ "kvm-intel" ];

View File

@@ -4,9 +4,10 @@ let
in in
{ {
imports = [ imports = [
"${inputs.self}/users/richie"
"${inputs.self}/users/math"
"${inputs.self}/users/dov" "${inputs.self}/users/dov"
"${inputs.self}/users/math"
"${inputs.self}/users/richie"
"${inputs.self}/users/steve"
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/ssh_decrypt.nix" "${inputs.self}/common/optional/ssh_decrypt.nix"
@@ -15,6 +16,7 @@ in
"${inputs.self}/common/optional/zerotier.nix" "${inputs.self}/common/optional/zerotier.nix"
./docker ./docker
./services ./services
./web_services
./hardware.nix ./hardware.nix
./networking.nix ./networking.nix
./programs.nix ./programs.nix

View File

@@ -9,7 +9,6 @@ let
inherit device; inherit device;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-XIAO_USB_Drive_24587CE29074-0:0"; keyFile = "/dev/disk/by-id/usb-XIAO_USB_Drive_24587CE29074-0:0";
fallbackToPassword = true;
}; };
makeLuksSSD = makeLuksSSD =
device: device:

View File

@@ -1,17 +0,0 @@
{ pkgs, ... }:
let
vars = import ../vars.nix;
in
{
systemd.services.cloud_flare_tunnel = {
description = "cloud_flare_tunnel proxy's traffic through cloudflare";
after = [ "network.target" ];
wantedBy = [ "multi-user.target" ];
serviceConfig = {
Type = "simple";
EnvironmentFile = "${vars.secrets}/docker/cloud_flare_tunnel";
ExecStart = "${pkgs.cloudflared}/bin/cloudflared --no-autoupdate tunnel run";
Restart = "on-failure";
};
};
}

View File

@@ -2,7 +2,10 @@ let
vars = import ../vars.nix; vars = import ../vars.nix;
in in
{ {
networking.firewall.allowedTCPPorts = [ 6443 ]; networking.firewall.allowedTCPPorts = [
6443
2223
];
services.gitea = { services.gitea = {
enable = true; enable = true;
@@ -24,7 +27,7 @@ in
ROOT_URL = "https://gitea.tmmworkshop.com/"; ROOT_URL = "https://gitea.tmmworkshop.com/";
HTTP_PORT = 6443; HTTP_PORT = 6443;
SSH_PORT = 2223; SSH_PORT = 2223;
SSH_LISTEN_PORT = 2224; SSH_LISTEN_PORT = 2223;
START_SSH_SERVER = true; START_SSH_SERVER = true;
PUBLIC_URL_DETECTION = "auto"; PUBLIC_URL_DETECTION = "auto";
}; };

View File

@@ -1,7 +1,6 @@
zpool = ["root_pool", "storage", "media"] zpool = ["root_pool", "storage", "media"]
services = [ services = [
"audiobookshelf", "audiobookshelf",
"cloud_flare_tunnel",
"haproxy", "haproxy",
"docker", "docker",
"home-assistant", "home-assistant",

View File

@@ -89,7 +89,16 @@ in
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
# "recordings" = {
path = "/home/richie/recordings";
devices = [
"bob"
"phone"
"rhapsody-in-green"
];
fsWatcherEnabled = true;
};
# davids-server
"davids-backup1" = { "davids-backup1" = {
id = "8229p-8z3tm"; # cspell:disable-line id = "8229p-8z3tm"; # cspell:disable-line
path = "${vars.syncthing}/davids_backups/1"; path = "${vars.syncthing}/davids_backups/1";

View File

@@ -0,0 +1,62 @@
let
domains = [
"audiobookshelf"
"cache"
"gitea"
"jellyfin"
"share"
];
makeCert = name: {
name = "${name}.tmmworkshop.com";
value = {
webroot = "/var/lib/acme/.challenges";
group = "acme";
reloadServices = [ "haproxy.service" ];
};
};
acmeServices = map (domain: "acme-${domain}.tmmworkshop.com.service") domains;
in
{
users.users.haproxy.extraGroups = [ "acme" ];
security.acme = {
acceptTerms = true;
defaults.email = "Richie@tmmworkshop.com";
certs = builtins.listToAttrs (map makeCert domains);
};
# Minimal nginx to serve ACME HTTP-01 challenge files for HAProxy
services.nginx = {
enable = true;
virtualHosts."acme-challenge" = {
listen = [
{
addr = "127.0.0.1";
port = 8402;
}
];
locations."/.well-known/acme-challenge/" = {
root = "/var/lib/acme/.challenges";
};
};
};
# Ensure the challenge directory exists with correct permissions
systemd.tmpfiles.rules = [
"d /var/lib/acme/.challenges 0750 acme acme - -"
"d /var/lib/acme/.challenges/.well-known 0750 acme acme - -"
"d /var/lib/acme/.challenges/.well-known/acme-challenge 0750 acme acme - -"
];
users.users.nginx.extraGroups = [ "acme" ];
# HAProxy needs certs to exist before it can bind :443.
# NixOS's acme module generates self-signed placeholders on first boot
# via acme-<domain>.service — just make HAProxy wait for them.
systemd.services.haproxy = {
after = acmeServices;
wants = acmeServices;
};
}

View File

@@ -0,0 +1,9 @@
{ lib, ... }:
{
imports =
let
files = builtins.attrNames (builtins.readDir ./.);
nixFiles = builtins.filter (name: lib.hasSuffix ".nix" name && name != "default.nix") files;
in
map (file: ./. + "/${file}") nixFiles;
}

View File

@@ -6,6 +6,7 @@ global
defaults defaults
log global log global
mode http mode http
option httplog
retries 3 retries 3
maxconn 2000 maxconn 2000
timeout connect 5s timeout connect 5s
@@ -22,25 +23,37 @@ defaults
#Application Setup #Application Setup
frontend ContentSwitching frontend ContentSwitching
bind *:80 v4v6 bind *:80 v4v6
bind *:443 v4v6 ssl crt /zfs/storage/secrets/docker/cloudflare.pem bind *:443 v4v6 ssl crt /var/lib/acme/audiobookshelf.tmmworkshop.com/full.pem crt /var/lib/acme/cache.tmmworkshop.com/full.pem crt /var/lib/acme/jellyfin.tmmworkshop.com/full.pem crt /var/lib/acme/share.tmmworkshop.com/full.pem crt /var/lib/acme/gitea.tmmworkshop.com/full.pem
mode http mode http
# ACME challenge routing (must be first)
acl is_acme path_beg /.well-known/acme-challenge/
use_backend acme_challenge if is_acme
# tmmworkshop.com # tmmworkshop.com
acl host_audiobookshelf hdr(host) -i audiobookshelf.tmmworkshop.com acl host_audiobookshelf hdr(host) -i audiobookshelf.tmmworkshop.com
acl host_cache hdr(host) -i cache.tmmworkshop.com acl host_cache hdr(host) -i cache.tmmworkshop.com
acl host_jellyfin hdr(host) -i jellyfin.tmmworkshop.com acl host_jellyfin hdr(host) -i jellyfin.tmmworkshop.com
acl host_share hdr(host) -i share.tmmworkshop.com acl host_share hdr(host) -i share.tmmworkshop.com
acl host_gcw hdr(host) -i gcw.tmmworkshop.com
acl host_n8n hdr(host) -i n8n.tmmworkshop.com
acl host_gitea hdr(host) -i gitea.tmmworkshop.com acl host_gitea hdr(host) -i gitea.tmmworkshop.com
# Hosts allowed to serve plain HTTP (add entries to skip the HTTPS redirect)
acl allow_http hdr(host) -i __none__
# acl allow_http hdr(host) -i example.tmmworkshop.com
# Redirect all HTTP to HTTPS unless on the allow list or ACME challenge
http-request redirect scheme https code 301 if !{ ssl_fc } !allow_http !is_acme
use_backend audiobookshelf_nodes if host_audiobookshelf use_backend audiobookshelf_nodes if host_audiobookshelf
use_backend cache_nodes if host_cache use_backend cache_nodes if host_cache
use_backend jellyfin if host_jellyfin use_backend jellyfin if host_jellyfin
use_backend share_nodes if host_share use_backend share_nodes if host_share
use_backend gcw_nodes if host_gcw
use_backend n8n if host_n8n
use_backend gitea if host_gitea use_backend gitea if host_gitea
backend acme_challenge
mode http
server acme 127.0.0.1:8402
backend audiobookshelf_nodes backend audiobookshelf_nodes
mode http mode http
server server 127.0.0.1:8000 server server 127.0.0.1:8000
@@ -60,14 +73,6 @@ backend share_nodes
mode http mode http
server server 127.0.0.1:8091 server server 127.0.0.1:8091
backend gcw_nodes
mode http
server server 127.0.0.1:8092
backend n8n
mode http
server server 127.0.0.1:5678
backend gitea backend gitea
mode http mode http
server server 127.0.0.1:6443 server server 127.0.0.1:6443

View File

@@ -24,11 +24,19 @@
hostId = "6404140d"; hostId = "6404140d";
firewall = { firewall = {
enable = true; enable = true;
allowedTCPPorts = [ ]; allowedTCPPorts = [
8000
8080
];
}; };
networkmanager.enable = true; networkmanager.enable = true;
}; };
programs.appimage = {
enable = true;
binfmt = true; # allows *.AppImage to be run directly
};
services = { services = {
openssh.ports = [ 922 ]; openssh.ports = [ 922 ];
flatpak.enable = true; flatpak.enable = true;

View File

@@ -7,18 +7,22 @@
}; };
environment.systemPackages = [ pkgs.kdePackages.libkscreen ]; environment.systemPackages = [ pkgs.kdePackages.libkscreen ];
boot.kernelParams = [ boot = {
kernelParams = [
"drm.edid_firmware=DP-4:edid/virtual-display.bin" "drm.edid_firmware=DP-4:edid/virtual-display.bin"
"video=DP-4:e" "video=DP-4:e"
]; ];
};
hardware = { hardware.firmware = [
firmwareCompression = "none"; (pkgs.runCommandLocal "virtual-display-edid"
firmware = [ {
(pkgs.runCommandLocal "virtual-display-edid" { } '' compressFirmware = false;
}
''
mkdir -p $out/lib/firmware/edid mkdir -p $out/lib/firmware/edid
cp ${./edid/virtual-display.bin} $out/lib/firmware/edid/virtual-display.bin cp ${./edid/virtual-display.bin} $out/lib/firmware/edid/virtual-display.bin
'') ''
)
]; ];
};
} }

View File

@@ -55,6 +55,15 @@
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
"recordings" = {
path = "/home/richie/recordings";
devices = [
"bob"
"jeeves"
"phone"
];
fsWatcherEnabled = true;
};
"vault" = { "vault" = {
path = "/home/richie/vault"; path = "/home/richie/vault";
devices = [ devices = [

View File

@@ -1,6 +1,7 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "dov.kruger@gmail.com"; email = "dov.kruger@gmail.com";

View File

@@ -1,6 +1,7 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "DumbPuppy208@gmail.com"; email = "DumbPuppy208@gmail.com";

View File

@@ -1,6 +1,7 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "matthew.michal11@gmail.com"; email = "matthew.michal11@gmail.com";

View File

@@ -1,6 +1,7 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "Richie@tmmworkshop.com"; email = "Richie@tmmworkshop.com";

View File

@@ -22,9 +22,10 @@
chromium chromium
# dev tools # dev tools
claude-code claude-code
codex
gparted gparted
jetbrains.datagrip jetbrains.datagrip
proxychains
opencode opencode
proxychains
]; ];
} }

View File

@@ -78,7 +78,6 @@
"Corvidae", "Corvidae",
"drivername", "drivername",
"fastapi", "fastapi",
"Qwen",
"sandboxing", "sandboxing",
"syncthing", "syncthing",
], ],
@@ -99,5 +98,4 @@
"hediet.vscode-drawio.resizeImages": null, "hediet.vscode-drawio.resizeImages": null,
"hediet.vscode-drawio.appearance": "automatic", "hediet.vscode-drawio.appearance": "automatic",
"claudeCode.preferredLocation": "panel", "claudeCode.preferredLocation": "panel",
"docker.extension.enableComposeLanguageServer": false,
} }

44
users/steve/default.nix Normal file
View File

@@ -0,0 +1,44 @@
{
pkgs,
config,
...
}:
let
ifTheyExist = groups: builtins.filter (group: builtins.hasAttr group config.users.groups) groups;
in
{
users = {
users.steve = {
isNormalUser = true;
shell = pkgs.zsh;
group = "steve";
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJH03VzDbUhzfhvwD+OsYh6GobODYaI9jdNdzWQoqFsp matth@Jove" # cspell:disable-line
];
extraGroups = [
"audio"
"video"
"wheel"
"users"
]
++ ifTheyExist [
"dialout"
"docker"
"hass"
"libvirtd"
"networkmanager"
"plugdev"
"scanner"
"transmission"
"uaccess"
"wireshark"
];
uid = 1005;
};
groups.steve.gid = 1005;
};
home-manager.users.steve = import ./systems/${config.networking.hostName}.nix;
}

View File

@@ -0,0 +1,9 @@
{
imports = [
./direnv.nix
./git.nix
./zsh.nix
];
programs.starship.enable = true;
}

View File

@@ -0,0 +1,8 @@
{
programs.direnv = {
enable = true;
enableZshIntegration = true;
nix-direnv.enable = true;
};
}

View File

@@ -0,0 +1,15 @@
{
programs.git = {
enable = true;
signing.format = null;
settings = {
user = {
email = "matthew.michal11@gmail.com";
name = "Matthew Michal";
};
pull.rebase = true;
color.ui = true;
};
lfs.enable = true;
};
}

View File

@@ -0,0 +1,28 @@
{
programs.zsh = {
enable = true;
syntaxHighlighting.enable = true;
history.size = 10000;
oh-my-zsh = {
enable = true;
plugins = [
"git"
"docker"
"docker-compose"
"colored-man-pages"
"rust"
"systemd"
"tmux"
"ufw"
"z"
];
};
shellAliases = {
"lrt" = "eza --icons -lsnew";
"ls" = "eza";
"ll" = "eza --long --group";
"la" = "eza --all";
};
};
}

View File

@@ -0,0 +1,22 @@
{ config, ... }:
{
imports = [
./cli
./programs.nix
./ssh_config.nix
];
programs = {
home-manager.enable = true;
git.enable = true;
};
home = {
username = "steve";
homeDirectory = "/home/${config.home.username}";
stateVersion = "24.05";
sessionVariables = {
FLAKE = "$HOME/dotfiles";
};
};
}

View File

@@ -0,0 +1,57 @@
{ pkgs, ... }:
{
home.packages = with pkgs; [
# cli
bat
btop
eza
fd
ffmpegthumbnailer
fzf
git
gnupg
imagemagick
jq
ncdu
ouch
p7zip
poppler
rar
ripgrep
starship
tmux
unzip
yazi
zoxide
# system info
hwloc
lynis
pciutils
smartmontools
usbutils
# networking
iperf3
nmap
wget
# python
poetry
ruff
uv
# nodejs
nodejs
# Rust packages
trunk
wasm-pack
cargo-watch
cargo-generate
cargo-audit
cargo-update
# nix
nix-init
nix-output-monitor
nix-prefetch
nix-tree
nixfmt
treefmt
];
}

View File

@@ -0,0 +1,6 @@
{
programs.ssh = {
enable = true;
enableDefaultConfig = false;
};
}

View File

@@ -0,0 +1,5 @@
{
imports = [
../home/global.nix
];
}

View File

@@ -0,0 +1,5 @@
{
imports = [
../home/global.nix
];
}