Move prompt_bench from the python package to the pipelines package

This commit is contained in:
2026-04-14 18:18:31 -04:00
parent 2abd61d3b1
commit b8d64a5b19
39 changed files with 139 additions and 50 deletions

View File

@@ -22,4 +22,4 @@ COPY config/prompts/summarization_prompts.toml config/prompts/summarization_prom
COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
COPY python/__init__.py python/__init__.py
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
ENTRYPOINT ["python", "-m", "pipelines.prompt_bench.finetune"]

View File

@@ -23,9 +23,14 @@ import httpx
import typer
from tiktoken import Encoding, get_encoding
from python.prompt_bench.bill_token_compression import compress_bill_text
from pipelines.prompt_bench.bill_token_compression import compress_bill_text
_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
_PROMPTS_PATH = (
Path(__file__).resolve().parents[2]
/ "config"
/ "prompts"
/ "summarization_prompts.toml"
)
_PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
@@ -72,7 +77,12 @@ def build_request(custom_id: str, model: str, bill_text: str) -> dict:
"model": model,
"messages": [
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
{
"role": "user",
"content": SUMMARIZATION_USER_TEMPLATE.format(
text_content=bill_text
),
},
],
},
}
@@ -123,7 +133,9 @@ def prepare_requests(
"compressed_chars": len(compressed_text),
"raw_tokens": raw_token_count,
"compressed_tokens": compressed_token_count,
"token_ratio": (compressed_token_count / raw_token_count) if raw_token_count else None,
"token_ratio": (compressed_token_count / raw_token_count)
if raw_token_count
else None,
},
)
safe_id = safe_filename(bill_id)
@@ -136,7 +148,14 @@ def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
with path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(
handle,
fieldnames=["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"],
fieldnames=[
"bill_id",
"raw_chars",
"compressed_chars",
"raw_tokens",
"compressed_tokens",
"token_ratio",
],
)
writer.writeheader()
writer.writerows(token_rows)
@@ -161,8 +180,12 @@ def create_batch(client: httpx.Client, input_file_id: str, description: str) ->
def main(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
"bills.csv"
),
output_dir: Annotated[
Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")
] = Path(
"output/openai_batch",
),
model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
@@ -170,7 +193,9 @@ def main(
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Submit an OpenAI Batch job of compressed bill summaries."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
if not api_key:
@@ -191,7 +216,9 @@ def main(
request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)
token_csv_path = output_dir / "token_counts.csv"
raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
raw_tokens_total, compressed_tokens_total = write_token_csv(
token_csv_path, token_rows
)
logger.info(
"Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
raw_tokens_total,
@@ -211,7 +238,11 @@ def main(
logger.info("Uploaded: %s", file_id)
logger.info("Creating batch")
batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
batch = create_batch(
client,
file_id,
f"compressed bill summaries x{len(request_lines)} ({model})",
)
logger.info("Batch created: %s", batch["id"])
metadata = {

View File

@@ -24,9 +24,14 @@ from typing import Annotated
import httpx
import typer
from python.prompt_bench.bill_token_compression import compress_bill_text
from pipelines.prompt_bench.bill_token_compression import compress_bill_text
_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
_PROMPTS_PATH = (
Path(__file__).resolve().parents[2]
/ "config"
/ "prompts"
/ "summarization_prompts.toml"
)
_PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
@@ -62,7 +67,10 @@ def build_messages(bill_text: str) -> list[dict]:
"""Return the system + user message pair for a bill."""
return [
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
{
"role": "user",
"content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text),
},
]
@@ -132,17 +140,25 @@ def run_one_request(
def main(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
"bills.csv"
),
output_dir: Annotated[
Path, typer.Option("--output-dir", help="Where to write per-request JSON")
] = Path(
"output/openai_runs",
),
model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
concurrency: Annotated[
int, typer.Option(help="Concurrent in-flight requests")
] = 16,
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
if not api_key:
@@ -165,8 +181,17 @@ def main(
tasks: list[tuple[str, str, str, Path]] = []
for bill_id, text_content in bills:
filename = f"{safe_filename(bill_id)}.json"
tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))
tasks.append(
(
bill_id,
"compressed",
compress_bill_text(text_content),
compressed_dir / filename,
)
)
tasks.append(
(bill_id, "uncompressed", text_content, uncompressed_dir / filename)
)
logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)

View File

@@ -9,7 +9,7 @@ from typing import Annotated
import typer
from python.prompt_bench.containers.lib import check_gpu_free
from pipelines.prompt_bench.containers.lib import check_gpu_free
logger = logging.getLogger(__name__)
@@ -95,7 +95,9 @@ def stop_finetune() -> None:
"""Stop and remove the fine-tuning container."""
logger.info("Stopping fine-tuning container")
subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
subprocess.run(
["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False
)
def logs_finetune() -> str | None:
@@ -125,14 +127,20 @@ def run(
dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
"/home/richie/dotfiles/data/finetune_dataset.jsonl"
),
output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
output_dir: Annotated[
Path, typer.Option(help="Where to save the trained model")
] = Path(
"/home/richie/dotfiles/data/output/qwen-bill-summarizer",
),
hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
hf_cache: Annotated[
Path, typer.Option(help="Host path to HuggingFace model cache")
] = DEFAULT_HF_CACHE,
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Run fine-tuning inside a Docker container."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
check_gpu_free()
start_finetune(
dataset_path=dataset,
@@ -140,6 +148,7 @@ def run(
hf_cache=hf_cache,
)
@app.command()
def stop() -> None:
"""Stop and remove the fine-tuning container."""

View File

@@ -9,7 +9,7 @@ from typing import Annotated
import typer
from huggingface_hub import snapshot_download
from python.prompt_bench.models import BenchmarkConfig
from pipelines.prompt_bench.models import BenchmarkConfig
logger = logging.getLogger(__name__)
@@ -52,11 +52,15 @@ def download_all(config: BenchmarkConfig) -> None:
def main(
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
"bench.toml"
),
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Download all models listed in the benchmark config."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
if not config.is_file():
message = f"Config file does not exist: {config}"

View File

@@ -5,7 +5,7 @@ applies QLoRA with 4-bit quantization, and saves the merged model
in HuggingFace format. Designed for a single RTX 3090 (24GB).
Usage:
python -m python.prompt_bench.finetune \
python -m pipelines.prompt_bench.finetune \
--dataset output/finetune_dataset.jsonl \
--output-dir output/qwen-bill-summarizer
"""
@@ -107,21 +107,31 @@ def load_dataset_from_jsonl(path: Path) -> Dataset:
def main(
dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
dataset_path: Annotated[
Path, typer.Option("--dataset", help="Fine-tuning JSONL")
] = Path(
"output/finetune_dataset.jsonl",
),
validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
validation_split: Annotated[
float, typer.Option("--val-split", help="Fraction held out for validation")
] = 0.1,
output_dir: Annotated[
Path, typer.Option("--output-dir", help="Where to save the merged model")
] = Path(
"output/qwen-bill-summarizer",
),
config_path: Annotated[
Path,
typer.Option("--config", help="TOML config file"),
] = Path(__file__).parent / "config.toml",
save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
save_gguf: Annotated[
bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")
] = False,
) -> None:
"""Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA."""
logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
if not dataset_path.is_file():
message = f"Dataset not found: {dataset_path}"
@@ -137,7 +147,9 @@ def main(
dtype=None,
)
logger.info("Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha)
logger.info(
"Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha
)
model = FastLanguageModel.get_peft_model(
model,
r=config.lora.rank,
@@ -153,7 +165,9 @@ def main(
split = full_dataset.train_test_split(test_size=validation_split, seed=42)
train_dataset = split["train"]
validation_dataset = split["test"]
logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
logger.info(
"Split: %d train, %d validation", len(train_dataset), len(validation_dataset)
)
training_args = TrainingArguments(
output_dir=str(output_dir / "checkpoints"),
num_train_epochs=config.training.epochs,

View File

@@ -11,11 +11,11 @@ from typing import Annotated
import typer
from python.prompt_bench.containers.lib import check_gpu_free
from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
from python.prompt_bench.downloader import is_model_present
from python.prompt_bench.models import BenchmarkConfig
from python.prompt_bench.vllm_client import VLLMClient
from pipelines.prompt_bench.containers.lib import check_gpu_free
from pipelines.prompt_bench.containers.vllm import start_vllm, stop_vllm
from pipelines.prompt_bench.downloader import is_model_present
from pipelines.prompt_bench.models import BenchmarkConfig
from pipelines.prompt_bench.vllm_client import VLLMClient
logger = logging.getLogger(__name__)
@@ -72,7 +72,9 @@ def benchmark_model(
vLLM batches concurrent requests internally, so submitting many at once is
significantly faster than running them serially.
"""
pending = [prompt for prompt in prompts if not (model_output / prompt.name).exists()]
pending = [
prompt for prompt in prompts if not (model_output / prompt.name).exists()
]
skipped = len(prompts) - len(pending)
if skipped:
logger.info("Skipping %d prompts with existing output for %s", skipped, repo)
@@ -185,13 +187,21 @@ def run_benchmark(
def main(
input_dir: Annotated[Path, typer.Argument(help="Directory containing input .txt prompt files")],
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
output_dir: Annotated[Path, typer.Option(help="Output directory for results")] = Path("output"),
input_dir: Annotated[
Path, typer.Argument(help="Directory containing input .txt prompt files")
],
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
"bench.toml"
),
output_dir: Annotated[
Path, typer.Option(help="Output directory for results")
] = Path("output"),
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None:
"""Run prompts through multiple LLMs via vLLM and save results."""
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logging.basicConfig(
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
if not input_dir.is_dir():
message = f"Input directory does not exist: {input_dir}"

View File

@@ -1 +0,0 @@
how many oceans are there in the world

View File

@@ -1 +0,0 @@
whos the president of the united states

View File

@@ -1 +0,0 @@
whats the greatest country in the world

View File

@@ -1 +0,0 @@
was/is the usa the greatest country in the world