moved prompt_bench

2026-04-14 18:18:31 -04:00
parent 2abd61d3b1
commit b8d64a5b19
39 changed files with 139 additions and 50 deletions
--- a/pipelines/prompt_bench/Dockerfile.finetune
+++ b/pipelines/prompt_bench/Dockerfile.finetune
@@ -22,4 +22,4 @@ COPY config/prompts/summarization_prompts.toml config/prompts/summarization_prom
 COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
 COPY python/__init__.py python/__init__.py
-ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
+ENTRYPOINT ["python", "-m", "pipelines.prompt_bench.finetune"]
--- a/pipelines/prompt_bench/__init__.py
+++ b/pipelines/prompt_bench/__init__.py
--- a/pipelines/prompt_bench/batch_bill_summarizer.py
+++ b/pipelines/prompt_bench/batch_bill_summarizer.py
@@ -23,9 +23,14 @@ import httpx
 import typer
 from tiktoken import Encoding, get_encoding
-from python.prompt_bench.bill_token_compression import compress_bill_text
+from pipelines.prompt_bench.bill_token_compression import compress_bill_text
-_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
+_PROMPTS_PATH = (
    Path(__file__).resolve().parents[2]
    / "config"
    / "prompts"
    / "summarization_prompts.toml"
 )
 _PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
 SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
 SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
@@ -72,7 +77,12 @@ def build_request(custom_id: str, model: str, bill_text: str) -> dict:
            "model": model,
            "messages": [
                {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
-                {"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
+                {
                    "role": "user",
                    "content": SUMMARIZATION_USER_TEMPLATE.format(
                        text_content=bill_text
                    ),
                },
            ],
        },
    }
@@ -123,7 +133,9 @@ def prepare_requests(
                "compressed_chars": len(compressed_text),
                "raw_tokens": raw_token_count,
                "compressed_tokens": compressed_token_count,
-                "token_ratio": (compressed_token_count / raw_token_count) if raw_token_count else None,
+                "token_ratio": (compressed_token_count / raw_token_count)
                if raw_token_count
                else None,
            },
        )
        safe_id = safe_filename(bill_id)
@@ -136,7 +148,14 @@ def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(
            handle,
-            fieldnames=["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"],
+            fieldnames=[
                "bill_id",
                "raw_chars",
                "compressed_chars",
                "raw_tokens",
                "compressed_tokens",
                "token_ratio",
            ],
        )
        writer.writeheader()
        writer.writerows(token_rows)
@@ -161,8 +180,12 @@ def create_batch(client: httpx.Client, input_file_id: str, description: str) ->
 def main(
-    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
+    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
-    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
+        "bills.csv"
    ),
    output_dir: Annotated[
        Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")
    ] = Path(
        "output/openai_batch",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
@@ -170,7 +193,9 @@ def main(
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Submit an OpenAI Batch job of compressed bill summaries."""
-    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
@@ -191,7 +216,9 @@ def main(
    request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)
    token_csv_path = output_dir / "token_counts.csv"
-    raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
+    raw_tokens_total, compressed_tokens_total = write_token_csv(
        token_csv_path, token_rows
    )
    logger.info(
        "Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
        raw_tokens_total,
@@ -211,7 +238,11 @@ def main(
        logger.info("Uploaded: %s", file_id)
        logger.info("Creating batch")
-        batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
+        batch = create_batch(
            client,
            file_id,
            f"compressed bill summaries x{len(request_lines)} ({model})",
        )
        logger.info("Batch created: %s", batch["id"])
    metadata = {
--- a/pipelines/prompt_bench/bill_token_compression.py
+++ b/pipelines/prompt_bench/bill_token_compression.py
--- a/pipelines/prompt_bench/compresion_test.py
+++ b/pipelines/prompt_bench/compresion_test.py
@@ -24,9 +24,14 @@ from typing import Annotated
 import httpx
 import typer
-from python.prompt_bench.bill_token_compression import compress_bill_text
+from pipelines.prompt_bench.bill_token_compression import compress_bill_text
-_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
+_PROMPTS_PATH = (
    Path(__file__).resolve().parents[2]
    / "config"
    / "prompts"
    / "summarization_prompts.toml"
 )
 _PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
 SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
 SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
@@ -62,7 +67,10 @@ def build_messages(bill_text: str) -> list[dict]:
    """Return the system + user message pair for a bill."""
    return [
        {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
-        {"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
+        {
            "role": "user",
            "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text),
        },
    ]
@@ -132,17 +140,25 @@ def run_one_request(
 def main(
-    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
+    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
-    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
+        "bills.csv"
    ),
    output_dir: Annotated[
        Path, typer.Option("--output-dir", help="Where to write per-request JSON")
    ] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
-    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
+    concurrency: Annotated[
        int, typer.Option(help="Concurrent in-flight requests")
    ] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text."""
-    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
@@ -165,8 +181,17 @@ def main(
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
-        tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
+        tasks.append(
-        tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))
+            (
                bill_id,
                "compressed",
                compress_bill_text(text_content),
                compressed_dir / filename,
            )
        )
        tasks.append(
            (bill_id, "uncompressed", text_content, uncompressed_dir / filename)
        )
    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)
--- a/pipelines/prompt_bench/containers/__init__.py
+++ b/pipelines/prompt_bench/containers/__init__.py
--- a/pipelines/prompt_bench/containers/finetune.py
+++ b/pipelines/prompt_bench/containers/finetune.py
@@ -9,7 +9,7 @@ from typing import Annotated
 import typer
-from python.prompt_bench.containers.lib import check_gpu_free
+from pipelines.prompt_bench.containers.lib import check_gpu_free
 logger = logging.getLogger(__name__)
@@ -95,7 +95,9 @@ def stop_finetune() -> None:
    """Stop and remove the fine-tuning container."""
    logger.info("Stopping fine-tuning container")
    subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
-    subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
+    subprocess.run(
        ["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False
    )
 def logs_finetune() -> str | None:
@@ -125,14 +127,20 @@ def run(
    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
        "/home/richie/dotfiles/data/finetune_dataset.jsonl"
    ),
-    output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
+    output_dir: Annotated[
        Path, typer.Option(help="Where to save the trained model")
    ] = Path(
        "/home/richie/dotfiles/data/output/qwen-bill-summarizer",
    ),
-    hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
+    hf_cache: Annotated[
        Path, typer.Option(help="Host path to HuggingFace model cache")
    ] = DEFAULT_HF_CACHE,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run fine-tuning inside a Docker container."""
-    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    check_gpu_free()
    start_finetune(
        dataset_path=dataset,
@@ -140,6 +148,7 @@ def run(
        hf_cache=hf_cache,
    )
@app.command()
 def stop() -> None:
    """Stop and remove the fine-tuning container."""
--- a/pipelines/prompt_bench/containers/lib.py
+++ b/pipelines/prompt_bench/containers/lib.py
--- a/pipelines/prompt_bench/containers/vllm.py
+++ b/pipelines/prompt_bench/containers/vllm.py
--- a/pipelines/prompt_bench/downloader.py
+++ b/pipelines/prompt_bench/downloader.py
@@ -9,7 +9,7 @@ from typing import Annotated
 import typer
 from huggingface_hub import snapshot_download
-from python.prompt_bench.models import BenchmarkConfig
+from pipelines.prompt_bench.models import BenchmarkConfig
 logger = logging.getLogger(__name__)
@@ -52,11 +52,15 @@ def download_all(config: BenchmarkConfig) -> None:
 def main(
-    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
+    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
        "bench.toml"
    ),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Download all models listed in the benchmark config."""
-    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    if not config.is_file():
        message = f"Config file does not exist: {config}"
--- a/pipelines/prompt_bench/finetune.py
+++ b/pipelines/prompt_bench/finetune.py
@@ -5,7 +5,7 @@ applies QLoRA with 4-bit quantization, and saves the merged model
 in HuggingFace format. Designed for a single RTX 3090 (24GB).
 Usage:
-    python -m python.prompt_bench.finetune \
+    python -m pipelines.prompt_bench.finetune \
        --dataset output/finetune_dataset.jsonl \
        --output-dir output/qwen-bill-summarizer
 """
@@ -107,21 +107,31 @@ def load_dataset_from_jsonl(path: Path) -> Dataset:
 def main(
-    dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
+    dataset_path: Annotated[
        Path, typer.Option("--dataset", help="Fine-tuning JSONL")
    ] = Path(
        "output/finetune_dataset.jsonl",
    ),
-    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
+    validation_split: Annotated[
-    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
+        float, typer.Option("--val-split", help="Fraction held out for validation")
    ] = 0.1,
    output_dir: Annotated[
        Path, typer.Option("--output-dir", help="Where to save the merged model")
    ] = Path(
        "output/qwen-bill-summarizer",
    ),
    config_path: Annotated[
        Path,
        typer.Option("--config", help="TOML config file"),
    ] = Path(__file__).parent / "config.toml",
-    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
+    save_gguf: Annotated[
        bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")
    ] = False,
 ) -> None:
    """Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA."""
-    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
@@ -137,7 +147,9 @@ def main(
        dtype=None,
    )
-    logger.info("Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha)
+    logger.info(
        "Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=config.lora.rank,
@@ -153,7 +165,9 @@ def main(
    split = full_dataset.train_test_split(test_size=validation_split, seed=42)
    train_dataset = split["train"]
    validation_dataset = split["test"]
-    logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
+    logger.info(
        "Split: %d train, %d validation", len(train_dataset), len(validation_dataset)
    )
    training_args = TrainingArguments(
        output_dir=str(output_dir / "checkpoints"),
        num_train_epochs=config.training.epochs,
--- a/pipelines/prompt_bench/main.py
+++ b/pipelines/prompt_bench/main.py
@@ -11,11 +11,11 @@ from typing import Annotated
 import typer
-from python.prompt_bench.containers.lib import check_gpu_free
+from pipelines.prompt_bench.containers.lib import check_gpu_free
-from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
+from pipelines.prompt_bench.containers.vllm import start_vllm, stop_vllm
-from python.prompt_bench.downloader import is_model_present
+from pipelines.prompt_bench.downloader import is_model_present
-from python.prompt_bench.models import BenchmarkConfig
+from pipelines.prompt_bench.models import BenchmarkConfig
-from python.prompt_bench.vllm_client import VLLMClient
+from pipelines.prompt_bench.vllm_client import VLLMClient
 logger = logging.getLogger(__name__)
@@ -72,7 +72,9 @@ def benchmark_model(
    vLLM batches concurrent requests internally, so submitting many at once is
    significantly faster than running them serially.
    """
-    pending = [prompt for prompt in prompts if not (model_output / prompt.name).exists()]
+    pending = [
        prompt for prompt in prompts if not (model_output / prompt.name).exists()
    ]
    skipped = len(prompts) - len(pending)
    if skipped:
        logger.info("Skipping %d prompts with existing output for %s", skipped, repo)
@@ -185,13 +187,21 @@ def run_benchmark(
 def main(
-    input_dir: Annotated[Path, typer.Argument(help="Directory containing input .txt prompt files")],
+    input_dir: Annotated[
-    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
+        Path, typer.Argument(help="Directory containing input .txt prompt files")
-    output_dir: Annotated[Path, typer.Option(help="Output directory for results")] = Path("output"),
+    ],
    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
        "bench.toml"
    ),
    output_dir: Annotated[
        Path, typer.Option(help="Output directory for results")
    ] = Path("output"),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run prompts through multiple LLMs via vLLM and save results."""
-    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    logging.basicConfig(
        level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
    )
    if not input_dir.is_dir():
        message = f"Input directory does not exist: {input_dir}"
--- a/pipelines/prompt_bench/models.py
+++ b/pipelines/prompt_bench/models.py
--- a/pipelines/prompt_bench/summarization_prompts.py
+++ b/pipelines/prompt_bench/summarization_prompts.py
--- a/pipelines/prompt_bench/tools/build_finetune_dataset.py
+++ b/pipelines/prompt_bench/tools/build_finetune_dataset.py
--- a/pipelines/prompt_bench/tools/count_tokens.py
+++ b/pipelines/prompt_bench/tools/count_tokens.py
--- a/pipelines/prompt_bench/vllm_client.py
+++ b/pipelines/prompt_bench/vllm_client.py
--- a/prompt_bench/__pycache__/__init__.cpython-314.pyc
+++ b/prompt_bench/__pycache__/__init__.cpython-314.pyc
--- a/prompt_bench/__pycache__/batch_bill_summarizer.cpython-314.pyc
+++ b/prompt_bench/__pycache__/batch_bill_summarizer.cpython-314.pyc
--- a/prompt_bench/__pycache__/batch_compresion_test.cpython-314.pyc
+++ b/prompt_bench/__pycache__/batch_compresion_test.cpython-314.pyc
--- a/prompt_bench/__pycache__/batch_openai.cpython-314.pyc
+++ b/prompt_bench/__pycache__/batch_openai.cpython-314.pyc
--- a/prompt_bench/__pycache__/bill_token_compression.cpython-314.pyc
+++ b/prompt_bench/__pycache__/bill_token_compression.cpython-314.pyc
--- a/prompt_bench/__pycache__/build_finetune_dataset.cpython-314.pyc
+++ b/prompt_bench/__pycache__/build_finetune_dataset.cpython-314.pyc
--- a/prompt_bench/__pycache__/compression.cpython-314.pyc
+++ b/prompt_bench/__pycache__/compression.cpython-314.pyc
--- a/prompt_bench/__pycache__/container.cpython-314.pyc
+++ b/prompt_bench/__pycache__/container.cpython-314.pyc
--- a/prompt_bench/__pycache__/downloader.cpython-314.pyc
+++ b/prompt_bench/__pycache__/downloader.cpython-314.pyc
--- a/prompt_bench/__pycache__/finetune_container.cpython-314.pyc
+++ b/prompt_bench/__pycache__/finetune_container.cpython-314.pyc
--- a/prompt_bench/__pycache__/main.cpython-314.pyc
+++ b/prompt_bench/__pycache__/main.cpython-314.pyc
--- a/prompt_bench/__pycache__/models.cpython-314.pyc
+++ b/prompt_bench/__pycache__/models.cpython-314.pyc
--- a/prompt_bench/__pycache__/summarization_prompts.cpython-314.pyc
+++ b/prompt_bench/__pycache__/summarization_prompts.cpython-314.pyc
--- a/prompt_bench/__pycache__/vllm_client.cpython-314.pyc
+++ b/prompt_bench/__pycache__/vllm_client.cpython-314.pyc
--- a/prompt_bench/__pycache__/vllm_container.cpython-314.pyc
+++ b/prompt_bench/__pycache__/vllm_container.cpython-314.pyc
--- a/prompt_bench/containers/__pycache__/__init__.cpython-314.pyc
+++ b/prompt_bench/containers/__pycache__/__init__.cpython-314.pyc
--- a/prompt_bench/containers/__pycache__/finetune.cpython-314.pyc
+++ b/prompt_bench/containers/__pycache__/finetune.cpython-314.pyc
--- a/prompt_bench/containers/__pycache__/lib.cpython-314.pyc
+++ b/prompt_bench/containers/__pycache__/lib.cpython-314.pyc
--- a/prompt_bench/input/1.txt
+++ b/prompt_bench/input/1.txt
@@ -1 +0,0 @@
-how many oceans are there in the world
--- a/prompt_bench/input/2.txt
+++ b/prompt_bench/input/2.txt
@@ -1 +0,0 @@
-whos the president of the united states
--- a/prompt_bench/input/3.txt
+++ b/prompt_bench/input/3.txt
@@ -1 +0,0 @@
-whats the greatest country in the world
--- a/prompt_bench/input/4.txt
+++ b/prompt_bench/input/4.txt
@@ -1 +0,0 @@
-was/is the usa the greatest country in the world