From 259e952afcd801689466482b1e58ab4963596021 Mon Sep 17 00:00:00 2001
From: Richie Cahill <Richie@tmmworkshop.com>
Date: Fri, 10 Apr 2026 20:48:24 -0400
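Subject: [PATCH] Move container helpers into prompt_bench/containers

- Rename finetune_container.py -> containers/finetune.py and
  vllm_container.py -> containers/vllm.py.
- Extract check_gpu_free() into containers/lib.py and update the
  imports in finetune.py and main.py.
- Drop the fine-tune hyperparameter options (val-split, epochs,
  batch-size, lr, lora-rank, max-seq-length, save-gguf) from
  start_finetune() and the `run` command.
- Point the default Dockerfile, dataset and output paths at absolute
  locations under /home/richie/dotfiles.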

---
 python/prompt_bench/compresion_test.py        |  1 -
 python/prompt_bench/containers/__init__.py    |  1 +
 .../finetune.py}                              | 57 ++-----------------
 python/prompt_bench/containers/lib.py         | 23 ++++++++
 .../{vllm_container.py => containers/vllm.py} | 17 ------
 python/prompt_bench/main.py                   |  3 +-
 6 files changed, 32 insertions(+), 70 deletions(-)
 create mode 100644 python/prompt_bench/containers/__init__.py
 rename python/prompt_bench/{finetune_container.py => containers/finetune.py} (68%)
 create mode 100644 python/prompt_bench/containers/lib.py
 rename python/prompt_bench/{vllm_container.py => containers/vllm.py} (76%)

diff --git a/python/prompt_bench/compresion_test.py b/python/prompt_bench/compresion_test.py
index 8d1b40f..801933e 100644
--- a/python/prompt_bench/compresion_test.py
+++ b/python/prompt_bench/compresion_test.py
@@ -34,7 +34,6 @@ DEFAULT_COUNT = 100
 SEED = 42
 
 
-
 def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
     """Return up to `count` (bill_id, text_content) tuples with non-empty text."""
     csv.field_size_limit(sys.maxsize)
diff --git a/python/prompt_bench/containers/__init__.py b/python/prompt_bench/containers/__init__.py
new file mode 100644
index 0000000..dc58a44
--- /dev/null
+++ b/python/prompt_bench/containers/__init__.py
@@ -0,0 +1 @@
+"""Container helpers for prompt_bench: vLLM and fine-tuning container management plus shared GPU checks."""
diff --git a/python/prompt_bench/finetune_container.py b/python/prompt_bench/containers/finetune.py
similarity index 68%
rename from python/prompt_bench/finetune_container.py
rename to python/prompt_bench/containers/finetune.py
index 42f5444..cc20ae6 100644
--- a/python/prompt_bench/finetune_container.py
+++ b/python/prompt_bench/containers/finetune.py
@@ -9,13 +9,13 @@ from typing import Annotated
 
 import typer
 
-from python.prompt_bench.vllm_container import check_gpu_free
+from python.prompt_bench.containers.lib import check_gpu_free
 
 logger = logging.getLogger(__name__)
 
 CONTAINER_NAME = "bill-finetune"
 FINETUNE_IMAGE = "bill-finetune:latest"
-DOCKERFILE_PATH = "python/prompt_bench/Dockerfile.finetune"
+DOCKERFILE_PATH = "/home/richie/dotfiles/python/prompt_bench/Dockerfile.finetune"
 DEFAULT_HF_CACHE = Path("/zfs/models/hf")
 
 
@@ -38,13 +38,6 @@ def start_finetune(
     dataset_path: Path,
     output_dir: Path,
     hf_cache: Path = DEFAULT_HF_CACHE,
-    validation_split: float = 0.1,
-    epochs: int = 3,
-    batch_size: int = 2,
-    learning_rate: float = 2e-4,
-    lora_rank: int = 32,
-    max_seq_length: int = 4096,
-    save_gguf: bool = False,
 ) -> None:
     """Run the fine-tuning container.
 
@@ -53,12 +46,6 @@ def start_finetune(
         output_dir: Host path where the trained model will be saved.
         hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading).
         validation_split: Fraction of data held out for validation.
-        epochs: Number of training epochs.
-        batch_size: Per-device training batch size.
-        learning_rate: Learning rate for the optimizer.
-        lora_rank: LoRA adapter rank.
-        max_seq_length: Maximum sequence length for training.
-        save_gguf: Whether to also export a GGUF quantized model.
     """
     dataset_path = dataset_path.resolve()
     output_dir = output_dir.resolve()
@@ -91,30 +78,11 @@ def start_finetune(
         "/workspace/dataset.jsonl",
         "--output-dir",
         "/workspace/output/qwen-bill-summarizer",
-        "--val-split",
-        str(validation_split),
-        "--epochs",
-        str(epochs),
-        "--batch-size",
-        str(batch_size),
-        "--lr",
-        str(learning_rate),
-        "--lora-rank",
-        str(lora_rank),
-        "--max-seq-length",
-        str(max_seq_length),
     ]
 
-    if save_gguf:
-        command.append("--save-gguf")
-
     logger.info("Starting fine-tuning container")
     logger.info("  Dataset:    %s", dataset_path)
-    logger.info("  Val split:  %.0f%%", validation_split * 100)
     logger.info("  Output:     %s", output_dir)
-    logger.info("  Epochs:     %d", epochs)
-    logger.info("  Batch size: %d", batch_size)
-    logger.info("  LoRA rank:  %d", lora_rank)
 
     result = subprocess.run(command, text=True, check=False)
     if result.returncode != 0:
@@ -154,18 +122,13 @@ def build() -> None:
 
 @app.command()
 def run(
-    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path("output/finetune_dataset.jsonl"),
+    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
+        "/home/richie/dotfiles/data/finetune_dataset.jsonl"
+    ),
     output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
-        "output/qwen-bill-summarizer",
+        "/home/richie/dotfiles/data/output/qwen-bill-summarizer",
     ),
     hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
-    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
-    epochs: Annotated[int, typer.Option(help="Training epochs")] = 3,
-    batch_size: Annotated[int, typer.Option(help="Per-device batch size")] = 2,
-    learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = 2e-4,
-    lora_rank: Annotated[int, typer.Option(help="LoRA rank")] = 32,
-    max_seq_length: Annotated[int, typer.Option(help="Max sequence length")] = 4096,
-    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
     log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
     """Run fine-tuning inside a Docker container."""
@@ -175,16 +138,9 @@ def run(
         dataset_path=dataset,
         output_dir=output_dir,
         hf_cache=hf_cache,
-        validation_split=validation_split,
-        epochs=epochs,
-        batch_size=batch_size,
-        learning_rate=learning_rate,
-        lora_rank=lora_rank,
-        max_seq_length=max_seq_length,
-        save_gguf=save_gguf,
     )
 
 
 @app.command()
 def stop() -> None:
     """Stop and remove the fine-tuning container."""
diff --git a/python/prompt_bench/containers/lib.py b/python/prompt_bench/containers/lib.py
new file mode 100644
index 0000000..b1ba5c4
--- /dev/null
+++ b/python/prompt_bench/containers/lib.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+import logging
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+
+def check_gpu_free() -> None:
+    """Log a warning listing any compute processes nvidia-smi reports (e.g. Ollama); advisory only, returns None."""
+    result = subprocess.run(
+        ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
+        capture_output=True,  # stdout/stderr are inspected below instead of being echoed
+        text=True,
+        check=False,  # a non-zero exit is handled by the returncode branch, not an exception
+    )
+    if result.returncode != 0:
+        logger.warning("Could not query GPU processes: %s", result.stderr.strip())
+        return  # the query failed, so there is nothing further to report
+    processes = result.stdout.strip()  # empty output -> nothing to warn about
+    if processes:
+        logger.warning("GPU processes detected:\n%s", processes)
+        logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")
diff --git a/python/prompt_bench/vllm_container.py b/python/prompt_bench/containers/vllm.py
similarity index 76%
rename from python/prompt_bench/vllm_container.py
rename to python/prompt_bench/containers/vllm.py
index f29ce7d..33e7e31 100644
--- a/python/prompt_bench/vllm_container.py
+++ b/python/prompt_bench/containers/vllm.py
@@ -68,20 +68,3 @@ def stop_vllm() -> None:
         check=False,
     )
     logger.info("vLLM container stopped and removed")
-
-
-def check_gpu_free() -> None:
-    """Warn if GPU-heavy processes (e.g. Ollama) are running."""
-    result = subprocess.run(
-        ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
-        capture_output=True,
-        text=True,
-        check=False,
-    )
-    if result.returncode != 0:
-        logger.warning("Could not query GPU processes: %s", result.stderr.strip())
-        return
-    processes = result.stdout.strip()
-    if processes:
-        logger.warning("GPU processes detected:\n%s", processes)
-        logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")
diff --git a/python/prompt_bench/main.py b/python/prompt_bench/main.py
index cb2e7d5..50bd04e 100644
--- a/python/prompt_bench/main.py
+++ b/python/prompt_bench/main.py
@@ -12,7 +12,8 @@ from typing import Annotated
 
 import typer
 
-from python.prompt_bench.vllm_container import check_gpu_free, start_vllm, stop_vllm
+from python.prompt_bench.containers.lib import check_gpu_free
+from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
 from python.prompt_bench.downloader import is_model_present
 from python.prompt_bench.models import BenchmarkConfig
 from python.prompt_bench.vllm_client import VLLMClient