added containers dir

This commit is contained in:
2026-04-10 20:48:24 -04:00
parent 4a10a80ba0
commit 259e952afc
6 changed files with 32 additions and 70 deletions

View File

@@ -34,7 +34,6 @@ DEFAULT_COUNT = 100
SEED = 42 SEED = 42
def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]: def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
"""Return up to `count` (bill_id, text_content) tuples with non-empty text.""" """Return up to `count` (bill_id, text_content) tuples with non-empty text."""
csv.field_size_limit(sys.maxsize) csv.field_size_limit(sys.maxsize)

View File

@@ -0,0 +1 @@
"""Prompt benchmarking system for evaluating LLMs via vLLM."""

View File

@@ -9,13 +9,13 @@ from typing import Annotated
import typer import typer
from python.prompt_bench.vllm_container import check_gpu_free from python.prompt_bench.containers.lib import check_gpu_free
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
CONTAINER_NAME = "bill-finetune" CONTAINER_NAME = "bill-finetune"
FINETUNE_IMAGE = "bill-finetune:latest" FINETUNE_IMAGE = "bill-finetune:latest"
DOCKERFILE_PATH = "python/prompt_bench/Dockerfile.finetune" DOCKERFILE_PATH = "/home/richie/dotfiles/python/prompt_bench/Dockerfile.finetune"
DEFAULT_HF_CACHE = Path("/zfs/models/hf") DEFAULT_HF_CACHE = Path("/zfs/models/hf")
@@ -38,13 +38,6 @@ def start_finetune(
dataset_path: Path, dataset_path: Path,
output_dir: Path, output_dir: Path,
hf_cache: Path = DEFAULT_HF_CACHE, hf_cache: Path = DEFAULT_HF_CACHE,
validation_split: float = 0.1,
epochs: int = 3,
batch_size: int = 2,
learning_rate: float = 2e-4,
lora_rank: int = 32,
max_seq_length: int = 4096,
save_gguf: bool = False,
) -> None: ) -> None:
"""Run the fine-tuning container. """Run the fine-tuning container.
@@ -53,12 +46,6 @@ def start_finetune(
output_dir: Host path where the trained model will be saved. output_dir: Host path where the trained model will be saved.
hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading). hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading).
validation_split: Fraction of data held out for validation. validation_split: Fraction of data held out for validation.
epochs: Number of training epochs.
batch_size: Per-device training batch size.
learning_rate: Learning rate for the optimizer.
lora_rank: LoRA adapter rank.
max_seq_length: Maximum sequence length for training.
save_gguf: Whether to also export a GGUF quantized model.
""" """
dataset_path = dataset_path.resolve() dataset_path = dataset_path.resolve()
output_dir = output_dir.resolve() output_dir = output_dir.resolve()
@@ -91,30 +78,11 @@ def start_finetune(
"/workspace/dataset.jsonl", "/workspace/dataset.jsonl",
"--output-dir", "--output-dir",
"/workspace/output/qwen-bill-summarizer", "/workspace/output/qwen-bill-summarizer",
"--val-split",
str(validation_split),
"--epochs",
str(epochs),
"--batch-size",
str(batch_size),
"--lr",
str(learning_rate),
"--lora-rank",
str(lora_rank),
"--max-seq-length",
str(max_seq_length),
] ]
if save_gguf:
command.append("--save-gguf")
logger.info("Starting fine-tuning container") logger.info("Starting fine-tuning container")
logger.info(" Dataset: %s", dataset_path) logger.info(" Dataset: %s", dataset_path)
logger.info(" Val split: %.0f%%", validation_split * 100)
logger.info(" Output: %s", output_dir) logger.info(" Output: %s", output_dir)
logger.info(" Epochs: %d", epochs)
logger.info(" Batch size: %d", batch_size)
logger.info(" LoRA rank: %d", lora_rank)
result = subprocess.run(command, text=True, check=False) result = subprocess.run(command, text=True, check=False)
if result.returncode != 0: if result.returncode != 0:
@@ -154,18 +122,13 @@ def build() -> None:
@app.command() @app.command()
def run( def run(
dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path("output/finetune_dataset.jsonl"), dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
"/home/richie/dotfiles/data/finetune_dataset.jsonl"
),
output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path( output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
"output/qwen-bill-summarizer", "/home/richie/dotfiles/data/output/qwen-bill-summarizer",
), ),
hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE, hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
epochs: Annotated[int, typer.Option(help="Training epochs")] = 3,
batch_size: Annotated[int, typer.Option(help="Per-device batch size")] = 2,
learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = 2e-4,
lora_rank: Annotated[int, typer.Option(help="LoRA rank")] = 32,
max_seq_length: Annotated[int, typer.Option(help="Max sequence length")] = 4096,
save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO", log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
) -> None: ) -> None:
"""Run fine-tuning inside a Docker container.""" """Run fine-tuning inside a Docker container."""
@@ -175,16 +138,8 @@ def run(
dataset_path=dataset, dataset_path=dataset,
output_dir=output_dir, output_dir=output_dir,
hf_cache=hf_cache, hf_cache=hf_cache,
validation_split=validation_split,
epochs=epochs,
batch_size=batch_size,
learning_rate=learning_rate,
lora_rank=lora_rank,
max_seq_length=max_seq_length,
save_gguf=save_gguf,
) )
@app.command() @app.command()
def stop() -> None: def stop() -> None:
"""Stop and remove the fine-tuning container.""" """Stop and remove the fine-tuning container."""

View File

@@ -0,0 +1,23 @@
from __future__ import annotations

import logging
import subprocess

logger = logging.getLogger(__name__)


def check_gpu_free() -> None:
    """Warn if GPU-heavy processes (e.g. Ollama) are running.

    Best-effort check: queries `nvidia-smi` for active compute apps and logs
    warnings. Never raises, so callers can invoke it unconditionally before
    starting a GPU workload — including on hosts without an NVIDIA driver.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
            capture_output=True,
            text=True,
            check=False,
        )
    except FileNotFoundError:
        # nvidia-smi binary not present (no NVIDIA driver on this host);
        # there is nothing to check, so warn and bail out instead of crashing.
        logger.warning("nvidia-smi not found; skipping GPU process check")
        return
    if result.returncode != 0:
        # nvidia-smi exists but failed (e.g. driver/library mismatch).
        logger.warning("Could not query GPU processes: %s", result.stderr.strip())
        return
    processes = result.stdout.strip()
    if processes:
        # Non-empty CSV output means something already holds the GPU.
        logger.warning("GPU processes detected:\n%s", processes)
        logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")

View File

@@ -68,20 +68,3 @@ def stop_vllm() -> None:
check=False, check=False,
) )
logger.info("vLLM container stopped and removed") logger.info("vLLM container stopped and removed")
def check_gpu_free() -> None:
"""Warn if GPU-heavy processes (e.g. Ollama) are running."""
result = subprocess.run(
["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
capture_output=True,
text=True,
check=False,
)
if result.returncode != 0:
logger.warning("Could not query GPU processes: %s", result.stderr.strip())
return
processes = result.stdout.strip()
if processes:
logger.warning("GPU processes detected:\n%s", processes)
logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")

View File

@@ -12,7 +12,8 @@ from typing import Annotated
import typer import typer
from python.prompt_bench.vllm_container import check_gpu_free, start_vllm, stop_vllm from python.prompt_bench.containers.lib import check_gpu_free
from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
from python.prompt_bench.downloader import is_model_present from python.prompt_bench.downloader import is_model_present
from python.prompt_bench.models import BenchmarkConfig from python.prompt_bench.models import BenchmarkConfig
from python.prompt_bench.vllm_client import VLLMClient from python.prompt_bench.vllm_client import VLLMClient