From 721526022ba0598295138baa3c7bd0ee015c9d24 Mon Sep 17 00:00:00 2001
From: Richie Cahill
Date: Fri, 10 Apr 2026 12:56:57 -0400
Subject: [PATCH] created working finetuning pipeline

---
 python/prompt_bench/Dockerfile.finetune   |  25 +++
 python/prompt_bench/finetune.py           | 190 ++++++++++++++++++++
 python/prompt_bench/finetune_container.py | 210 ++++++++++++++++++++++
 python/prompt_bench/train.sh              |  45 +++++
 4 files changed, 470 insertions(+)
 create mode 100644 python/prompt_bench/Dockerfile.finetune
 create mode 100644 python/prompt_bench/finetune.py
 create mode 100644 python/prompt_bench/finetune_container.py
 create mode 100644 python/prompt_bench/train.sh

diff --git a/python/prompt_bench/Dockerfile.finetune b/python/prompt_bench/Dockerfile.finetune
new file mode 100644
index 0000000..bed68dc
--- /dev/null
+++ b/python/prompt_bench/Dockerfile.finetune
@@ -0,0 +1,25 @@
+# Unsloth fine-tuning container for Qwen3 4B on RTX 3090.
+#
+# Build:
+#   docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
+#
+# Run:
+#   docker run --rm --device=nvidia.com/gpu=all --ipc=host \
+#     -v $(pwd)/output:/workspace/output \
+#     -v $(pwd)/output/finetune_dataset.jsonl:/workspace/dataset.jsonl:ro \
+#     -v /zfs/models/hf:/root/.cache/huggingface \
+#     bill-finetune \
+#     --dataset /workspace/dataset.jsonl \
+#     --output-dir /workspace/output/qwen-bill-summarizer
+
+FROM ghcr.io/unslothai/unsloth:latest
+
+RUN pip install --no-cache-dir typer
+
+WORKDIR /workspace
+COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
+COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
+COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
+COPY python/__init__.py python/__init__.py
+
+ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
diff --git a/python/prompt_bench/finetune.py b/python/prompt_bench/finetune.py
new file mode 100644
index 0000000..30ae3d3
--- /dev/null
+++ b/python/prompt_bench/finetune.py
@@ -0,0 +1,190 @@
+"""Fine-tune Qwen3 4B on bill summarization data using Unsloth.
+
+Loads a ChatML-style JSONL dataset (system/user/assistant messages),
+applies QLoRA with 4-bit quantization, and saves the merged model
+in HuggingFace format. Designed for a single RTX 3090 (24GB).
+
+Usage:
+    python -m python.prompt_bench.finetune \
+        --dataset output/finetune_dataset.jsonl \
+        --output-dir output/qwen-bill-summarizer
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Annotated
+
+import typer
+from unsloth import FastLanguageModel  # NOTE(review): keep before transformers/trl -- unsloth conventionally patches them on import; confirm
+from datasets import Dataset
+from transformers import TrainingArguments
+from trl import SFTTrainer
+
+logger = logging.getLogger(__name__)
+
+BASE_MODEL = "unsloth/Qwen3-4B-Base-unsloth-bnb-4bit"  # pre-quantized (bnb 4-bit) Qwen3 4B base
+
+# LoRA hyperparameters
+LORA_RANK = 32
+LORA_ALPHA = 32  # alpha == default rank, i.e. LoRA scaling alpha/r = 1
+LORA_DROPOUT = 0.0
+LORA_TARGETS = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+
+# Training hyperparameters tuned for ~2k examples on a 3090
+LEARNING_RATE = 2e-4
+EPOCHS = 3
+BATCH_SIZE = 2
+GRADIENT_ACCUMULATION = 8  # effective batch = 16
+MAX_SEQ_LENGTH = 4096
+WARMUP_RATIO = 0.05
+WEIGHT_DECAY = 0.01
+LOGGING_STEPS = 10
+SAVE_STEPS = 100  # also reused as eval_steps below so eval aligns with checkpoints
+
+
+def _messages_to_chatml(messages: list[dict]) -> str:
+    r"""Convert a message list to Qwen ChatML format.
+
+    Produces:
+        <|im_start|>system\n...\n<|im_end|>
+        <|im_start|>user\n...\n<|im_end|>
+        <|im_start|>assistant\n...\n<|im_end|>
+    """
+    parts = []
+    for message in messages:
+        role = message["role"]
+        content = message["content"]
+        parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
+    return "\n".join(parts)
+
+
+def load_dataset_from_jsonl(path: Path) -> Dataset:
+    """Load a ChatML JSONL file into a HuggingFace Dataset.
+
+    Each line must have {"messages": [{"role": ..., "content": ...}, ...]}.
+    Pre-formats into a `text` column with the Qwen ChatML template applied,
+    which SFTTrainer consumes directly.
+    """
+    records = []
+    with path.open(encoding="utf-8") as handle:
+        for raw_line in handle:
+            stripped = raw_line.strip()
+            if stripped:  # skip blank lines
+                entry = json.loads(stripped)
+                records.append({"text": _messages_to_chatml(entry["messages"])})
+    logger.info("Loaded %d examples from %s", len(records), path)
+    return Dataset.from_list(records)
+
+
+def main(
+    dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
+        "output/finetune_dataset.jsonl",
+    ),
+    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
+    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
+        "output/qwen-bill-summarizer",
+    ),
+    base_model: Annotated[str, typer.Option("--base-model", help="Unsloth model ID")] = BASE_MODEL,
+    epochs: Annotated[int, typer.Option("--epochs", help="Training epochs")] = EPOCHS,
+    batch_size: Annotated[int, typer.Option("--batch-size", help="Per-device batch size")] = BATCH_SIZE,
+    learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = LEARNING_RATE,
+    lora_rank: Annotated[int, typer.Option("--lora-rank", help="LoRA rank")] = LORA_RANK,
+    max_seq_length: Annotated[int, typer.Option("--max-seq-length", help="Max sequence length")] = MAX_SEQ_LENGTH,
+    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
+) -> None:
+    """Fine-tune Qwen3 4B on bill summarization with Unsloth + QLoRA."""
+    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+
+    if not dataset_path.is_file():
+        message = f"Dataset not found: {dataset_path}"
+        raise typer.BadParameter(message)
+
+    logger.info("Loading base model: %s", base_model)
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=base_model,
+        max_seq_length=max_seq_length,
+        load_in_4bit=True,  # QLoRA: adapters train against a 4-bit quantized base
+        dtype=None,  # auto-detect compute dtype -- presumably bf16 on Ampere; confirm
+    )
+
+    logger.info("Applying LoRA (rank=%d, alpha=%d)", lora_rank, LORA_ALPHA)
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=lora_rank,
+        lora_alpha=LORA_ALPHA,
+        lora_dropout=LORA_DROPOUT,
+        target_modules=LORA_TARGETS,
+        bias="none",
+        use_gradient_checkpointing="unsloth",
+        random_state=42,
+    )
+
+    full_dataset = load_dataset_from_jsonl(dataset_path)
+    split = full_dataset.train_test_split(test_size=validation_split, seed=42)
+    train_dataset = split["train"]
+    validation_dataset = split["test"]
+    logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
+    training_args = TrainingArguments(
+        output_dir=str(output_dir / "checkpoints"),
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
+        learning_rate=learning_rate,
+        warmup_ratio=WARMUP_RATIO,
+        weight_decay=WEIGHT_DECAY,
+        lr_scheduler_type="cosine",
+        logging_steps=LOGGING_STEPS,
+        save_steps=SAVE_STEPS,
+        save_total_limit=3,
+        eval_strategy="steps",
+        eval_steps=SAVE_STEPS,  # eval at every save so load_best_model_at_end can compare checkpoints
+        load_best_model_at_end=True,
+        bf16=True,
+        optim="adamw_8bit",  # 8-bit optimizer states to fit a 24GB card
+        seed=42,
+        report_to="none",
+    )
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=train_dataset,
+        eval_dataset=validation_dataset,
+        args=training_args,
+        max_seq_length=max_seq_length,  # NOTE(review): tokenizer=/max_seq_length= kwargs assume Unsloth's patched TRL; newer vanilla TRL moved these to SFTConfig -- confirm image version
+        packing=True,
+    )
+
+    logger.info("Starting training: %d train, %d val, %d epochs", len(train_dataset), len(validation_dataset), epochs)
+    trainer.train()
+
+    merged_path = str(output_dir / "merged")
+    logger.info("Saving merged model to %s", merged_path)
+    model.save_pretrained_merged(merged_path, tokenizer, save_method="merged_16bit")
+
+    if save_gguf:
+        gguf_path = str(output_dir / "gguf")
+        logger.info("Saving GGUF to %s", gguf_path)
+        model.save_pretrained_gguf(gguf_path, tokenizer, quantization_method="q4_k_m")
+
+    logger.info("Done! Model saved to %s", output_dir)
+
+
+def cli() -> None:
+    """Typer entry point."""
+    typer.run(main)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/python/prompt_bench/finetune_container.py b/python/prompt_bench/finetune_container.py
new file mode 100644
index 0000000..5b8cb8e
--- /dev/null
+++ b/python/prompt_bench/finetune_container.py
@@ -0,0 +1,210 @@
+"""Docker container lifecycle management for Unsloth fine-tuning."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from python.prompt_bench.container import check_gpu_free
+
+logger = logging.getLogger(__name__)
+
+CONTAINER_NAME = "bill-finetune"
+FINETUNE_IMAGE = "bill-finetune:latest"
+DOCKERFILE_PATH = "python/prompt_bench/Dockerfile.finetune"
+DEFAULT_HF_CACHE = Path("/zfs/models/hf")
+
+
+def build_image() -> None:
+    """Build the fine-tuning Docker image."""
+    logger.info("Building fine-tuning image: %s", FINETUNE_IMAGE)
+    result = subprocess.run(
+        ["docker", "build", "-f", DOCKERFILE_PATH, "-t", FINETUNE_IMAGE, "."],
+        text=True,
+        check=False,  # inspect returncode ourselves for a cleaner error
+    )
+    if result.returncode != 0:
+        message = "Failed to build fine-tuning image"
+        raise RuntimeError(message)
+    logger.info("Image built: %s", FINETUNE_IMAGE)
+
+
+def start_finetune(
+    *,
+    dataset_path: Path,
+    output_dir: Path,
+    hf_cache: Path = DEFAULT_HF_CACHE,
+    validation_split: float = 0.1,
+    epochs: int = 3,
+    batch_size: int = 2,
+    learning_rate: float = 2e-4,
+    lora_rank: int = 32,
+    max_seq_length: int = 4096,
+    save_gguf: bool = False,
+) -> None:
+    """Run the fine-tuning container.
+
+    Args:
+        dataset_path: Host path to the fine-tuning JSONL dataset.
+        output_dir: Host path where the trained model will be saved.
+        hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading).
+        validation_split: Fraction of data held out for validation.
+        epochs: Number of training epochs.
+        batch_size: Per-device training batch size.
+        learning_rate: Learning rate for the optimizer.
+        lora_rank: LoRA adapter rank.
+        max_seq_length: Maximum sequence length for training.
+        save_gguf: Whether to also export a GGUF quantized model.
+    """
+    dataset_path = dataset_path.resolve()
+    output_dir = output_dir.resolve()
+
+    if not dataset_path.is_file():
+        message = f"Dataset not found: {dataset_path}"
+        raise FileNotFoundError(message)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    stop_finetune()  # remove any stale container holding the name
+
+    hf_cache = hf_cache.resolve()
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    command = [
+        "docker",
+        "run",
+        "--name",
+        CONTAINER_NAME,
+        "--device=nvidia.com/gpu=all",
+        "--ipc=host",
+        "-v",
+        f"{hf_cache}:/root/.cache/huggingface",  # reuse downloaded base weights across runs
+        "-v",
+        f"{output_dir}:/workspace/output/qwen-bill-summarizer",
+        "-v",
+        f"{dataset_path}:/workspace/dataset.jsonl:ro",
+        FINETUNE_IMAGE,
+        "--dataset",
+        "/workspace/dataset.jsonl",
+        "--output-dir",
+        "/workspace/output/qwen-bill-summarizer",
+        "--val-split",
+        str(validation_split),
+        "--epochs",
+        str(epochs),
+        "--batch-size",
+        str(batch_size),
+        "--lr",
+        str(learning_rate),
+        "--lora-rank",
+        str(lora_rank),
+        "--max-seq-length",
+        str(max_seq_length),
+    ]
+
+    if save_gguf:
+        command.append("--save-gguf")
+
+    logger.info("Starting fine-tuning container")
+    logger.info("  Dataset: %s", dataset_path)
+    logger.info("  Val split: %.0f%%", validation_split * 100)
+    logger.info("  Output: %s", output_dir)
+    logger.info("  Epochs: %d", epochs)
+    logger.info("  Batch size: %d", batch_size)
+    logger.info("  LoRA rank: %d", lora_rank)
+
+    result = subprocess.run(command, text=True, check=False)  # blocks until training finishes
+    if result.returncode != 0:
+        message = f"Fine-tuning container exited with code {result.returncode}"
+        raise RuntimeError(message)
+    logger.info("Fine-tuning complete. Model saved to %s", output_dir)
+
+
+def stop_finetune() -> None:
+    """Stop and remove the fine-tuning container."""
+    logger.info("Stopping fine-tuning container")
+    subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
+    subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
+
+
+def logs_finetune() -> str | None:
+    """Return recent logs from the fine-tuning container, or None if not running."""
+    result = subprocess.run(
+        ["docker", "logs", "--tail", "50", CONTAINER_NAME],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        return None
+    return result.stdout + result.stderr
+
+
+app = typer.Typer(help="Fine-tuning container management.")
+
+
+@app.command()
+def build() -> None:
+    """Build the fine-tuning Docker image."""
+    build_image()
+
+
+@app.command()
+def run(
+    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path("output/finetune_dataset.jsonl"),
+    output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
+        "output/qwen-bill-summarizer",
+    ),
+    hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
+    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
+    epochs: Annotated[int, typer.Option(help="Training epochs")] = 3,
+    batch_size: Annotated[int, typer.Option(help="Per-device batch size")] = 2,
+    learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = 2e-4,
+    lora_rank: Annotated[int, typer.Option(help="LoRA rank")] = 32,
+    max_seq_length: Annotated[int, typer.Option(help="Max sequence length")] = 4096,
+    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
+    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
+) -> None:
+    """Run fine-tuning inside a Docker container."""
+    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+    check_gpu_free()
+    start_finetune(
+        dataset_path=dataset,
+        output_dir=output_dir,
+        hf_cache=hf_cache,
+        validation_split=validation_split,
+        epochs=epochs,
+        batch_size=batch_size,
+        learning_rate=learning_rate,
+        lora_rank=lora_rank,
+        max_seq_length=max_seq_length,
+        save_gguf=save_gguf,
+    )
+
+
+@app.command()
+def stop() -> None:
+    """Stop and remove the fine-tuning container."""
+    stop_finetune()
+
+
+@app.command()
+def logs() -> None:
+    """Show recent logs from the fine-tuning container."""
+    output = logs_finetune()
+    if output is None:
+        typer.echo("No running fine-tuning container found.")
+        raise typer.Exit(code=1)
+    typer.echo(output)
+
+
+def cli() -> None:
+    """Typer entry point."""
+    app()
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/python/prompt_bench/train.sh b/python/prompt_bench/train.sh
new file mode 100644
index 0000000..504cb30
--- /dev/null
+++ b/python/prompt_bench/train.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Fine-tune Qwen3 4B on bill summarization data.
+#
+# Prerequisites:
+#   1. Build the dataset: python -m python.prompt_bench.build_finetune_dataset
+#   2. Build the image:   docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
+#
+# Usage:
+#   bash python/prompt_bench/train.sh [extra flags passed to finetune.py]
+#
+# Examples:
+#   bash python/prompt_bench/train.sh
+#   bash python/prompt_bench/train.sh --epochs 5 --lr 1e-4
+#   bash python/prompt_bench/train.sh --val-split 0.15 --save-gguf
+
+set -euo pipefail
+
+IMAGE="bill-finetune"
+DATASET="$(pwd)/output/finetune_dataset.jsonl"
+OUTPUT_DIR="$(pwd)/output/qwen-bill-summarizer"
+
+if [ ! -f "$DATASET" ]; then
+  echo "Error: Dataset not found at $DATASET"
+  echo "Run: python -m python.prompt_bench.build_finetune_dataset"
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+echo "Starting fine-tuning..."
+echo "  Dataset: $DATASET"
+echo "  Output: $OUTPUT_DIR"
+echo "  Extra args: $*"
+# Mount the HF hub cache (override path via HF_CACHE) so base-model weights are not re-downloaded each run; matches finetune_container.py's mount.
+docker run --rm \
+  --device=nvidia.com/gpu=all \
+  --ipc=host -v "${HF_CACHE:-/zfs/models/hf}":/root/.cache/huggingface \
+  -v "$OUTPUT_DIR":/workspace/output/qwen-bill-summarizer \
+  -v "$DATASET":/workspace/dataset.jsonl:ro \
+  "$IMAGE" \
+  --dataset /workspace/dataset.jsonl \
+  --output-dir /workspace/output/qwen-bill-summarizer \
+  "$@"
+
+echo "Done! Model saved to $OUTPUT_DIR"