Created working fine-tuning pipeline

2026-04-17 21:18:18 -04:00 · 2026-04-10 12:56:57 -04:00
parent 921a397b1c
commit 721526022b
4 changed files with 470 additions and 0 deletions
--- a/python/prompt_bench/Dockerfile.finetune
+++ b/python/prompt_bench/Dockerfile.finetune
@@ -0,0 +1,25 @@
 # Unsloth fine-tuning container for Qwen 3.5 4B on RTX 3090.
 #
 # The build context must be the repository root: the COPY sources below are
 # repo-relative paths.
 #
 # Build:
 #   docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
 #
 # Run:
 #   docker run --rm --device=nvidia.com/gpu=all --ipc=host \
 #     -v $(pwd)/output:/workspace/output \
 #     -v $(pwd)/output/finetune_dataset.jsonl:/workspace/dataset.jsonl:ro \
 #     -v /zfs/models/hf:/models \
 #     bill-finetune \
 #     --dataset /workspace/dataset.jsonl \
 #     --output-dir /workspace/output/qwen-bill-summarizer
 #
 # Everything after the image name is appended to the ENTRYPOINT below, i.e. it
 # is parsed as finetune.py command-line options.
 #
 # NOTE(review): finetune_container.py mounts the host HuggingFace cache at
 # /root/.cache/huggingface rather than /models - confirm which path the image
 # actually reads, otherwise the base model may be downloaded again on each run.
 FROM ghcr.io/unslothai/unsloth:latest
 # finetune.py builds its CLI with typer. Its other imports (unsloth, datasets,
 # transformers, trl) are presumably provided by the base image - TODO confirm.
 RUN pip install --no-cache-dir typer
 WORKDIR /workspace
 # Copy only the training entry point and the package markers it needs, keeping
 # the python/prompt_bench layout so that `python -m python.prompt_bench.finetune`
 # resolves relative to /workspace.
 COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
 # NOTE(review): finetune.py does not import summarization_prompts directly -
 # confirm this file is still required inside the image.
 COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
 COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
 COPY python/__init__.py python/__init__.py
 # Exec-form entry point: arguments given to `docker run <image> ...` become
 # arguments of the finetune module.
 ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
--- a/python/prompt_bench/finetune.py
+++ b/python/prompt_bench/finetune.py
@@ -0,0 +1,190 @@
 """Fine-tune Qwen 3.5 4B on bill summarization data using Unsloth.
 Loads a ChatML-style JSONL dataset (system/user/assistant messages),
 applies QLoRA with 4-bit quantization, and saves the merged model
 in HuggingFace format. Designed for a single RTX 3090 (24GB).
 Usage:
    python -m python.prompt_bench.finetune \
        --dataset output/finetune_dataset.jsonl \
        --output-dir output/qwen-bill-summarizer
 """
 from __future__ import annotations
 import json
 import logging
 from pathlib import Path
 from typing import Annotated
 import typer
 from unsloth import FastLanguageModel
 from datasets import Dataset
 from transformers import TrainingArguments
 from trl import SFTTrainer
 logger = logging.getLogger(__name__)
 # Default for the --base-model option.
 # NOTE(review): the docstrings in this module say "Qwen 3.5 4B" while this ID
 # names Qwen3-4B-Base - confirm which model is intended.
 BASE_MODEL = "unsloth/Qwen3-4B-Base-unsloth-bnb-4bit"
 # LoRA hyperparameters
 # LORA_RANK is only the default for --lora-rank. LORA_ALPHA and LORA_DROPOUT
 # have no CLI override, so alpha stays at 32 even when the rank is changed.
 LORA_RANK = 32
 LORA_ALPHA = 32
 LORA_DROPOUT = 0.0
 # Module names passed to get_peft_model as target_modules.
 LORA_TARGETS = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
 ]
 # Training hyperparameters tuned for ~2k examples on a 3090
 # LEARNING_RATE, EPOCHS, BATCH_SIZE and MAX_SEQ_LENGTH are CLI defaults; the
 # remaining values are used as-is by main().
 LEARNING_RATE = 2e-4
 EPOCHS = 3
 BATCH_SIZE = 2
 GRADIENT_ACCUMULATION = 8  # effective batch = 16 with the default BATCH_SIZE
 MAX_SEQ_LENGTH = 4096
 WARMUP_RATIO = 0.05
 WEIGHT_DECAY = 0.01
 LOGGING_STEPS = 10
 SAVE_STEPS = 100  # also passed as eval_steps, so evaluation and checkpointing share a cadence
 def _messages_to_chatml(messages: list[dict]) -> str:
    r"""Convert a message list to Qwen ChatML format.
    Produces:
        <|im_start|>system\n...\n<|im_end|>
        <|im_start|>user\n...\n<|im_end|>
        <|im_start|>assistant\n...\n<|im_end|>
    """
    parts = []
    for message in messages:
        role = message["role"]
        content = message["content"]
        parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
    return "\n".join(parts)
 def load_dataset_from_jsonl(path: Path) -> Dataset:
    """Load a ChatML JSONL file into a HuggingFace Dataset.
    Each non-blank line must be a JSON object of the form
    {"messages": [{"role": ..., "content": ...}, ...]}. Every example is
    rendered with the Qwen ChatML template into a single `text` column, the
    column main() hands to SFTTrainer. Blank lines are skipped.
    Args:
        path: JSONL file to read (decoded as UTF-8).
    Returns:
        A Dataset with one `text` entry per non-blank input line.
    Raises:
        json.JSONDecodeError: A line is not valid JSON.
        KeyError: An example lacks `messages`, or a message lacks
            `role` or `content`.
        TypeError: An example or message is not subscriptable by string key.
    """
    records = []
    with path.open(encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                entry = json.loads(stripped)
                records.append({"text": _messages_to_chatml(entry["messages"])})
            except (json.JSONDecodeError, KeyError, TypeError):
                # The exception is re-raised untouched; the log line adds the
                # one thing it lacks, namely which line of the file is bad.
                logger.error("Malformed example at %s:%d", path, line_number)
                raise
    if not records:
        # An empty dataset cannot be split or trained on; make that visible
        # here instead of leaving it to a later, less obvious failure.
        logger.warning("No examples found in %s", path)
    logger.info("Loaded %d examples from %s", len(records), path)
    return Dataset.from_list(records)
 def main(
    dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
        "output/finetune_dataset.jsonl",
    ),
    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
        "output/qwen-bill-summarizer",
    ),
    base_model: Annotated[str, typer.Option("--base-model", help="Unsloth model ID")] = BASE_MODEL,
    epochs: Annotated[int, typer.Option("--epochs", help="Training epochs")] = EPOCHS,
    batch_size: Annotated[int, typer.Option("--batch-size", help="Per-device batch size")] = BATCH_SIZE,
    learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = LEARNING_RATE,
    lora_rank: Annotated[int, typer.Option("--lora-rank", help="LoRA rank")] = LORA_RANK,
    max_seq_length: Annotated[int, typer.Option("--max-seq-length", help="Max sequence length")] = MAX_SEQ_LENGTH,
    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
 ) -> None:
    """Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA."""
    # The docstring above doubles as the CLI help text, so the details live in
    # comments instead.
    #
    # Pipeline: validate inputs -> load and split the dataset -> load the 4-bit
    # base model -> attach LoRA adapters -> train with SFTTrainer -> save the
    # merged 16-bit model under <output_dir>/merged (and optionally a q4_k_m
    # GGUF under <output_dir>/gguf). Checkpoints go to <output_dir>/checkpoints.
    #
    # Raises typer.BadParameter when the dataset file is missing or --val-split
    # is not strictly between 0 and 1.
    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
        raise typer.BadParameter(message)
    # A float test_size must lie strictly inside (0, 1), and the step-based
    # evaluation configured below needs a non-empty validation set. Reject bad
    # values here rather than after the model has been loaded.
    if not 0.0 < validation_split < 1.0:
        message = f"--val-split must be between 0 and 1 (exclusive), got {validation_split}"
        raise typer.BadParameter(message)
    # Load and split the data before touching the model so that a malformed or
    # empty dataset fails fast instead of after an expensive model load.
    full_dataset = load_dataset_from_jsonl(dataset_path)
    split = full_dataset.train_test_split(test_size=validation_split, seed=42)
    train_dataset = split["train"]
    validation_dataset = split["test"]
    logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
    logger.info("Loading base model: %s", base_model)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=base_model,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )
    # Only the rank is configurable from the CLI; alpha, dropout and the target
    # modules come from the module-level constants.
    logger.info("Applying LoRA (rank=%d, alpha=%d)", lora_rank, LORA_ALPHA)
    model = FastLanguageModel.get_peft_model(
        model,
        r=lora_rank,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGETS,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    training_args = TrainingArguments(
        output_dir=str(output_dir / "checkpoints"),
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        # The Trainer default eval batch size is 8. Evaluate at the training
        # batch size so evaluation does not need more GPU memory than training.
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        learning_rate=learning_rate,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="cosine",
        logging_steps=LOGGING_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=3,
        eval_strategy="steps",
        # Evaluate on the checkpoint cadence so load_best_model_at_end has an
        # eval result for every saved checkpoint.
        eval_steps=SAVE_STEPS,
        load_best_model_at_end=True,
        bf16=True,
        optim="adamw_8bit",
        seed=42,
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        args=training_args,
        max_seq_length=max_seq_length,
        packing=True,
    )
    logger.info("Starting training: %d train, %d val, %d epochs", len(train_dataset), len(validation_dataset), epochs)
    trainer.train()
    merged_path = str(output_dir / "merged")
    logger.info("Saving merged model to %s", merged_path)
    model.save_pretrained_merged(merged_path, tokenizer, save_method="merged_16bit")
    if save_gguf:
        gguf_path = str(output_dir / "gguf")
        logger.info("Saving GGUF to %s", gguf_path)
        model.save_pretrained_gguf(gguf_path, tokenizer, quantization_method="q4_k_m")
    logger.info("Done! Model saved to %s", output_dir)
 def cli() -> None:
    """Typer entry point."""
    # typer.run turns main's annotated signature into a single-command CLI and
    # invokes it with the process arguments.
    typer.run(main)
 # Allows `python -m python.prompt_bench.finetune`, which is how the container
 # ENTRYPOINT in Dockerfile.finetune starts training.
 if __name__ == "__main__":
    cli()
--- a/python/prompt_bench/finetune_container.py
+++ b/python/prompt_bench/finetune_container.py
@@ -0,0 +1,210 @@
 """Docker container lifecycle management for Unsloth fine-tuning."""
 from __future__ import annotations
 import logging
 import subprocess
 from pathlib import Path
 from typing import Annotated
 import typer
 from python.prompt_bench.container import check_gpu_free
 logger = logging.getLogger(__name__)
 # Fixed container name: stop_finetune and logs_finetune address the container
 # by this name, and start_finetune removes any previous container using it.
 CONTAINER_NAME = "bill-finetune"
 # Tag applied by build_image and run by start_finetune.
 FINETUNE_IMAGE = "bill-finetune:latest"
 # Relative path; build_image uses "." as the build context, so both assume the
 # process is started from the repository root.
 DOCKERFILE_PATH = "python/prompt_bench/Dockerfile.finetune"
 # Default host directory bind-mounted at /root/.cache/huggingface in the container.
 DEFAULT_HF_CACHE = Path("/zfs/models/hf")
 def build_image() -> None:
    """Build the fine-tuning Docker image from ``DOCKERFILE_PATH``.
    Runs ``docker build`` with the current working directory as the build
    context and tags the result as ``FINETUNE_IMAGE``. Docker's output is not
    captured, so it goes to the inherited stdout/stderr.
    Raises:
        RuntimeError: If ``docker build`` exits with a non-zero status.
    """
    logger.info("Building fine-tuning image: %s", FINETUNE_IMAGE)
    build_command = ["docker", "build", "-f", DOCKERFILE_PATH, "-t", FINETUNE_IMAGE, "."]
    completed = subprocess.run(build_command, text=True, check=False)
    if completed.returncode == 0:
        logger.info("Image built: %s", FINETUNE_IMAGE)
        return
    message = "Failed to build fine-tuning image"
    raise RuntimeError(message)
 def start_finetune(
    *,
    dataset_path: Path,
    output_dir: Path,
    hf_cache: Path = DEFAULT_HF_CACHE,
    validation_split: float = 0.1,
    epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
    lora_rank: int = 32,
    max_seq_length: int = 4096,
    save_gguf: bool = False,
 ) -> None:
    """Launch the fine-tuning container and block until it exits.
    Any existing container named ``CONTAINER_NAME`` is stopped and removed
    first. The new container is started without ``--rm``, so it remains after
    it exits until the next start or an explicit stop.
    Args:
        dataset_path: Host path of the JSONL dataset; mounted read-only.
        output_dir: Host directory that receives the trained model; created if missing.
        hf_cache: Host HuggingFace cache directory, bind-mounted into the
            container; created if missing.
        validation_split: Fraction of the data held out for validation.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        learning_rate: Optimizer learning rate.
        lora_rank: LoRA adapter rank.
        max_seq_length: Maximum training sequence length.
        save_gguf: Also export a GGUF model when true.
    Raises:
        FileNotFoundError: If ``dataset_path`` is not an existing file.
        RuntimeError: If the container exits with a non-zero status.
    """
    # Fixed in-container locations that the host paths are mounted onto.
    container_dataset = "/workspace/dataset.jsonl"
    container_output = "/workspace/output/qwen-bill-summarizer"
    # Bind mounts need absolute host paths.
    dataset_path = dataset_path.resolve()
    output_dir = output_dir.resolve()
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
        raise FileNotFoundError(message)
    output_dir.mkdir(parents=True, exist_ok=True)
    # The container name is fixed, so clear any previous container using it.
    stop_finetune()
    hf_cache = hf_cache.resolve()
    hf_cache.mkdir(parents=True, exist_ok=True)
    bind_mounts = (
        f"{hf_cache}:/root/.cache/huggingface",
        f"{output_dir}:{container_output}",
        f"{dataset_path}:{container_dataset}:ro",
    )
    # Options forwarded to the finetune CLI inside the container, in order.
    trainer_options = {
        "--dataset": container_dataset,
        "--output-dir": container_output,
        "--val-split": validation_split,
        "--epochs": epochs,
        "--batch-size": batch_size,
        "--lr": learning_rate,
        "--lora-rank": lora_rank,
        "--max-seq-length": max_seq_length,
    }
    command = ["docker", "run", "--name", CONTAINER_NAME, "--device=nvidia.com/gpu=all", "--ipc=host"]
    for mount in bind_mounts:
        command += ["-v", mount]
    command.append(FINETUNE_IMAGE)
    for flag, value in trainer_options.items():
        command += [flag, str(value)]
    if save_gguf:
        command.append("--save-gguf")
    logger.info("Starting fine-tuning container")
    logger.info("  Dataset:    %s", dataset_path)
    logger.info("  Val split:  %.0f%%", validation_split * 100)
    logger.info("  Output:     %s", output_dir)
    logger.info("  Epochs:     %d", epochs)
    logger.info("  Batch size: %d", batch_size)
    logger.info("  LoRA rank:  %d", lora_rank)
    completed = subprocess.run(command, text=True, check=False)
    if completed.returncode != 0:
        message = f"Fine-tuning container exited with code {completed.returncode}"
        raise RuntimeError(message)
    logger.info("Fine-tuning complete. Model saved to %s", output_dir)
 def stop_finetune() -> None:
    """Stop the fine-tuning container, then force-remove it.
    Best effort: the output of both docker commands is captured and discarded
    and their exit codes are ignored, so a failing command does not raise here.
    """
    logger.info("Stopping fine-tuning container")
    for docker_args in (["stop"], ["rm", "-f"]):
        subprocess.run(["docker", *docker_args, CONTAINER_NAME], capture_output=True, check=False)
 def logs_finetune() -> str | None:
    """Return the last 50 log lines of the fine-tuning container.
    Returns:
        The captured stdout followed by the captured stderr of
        ``docker logs --tail 50``, or None when that command exits with a
        non-zero status (presumably because no container with that name exists).
    """
    completed = subprocess.run(
        ["docker", "logs", "--tail", "50", CONTAINER_NAME],
        capture_output=True,
        text=True,
        check=False,
    )
    return completed.stdout + completed.stderr if completed.returncode == 0 else None
 # Multi-command Typer application; the @app.command() functions below register
 # the build, run, stop and logs subcommands on it.
 app = typer.Typer(help="Fine-tuning container management.")
@app.command()
 def build() -> None:
    """Build the fine-tuning Docker image."""
    # Thin CLI wrapper: build_image raises RuntimeError when docker build fails.
    build_image()
@app.command()
 def run(
    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path("output/finetune_dataset.jsonl"),
    output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
        "output/qwen-bill-summarizer",
    ),
    hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
    epochs: Annotated[int, typer.Option(help="Training epochs")] = 3,
    batch_size: Annotated[int, typer.Option(help="Per-device batch size")] = 2,
    learning_rate: Annotated[float, typer.Option("--lr", help="Learning rate")] = 2e-4,
    lora_rank: Annotated[int, typer.Option(help="LoRA rank")] = 32,
    max_seq_length: Annotated[int, typer.Option(help="Max sequence length")] = 4096,
    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run fine-tuning inside a Docker container."""
    # The hyperparameter defaults above repeat the defaults declared in
    # finetune.py; keep the two in sync when changing either.
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    # Imported from python.prompt_bench.container; presumably refuses to
    # continue while the GPU is in use - TODO confirm its failure behavior.
    check_gpu_free()
    # Blocks until the container exits. Raises FileNotFoundError for a missing
    # dataset and RuntimeError for a non-zero container exit code.
    start_finetune(
        dataset_path=dataset,
        output_dir=output_dir,
        hf_cache=hf_cache,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        lora_rank=lora_rank,
        max_seq_length=max_seq_length,
        save_gguf=save_gguf,
    )
@app.command()
 def stop() -> None:
    """Stop and remove the fine-tuning container."""
    # Thin CLI wrapper: stop_finetune ignores docker's exit codes, so this does
    # not raise on a failing docker command.
    stop_finetune()
@app.command()
 def logs() -> None:
    """Show recent logs from the fine-tuning container."""
    output = logs_finetune()
    if output is None:
        typer.echo("No running fine-tuning container found.")
        raise typer.Exit(code=1)
    typer.echo(output)
 def cli() -> None:
    """Typer entry point."""
    # Dispatches to the build / run / stop / logs subcommands registered on app.
    app()
 # Allows running the module directly as a script or with `python -m`.
 if __name__ == "__main__":
    cli()
--- a/python/prompt_bench/train.sh
+++ b/python/prompt_bench/train.sh
@@ -0,0 +1,45 @@
 #!/usr/bin/env bash
 # Fine-tune Qwen 3.5 4B on bill summarization data.
 #
 # Must be run from the repository root: the dataset and output locations are
 # resolved relative to the current directory.
 #
 # Prerequisites:
 #   1. Build the dataset:  python -m python.prompt_bench.build_finetune_dataset
 #   2. Build the image:    docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
 #
 # Usage:
 #   bash python/prompt_bench/train.sh [extra flags passed to finetune.py]
 #
 # Examples:
 #   bash python/prompt_bench/train.sh
 #   bash python/prompt_bench/train.sh --epochs 5 --lr 1e-4
 #   bash python/prompt_bench/train.sh --val-split 0.15 --save-gguf
 #
 # Environment:
 #   HF_CACHE  Host HuggingFace cache directory (default: /zfs/models/hf).
 #             When it exists it is bind-mounted at /root/.cache/huggingface,
 #             the same mount finetune_container.py uses, so that the base
 #             model is not downloaded again by every --rm container.
 set -euo pipefail
 IMAGE="bill-finetune"
 DATASET="$(pwd)/output/finetune_dataset.jsonl"
 OUTPUT_DIR="$(pwd)/output/qwen-bill-summarizer"
 HF_CACHE="${HF_CACHE:-/zfs/models/hf}"
 if [ ! -f "$DATASET" ]; then
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "Error: Dataset not found at $DATASET" >&2
    echo "Run: python -m python.prompt_bench.build_finetune_dataset" >&2
    exit 1
 fi
 mkdir -p "$OUTPUT_DIR"
 # Collect the docker options in an array so the cache mount can be optional.
 DOCKER_ARGS=(
    --rm
    --device=nvidia.com/gpu=all
    --ipc=host
    -v "$OUTPUT_DIR":/workspace/output/qwen-bill-summarizer
    -v "$DATASET":/workspace/dataset.jsonl:ro
 )
 # Mount the cache only when the directory already exists; on hosts without
 # it the script behaves as before and the container downloads the model.
 if [ -d "$HF_CACHE" ]; then
    DOCKER_ARGS+=(-v "$HF_CACHE":/root/.cache/huggingface)
    CACHE_NOTE="$HF_CACHE"
 else
    CACHE_NOTE="(not mounted: $HF_CACHE does not exist)"
 fi
 echo "Starting fine-tuning..."
 echo "  Dataset:    $DATASET"
 echo "  Output:     $OUTPUT_DIR"
 echo "  HF cache:   $CACHE_NOTE"
 echo "  Extra args: $*"
 # Everything after the image name is handed to finetune.py; user-supplied
 # flags come last.
 docker run "${DOCKER_ARGS[@]}" \
    "$IMAGE" \
    --dataset /workspace/dataset.jsonl \
    --output-dir /workspace/output/qwen-bill-summarizer \
    "$@"
 echo "Done! Model saved to $OUTPUT_DIR"