created working fine-tuning pipeline

This commit is contained in:
2026-04-10 12:56:57 -04:00
parent 921a397b1c
commit 721526022b
4 changed files with 470 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Fine-tune Qwen 3.5 4B on bill summarization data.
#
# Runs the "bill-finetune" container image with the dataset bind-mounted
# read-only at /workspace/dataset.jsonl and the output directory bind-mounted
# at /workspace/output/qwen-bill-summarizer. Any arguments given to this
# script are appended verbatim to the container's arguments.
#
# Host paths are resolved against the current working directory
# (./output/...), so run this script from the directory that contains
# "output/".
#
# Prerequisites:
#   1. Build the dataset: python -m python.prompt_bench.build_finetune_dataset
#   2. Build the image:   docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
#
# Usage:
#   bash python/prompt_bench/train.sh [extra flags passed to finetune.py]
#
# Examples:
#   bash python/prompt_bench/train.sh
#   bash python/prompt_bench/train.sh --epochs 5 --lr 1e-4
#   bash python/prompt_bench/train.sh --val-split 0.15 --save-gguf
#
# Exit status:
#   1 if docker, the dataset, or the image is missing; otherwise, because of
#   `set -e`, the first failing command's status (notably `docker run`).
set -euo pipefail

readonly IMAGE="bill-finetune"

# Declared separately from `readonly` so a failing $(pwd) is not masked.
DATASET="$(pwd)/output/finetune_dataset.jsonl"
OUTPUT_DIR="$(pwd)/output/qwen-bill-summarizer"
readonly DATASET OUTPUT_DIR

# Print each argument on its own line to stderr, then exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Pre-flight checks run before anything is created on disk, so a failed
# prerequisite leaves no empty output directory behind.
command -v docker >/dev/null 2>&1 \
  || die "Error: docker not found in PATH"

if [[ ! -f "$DATASET" ]]; then
  die "Error: Dataset not found at $DATASET" \
    "Run: python -m python.prompt_bench.build_finetune_dataset"
fi

# Without this check a missing local image makes `docker run` attempt a
# registry pull, which fails with an unrelated-looking error.
if ! docker image inspect "$IMAGE" >/dev/null 2>&1; then
  die "Error: Docker image '$IMAGE' not found (or the Docker daemon is unreachable)" \
    "Run: docker build -f python/prompt_bench/Dockerfile.finetune -t $IMAGE ."
fi

mkdir -p "$OUTPUT_DIR"

echo "Starting fine-tuning..."
echo " Dataset: $DATASET"
echo " Output: $OUTPUT_DIR"
echo " Extra args: $*"

# Everything after "$IMAGE" is passed to the image's entrypoint; "$@" keeps
# each caller-supplied flag as a separate word.
docker run --rm \
  --device=nvidia.com/gpu=all \
  --ipc=host \
  -v "$OUTPUT_DIR":/workspace/output/qwen-bill-summarizer \
  -v "$DATASET":/workspace/dataset.jsonl:ro \
  "$IMAGE" \
  --dataset /workspace/dataset.jsonl \
  --output-dir /workspace/output/qwen-bill-summarizer \
  "$@"

echo "Done! Model saved to $OUTPUT_DIR"