moved prompt_bench
This commit is contained in:
@@ -22,4 +22,4 @@ COPY config/prompts/summarization_prompts.toml config/prompts/summarization_prom
|
||||
COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
|
||||
COPY python/__init__.py python/__init__.py
|
||||
|
||||
ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
|
||||
ENTRYPOINT ["python", "-m", "pipelines.prompt_bench.finetune"]
|
||||
@@ -23,9 +23,14 @@ import httpx
|
||||
import typer
|
||||
from tiktoken import Encoding, get_encoding
|
||||
|
||||
from python.prompt_bench.bill_token_compression import compress_bill_text
|
||||
from pipelines.prompt_bench.bill_token_compression import compress_bill_text
|
||||
|
||||
_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
|
||||
_PROMPTS_PATH = (
|
||||
Path(__file__).resolve().parents[2]
|
||||
/ "config"
|
||||
/ "prompts"
|
||||
/ "summarization_prompts.toml"
|
||||
)
|
||||
_PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
|
||||
SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
|
||||
SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
|
||||
@@ -72,7 +77,12 @@ def build_request(custom_id: str, model: str, bill_text: str) -> dict:
|
||||
"model": model,
|
||||
"messages": [
|
||||
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
|
||||
{
|
||||
"role": "user",
|
||||
"content": SUMMARIZATION_USER_TEMPLATE.format(
|
||||
text_content=bill_text
|
||||
),
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
@@ -123,7 +133,9 @@ def prepare_requests(
|
||||
"compressed_chars": len(compressed_text),
|
||||
"raw_tokens": raw_token_count,
|
||||
"compressed_tokens": compressed_token_count,
|
||||
"token_ratio": (compressed_token_count / raw_token_count) if raw_token_count else None,
|
||||
"token_ratio": (compressed_token_count / raw_token_count)
|
||||
if raw_token_count
|
||||
else None,
|
||||
},
|
||||
)
|
||||
safe_id = safe_filename(bill_id)
|
||||
@@ -136,7 +148,14 @@ def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
|
||||
with path.open("w", newline="", encoding="utf-8") as handle:
|
||||
writer = csv.DictWriter(
|
||||
handle,
|
||||
fieldnames=["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"],
|
||||
fieldnames=[
|
||||
"bill_id",
|
||||
"raw_chars",
|
||||
"compressed_chars",
|
||||
"raw_tokens",
|
||||
"compressed_tokens",
|
||||
"token_ratio",
|
||||
],
|
||||
)
|
||||
writer.writeheader()
|
||||
writer.writerows(token_rows)
|
||||
@@ -161,8 +180,12 @@ def create_batch(client: httpx.Client, input_file_id: str, description: str) ->
|
||||
|
||||
|
||||
def main(
|
||||
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
|
||||
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
|
||||
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
|
||||
"bills.csv"
|
||||
),
|
||||
output_dir: Annotated[
|
||||
Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")
|
||||
] = Path(
|
||||
"output/openai_batch",
|
||||
),
|
||||
model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
|
||||
@@ -170,7 +193,9 @@ def main(
|
||||
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
|
||||
) -> None:
|
||||
"""Submit an OpenAI Batch job of compressed bill summaries."""
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
@@ -191,7 +216,9 @@ def main(
|
||||
request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)
|
||||
|
||||
token_csv_path = output_dir / "token_counts.csv"
|
||||
raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
|
||||
raw_tokens_total, compressed_tokens_total = write_token_csv(
|
||||
token_csv_path, token_rows
|
||||
)
|
||||
logger.info(
|
||||
"Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
|
||||
raw_tokens_total,
|
||||
@@ -211,7 +238,11 @@ def main(
|
||||
logger.info("Uploaded: %s", file_id)
|
||||
|
||||
logger.info("Creating batch")
|
||||
batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
|
||||
batch = create_batch(
|
||||
client,
|
||||
file_id,
|
||||
f"compressed bill summaries x{len(request_lines)} ({model})",
|
||||
)
|
||||
logger.info("Batch created: %s", batch["id"])
|
||||
|
||||
metadata = {
|
||||
@@ -24,9 +24,14 @@ from typing import Annotated
|
||||
import httpx
|
||||
import typer
|
||||
|
||||
from python.prompt_bench.bill_token_compression import compress_bill_text
|
||||
from pipelines.prompt_bench.bill_token_compression import compress_bill_text
|
||||
|
||||
_PROMPTS_PATH = Path(__file__).resolve().parents[2] / "config" / "prompts" / "summarization_prompts.toml"
|
||||
_PROMPTS_PATH = (
|
||||
Path(__file__).resolve().parents[2]
|
||||
/ "config"
|
||||
/ "prompts"
|
||||
/ "summarization_prompts.toml"
|
||||
)
|
||||
_PROMPTS = tomllib.loads(_PROMPTS_PATH.read_text())["summarization"]
|
||||
SUMMARIZATION_SYSTEM_PROMPT: str = _PROMPTS["system_prompt"]
|
||||
SUMMARIZATION_USER_TEMPLATE: str = _PROMPTS["user_template"]
|
||||
@@ -62,7 +67,10 @@ def build_messages(bill_text: str) -> list[dict]:
|
||||
"""Return the system + user message pair for a bill."""
|
||||
return [
|
||||
{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)},
|
||||
{
|
||||
"role": "user",
|
||||
"content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text),
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -132,17 +140,25 @@ def run_one_request(
|
||||
|
||||
|
||||
def main(
|
||||
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
|
||||
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
|
||||
csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path(
|
||||
"bills.csv"
|
||||
),
|
||||
output_dir: Annotated[
|
||||
Path, typer.Option("--output-dir", help="Where to write per-request JSON")
|
||||
] = Path(
|
||||
"output/openai_runs",
|
||||
),
|
||||
model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
|
||||
count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
|
||||
concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
|
||||
concurrency: Annotated[
|
||||
int, typer.Option(help="Concurrent in-flight requests")
|
||||
] = 16,
|
||||
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
|
||||
) -> None:
|
||||
"""Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text."""
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
@@ -165,8 +181,17 @@ def main(
|
||||
tasks: list[tuple[str, str, str, Path]] = []
|
||||
for bill_id, text_content in bills:
|
||||
filename = f"{safe_filename(bill_id)}.json"
|
||||
tasks.append((bill_id, "compressed", compress_bill_text(text_content), compressed_dir / filename))
|
||||
tasks.append((bill_id, "uncompressed", text_content, uncompressed_dir / filename))
|
||||
tasks.append(
|
||||
(
|
||||
bill_id,
|
||||
"compressed",
|
||||
compress_bill_text(text_content),
|
||||
compressed_dir / filename,
|
||||
)
|
||||
)
|
||||
tasks.append(
|
||||
(bill_id, "uncompressed", text_content, uncompressed_dir / filename)
|
||||
)
|
||||
|
||||
logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)
|
||||
|
||||
@@ -9,7 +9,7 @@ from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from python.prompt_bench.containers.lib import check_gpu_free
|
||||
from pipelines.prompt_bench.containers.lib import check_gpu_free
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -95,7 +95,9 @@ def stop_finetune() -> None:
|
||||
"""Stop and remove the fine-tuning container."""
|
||||
logger.info("Stopping fine-tuning container")
|
||||
subprocess.run(["docker", "stop", CONTAINER_NAME], capture_output=True, check=False)
|
||||
subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False)
|
||||
subprocess.run(
|
||||
["docker", "rm", "-f", CONTAINER_NAME], capture_output=True, check=False
|
||||
)
|
||||
|
||||
|
||||
def logs_finetune() -> str | None:
|
||||
@@ -125,14 +127,20 @@ def run(
|
||||
dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
|
||||
"/home/richie/dotfiles/data/finetune_dataset.jsonl"
|
||||
),
|
||||
output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
|
||||
output_dir: Annotated[
|
||||
Path, typer.Option(help="Where to save the trained model")
|
||||
] = Path(
|
||||
"/home/richie/dotfiles/data/output/qwen-bill-summarizer",
|
||||
),
|
||||
hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
|
||||
hf_cache: Annotated[
|
||||
Path, typer.Option(help="Host path to HuggingFace model cache")
|
||||
] = DEFAULT_HF_CACHE,
|
||||
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
|
||||
) -> None:
|
||||
"""Run fine-tuning inside a Docker container."""
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
check_gpu_free()
|
||||
start_finetune(
|
||||
dataset_path=dataset,
|
||||
@@ -140,6 +148,7 @@ def run(
|
||||
hf_cache=hf_cache,
|
||||
)
|
||||
|
||||
|
||||
@app.command()
|
||||
def stop() -> None:
|
||||
"""Stop and remove the fine-tuning container."""
|
||||
@@ -9,7 +9,7 @@ from typing import Annotated
|
||||
import typer
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from python.prompt_bench.models import BenchmarkConfig
|
||||
from pipelines.prompt_bench.models import BenchmarkConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -52,11 +52,15 @@ def download_all(config: BenchmarkConfig) -> None:
|
||||
|
||||
|
||||
def main(
|
||||
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
|
||||
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
|
||||
"bench.toml"
|
||||
),
|
||||
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
|
||||
) -> None:
|
||||
"""Download all models listed in the benchmark config."""
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
if not config.is_file():
|
||||
message = f"Config file does not exist: {config}"
|
||||
@@ -5,7 +5,7 @@ applies QLoRA with 4-bit quantization, and saves the merged model
|
||||
in HuggingFace format. Designed for a single RTX 3090 (24GB).
|
||||
|
||||
Usage:
|
||||
python -m python.prompt_bench.finetune \
|
||||
python -m pipelines.prompt_bench.finetune \
|
||||
--dataset output/finetune_dataset.jsonl \
|
||||
--output-dir output/qwen-bill-summarizer
|
||||
"""
|
||||
@@ -107,21 +107,31 @@ def load_dataset_from_jsonl(path: Path) -> Dataset:
|
||||
|
||||
|
||||
def main(
|
||||
dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
|
||||
dataset_path: Annotated[
|
||||
Path, typer.Option("--dataset", help="Fine-tuning JSONL")
|
||||
] = Path(
|
||||
"output/finetune_dataset.jsonl",
|
||||
),
|
||||
validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
|
||||
output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
|
||||
validation_split: Annotated[
|
||||
float, typer.Option("--val-split", help="Fraction held out for validation")
|
||||
] = 0.1,
|
||||
output_dir: Annotated[
|
||||
Path, typer.Option("--output-dir", help="Where to save the merged model")
|
||||
] = Path(
|
||||
"output/qwen-bill-summarizer",
|
||||
),
|
||||
config_path: Annotated[
|
||||
Path,
|
||||
typer.Option("--config", help="TOML config file"),
|
||||
] = Path(__file__).parent / "config.toml",
|
||||
save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
|
||||
save_gguf: Annotated[
|
||||
bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")
|
||||
] = False,
|
||||
) -> None:
|
||||
"""Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA."""
|
||||
logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
if not dataset_path.is_file():
|
||||
message = f"Dataset not found: {dataset_path}"
|
||||
@@ -137,7 +147,9 @@ def main(
|
||||
dtype=None,
|
||||
)
|
||||
|
||||
logger.info("Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha)
|
||||
logger.info(
|
||||
"Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha
|
||||
)
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
r=config.lora.rank,
|
||||
@@ -153,7 +165,9 @@ def main(
|
||||
split = full_dataset.train_test_split(test_size=validation_split, seed=42)
|
||||
train_dataset = split["train"]
|
||||
validation_dataset = split["test"]
|
||||
logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
|
||||
logger.info(
|
||||
"Split: %d train, %d validation", len(train_dataset), len(validation_dataset)
|
||||
)
|
||||
training_args = TrainingArguments(
|
||||
output_dir=str(output_dir / "checkpoints"),
|
||||
num_train_epochs=config.training.epochs,
|
||||
@@ -11,11 +11,11 @@ from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from python.prompt_bench.containers.lib import check_gpu_free
|
||||
from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
|
||||
from python.prompt_bench.downloader import is_model_present
|
||||
from python.prompt_bench.models import BenchmarkConfig
|
||||
from python.prompt_bench.vllm_client import VLLMClient
|
||||
from pipelines.prompt_bench.containers.lib import check_gpu_free
|
||||
from pipelines.prompt_bench.containers.vllm import start_vllm, stop_vllm
|
||||
from pipelines.prompt_bench.downloader import is_model_present
|
||||
from pipelines.prompt_bench.models import BenchmarkConfig
|
||||
from pipelines.prompt_bench.vllm_client import VLLMClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -72,7 +72,9 @@ def benchmark_model(
|
||||
vLLM batches concurrent requests internally, so submitting many at once is
|
||||
significantly faster than running them serially.
|
||||
"""
|
||||
pending = [prompt for prompt in prompts if not (model_output / prompt.name).exists()]
|
||||
pending = [
|
||||
prompt for prompt in prompts if not (model_output / prompt.name).exists()
|
||||
]
|
||||
skipped = len(prompts) - len(pending)
|
||||
if skipped:
|
||||
logger.info("Skipping %d prompts with existing output for %s", skipped, repo)
|
||||
@@ -185,13 +187,21 @@ def run_benchmark(
|
||||
|
||||
|
||||
def main(
|
||||
input_dir: Annotated[Path, typer.Argument(help="Directory containing input .txt prompt files")],
|
||||
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
|
||||
output_dir: Annotated[Path, typer.Option(help="Output directory for results")] = Path("output"),
|
||||
input_dir: Annotated[
|
||||
Path, typer.Argument(help="Directory containing input .txt prompt files")
|
||||
],
|
||||
config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path(
|
||||
"bench.toml"
|
||||
),
|
||||
output_dir: Annotated[
|
||||
Path, typer.Option(help="Output directory for results")
|
||||
] = Path("output"),
|
||||
log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
|
||||
) -> None:
|
||||
"""Run prompts through multiple LLMs via vLLM and save results."""
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
logging.basicConfig(
|
||||
level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
|
||||
)
|
||||
|
||||
if not input_dir.is_dir():
|
||||
message = f"Input directory does not exist: {input_dir}"
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
|
||||
how many oceans are there in the world
|
||||
@@ -1 +0,0 @@
|
||||
whos the president of the united states
|
||||
@@ -1 +0,0 @@
|
||||
whats the greatest country in the world
|
||||
@@ -1 +0,0 @@
|
||||
was/is the usa the greatest country in the world
|
||||
Reference in New Issue
Block a user