feat: add integrated benchmarking system
This commit is contained in:
@@ -3,9 +3,11 @@
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.linalg as LA
|
||||
import torch.nn.functional as F
|
||||
from numpy.typing import NDArray
|
||||
from rich.progress import track
|
||||
from rich.table import Table
|
||||
from torch import Tensor
|
||||
@@ -156,11 +158,9 @@ class Analyzer:
|
||||
try:
|
||||
import imageio.v3 as iio # ty:ignore[unresolved-import]
|
||||
import matplotlib.pyplot as plt # ty:ignore[unresolved-import]
|
||||
import numpy as np # ty:ignore[unresolved-import]
|
||||
from geom_median.numpy import ( # ty:ignore[unresolved-import]
|
||||
compute_geometric_median,
|
||||
)
|
||||
from numpy.typing import NDArray # ty:ignore[unresolved-import]
|
||||
from pacmap import PaCMAP # ty:ignore[unresolved-import]
|
||||
except ImportError:
|
||||
print()
|
||||
|
||||
@@ -61,6 +61,18 @@ class DatasetSpecification(BaseModel):
|
||||
)
|
||||
|
||||
|
||||
class BenchmarkSpecification(BaseModel):
    # Describes a single benchmark: the harness task ID to run, plus a
    # name and a description that are used for presentation only.
    # NOTE: documented with comments rather than a class docstring so the
    # model's generated schema stays exactly as before.

    # ID under which the benchmark is known to the evaluation harness.
    task: str = Field(
        description=(
            "Task ID of the benchmark in the Language Model Evaluation Harness."
        ),
    )

    # Human-readable name.
    name: str = Field(
        description="Name of the benchmark for presentation purposes.",
    )

    # Human-readable description.
    description: str = Field(
        description="Description of the benchmark for presentation purposes.",
    )
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
model: str = Field(description="Hugging Face model ID, or path to model on disk.")
|
||||
|
||||
@@ -230,6 +242,67 @@ class Settings(BaseSettings):
|
||||
description="Directory to save and load study progress to/from.",
|
||||
)
|
||||
|
||||
benchmarks: list[BenchmarkSpecification] = Field(
|
||||
default=[
|
||||
BenchmarkSpecification(
|
||||
task="agieval",
|
||||
name="AGIEval",
|
||||
description="A Human-Centric Benchmark for Evaluating Foundation Models",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="bbh",
|
||||
name="BIG-Bench Hard (BBH)",
|
||||
description="Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="commonsense_qa",
|
||||
name="CommonsenseQA",
|
||||
description="A Question Answering Challenge Targeting Commonsense Knowledge",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="eq_bench",
|
||||
name="EQ-Bench",
|
||||
description="An Emotional Intelligence Benchmark for Large Language Models",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="gsm8k",
|
||||
name="GSM8K",
|
||||
description="Training Verifiers to Solve Math Word Problems",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="hellaswag",
|
||||
name="HellaSwag",
|
||||
description="Can a Machine Really Finish Your Sentence?",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="ifeval",
|
||||
name="IFEval",
|
||||
description="Instruction-Following Evaluation for Large Language Models",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="mmlu",
|
||||
name="MMLU",
|
||||
description="Measuring Massive Multitask Language Understanding",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="mmlu_pro",
|
||||
name="MMLU-Pro",
|
||||
description="A More Robust and Challenging Multi-Task Language Understanding Benchmark",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="piqa",
|
||||
name="PIQA",
|
||||
description="Reasoning about Physical Commonsense in Natural Language",
|
||||
),
|
||||
BenchmarkSpecification(
|
||||
task="winogrande",
|
||||
name="WinoGrande",
|
||||
description="An Adversarial Winograd Schema Challenge at Scale",
|
||||
),
|
||||
],
|
||||
description="Benchmarks to offer to the user for evaluating abliterated models.",
|
||||
)
|
||||
|
||||
refusal_markers: list[str] = Field(
|
||||
default=[
|
||||
"sorry",
|
||||
|
||||
+121
-1
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# Copyright (C) 2025-2026 Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors
|
||||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
@@ -10,9 +11,13 @@ from dataclasses import asdict
|
||||
from importlib.metadata import version
|
||||
from os.path import commonprefix
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import huggingface_hub
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import optuna
|
||||
import questionary
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
@@ -24,6 +29,7 @@ from accelerate.utils import (
|
||||
is_xpu_available,
|
||||
)
|
||||
from huggingface_hub import ModelCard, ModelCardData
|
||||
from lm_eval.models.huggingface import HFLM
|
||||
from optuna import Trial, TrialPruned
|
||||
from optuna.exceptions import ExperimentalWarning
|
||||
from optuna.samplers import TPESampler
|
||||
@@ -32,7 +38,8 @@ from optuna.storages.journal import JournalFileBackend, JournalFileOpenLock
|
||||
from optuna.study import StudyDirection
|
||||
from optuna.trial import TrialState
|
||||
from pydantic import ValidationError
|
||||
from questionary import Choice
|
||||
from questionary import Choice, Style
|
||||
from rich.table import Table
|
||||
from rich.traceback import install
|
||||
|
||||
from .analyzer import Analyzer
|
||||
@@ -225,6 +232,9 @@ def run():
|
||||
# In my entire career I've never seen a useful warning from that library.
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
# Another library that generates warning spam.
|
||||
logging.getLogger("lm_eval").setLevel(logging.ERROR)
|
||||
|
||||
# We do our own trial logging, so we don't need the INFO messages
|
||||
# about parameters and results.
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
@@ -752,6 +762,7 @@ def run():
|
||||
"Save the model to a local folder",
|
||||
"Upload the model to Hugging Face",
|
||||
"Chat with the model",
|
||||
"Benchmark the model",
|
||||
"Return to the trial selection menu",
|
||||
],
|
||||
)
|
||||
@@ -816,6 +827,8 @@ def run():
|
||||
"Private",
|
||||
],
|
||||
)
|
||||
if visibility is None:
|
||||
continue
|
||||
private = visibility == "Private"
|
||||
|
||||
strategy = obtain_merge_strategy(settings)
|
||||
@@ -913,6 +926,113 @@ def run():
|
||||
# Ctrl+C/Ctrl+D
|
||||
break
|
||||
|
||||
case "Benchmark the model":
|
||||
benchmarks = questionary.checkbox(
|
||||
"Which benchmarks do you want to run?",
|
||||
[
|
||||
Choice(
|
||||
title=f"{benchmark.name}: {benchmark.description}",
|
||||
value=benchmark,
|
||||
)
|
||||
for benchmark in settings.benchmarks
|
||||
],
|
||||
style=Style([("highlighted", "reverse")]),
|
||||
).ask()
|
||||
if not benchmarks:
|
||||
continue
|
||||
|
||||
scope = prompt_select(
|
||||
(
|
||||
"Do you want to benchmark the original model along with the decensored model? "
|
||||
"Benchmarking both models allows you to compare the scores, but it takes twice as much time."
|
||||
),
|
||||
[
|
||||
"Benchmark only the decensored model",
|
||||
"Benchmark both models",
|
||||
],
|
||||
)
|
||||
if scope is None:
|
||||
continue
|
||||
benchmark_original_model = scope == "Benchmark both models"
|
||||
|
||||
hflm = HFLM(
|
||||
pretrained=model.model, # ty:ignore[invalid-argument-type]
|
||||
tokenizer=model.tokenizer, # ty:ignore[invalid-argument-type]
|
||||
)
|
||||
|
||||
table = Table()
|
||||
table.add_column("Benchmark")
|
||||
table.add_column("Metric")
|
||||
if benchmark_original_model:
|
||||
table.add_column("This model", justify="right")
|
||||
table.add_column("Original model", justify="right")
|
||||
else:
|
||||
table.add_column("Value", justify="right")
|
||||
|
||||
try:
|
||||
first_benchmark = True
|
||||
|
||||
for benchmark in benchmarks:
|
||||
print(
|
||||
f"Running benchmark [bold]{benchmark.name}[/]..."
|
||||
)
|
||||
|
||||
def get_results() -> dict[str, Any]:
    # Run the evaluation harness on the current benchmark's task using the
    # wrapped model, then return only the entry stored under that task ID.
    task_id = benchmark.task
    evaluation = lm_eval.simple_evaluate(
        model=hflm,
        tasks=[task_id],
        batch_size="auto",
    )
    return evaluation["results"][task_id]
|
||||
|
||||
results = get_results()
|
||||
if benchmark_original_model:
|
||||
with model.model.disable_adapter(): # ty:ignore[call-non-callable]
|
||||
original_results = get_results()
|
||||
|
||||
first_row = True
|
||||
|
||||
for metric, value in results.items():
|
||||
if metric != "alias":
|
||||
if first_row and not first_benchmark:
|
||||
if benchmark_original_model:
|
||||
table.add_row("", "", "", "")
|
||||
else:
|
||||
table.add_row("", "", "")
|
||||
|
||||
def format_value(value: Any) -> str:
    # Floating-point values (built-in or NumPy) are rendered with four
    # decimal places; any other value uses its default formatting.
    is_floating = isinstance(value, float) or isinstance(value, np.floating)
    return f"{value:.4f}" if is_floating else f"{value}"
|
||||
|
||||
cells = [
|
||||
benchmark.name if first_row else "",
|
||||
metric,
|
||||
format_value(value),
|
||||
]
|
||||
if benchmark_original_model:
|
||||
cells.append(
|
||||
format_value(
|
||||
original_results[metric]
|
||||
)
|
||||
)
|
||||
table.add_row(*cells)
|
||||
|
||||
first_row = False
|
||||
first_benchmark = False
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
# The benchmark run might have been cancelled by the user
|
||||
# before any benchmark was completed, so we only print results
|
||||
# if there actually are some.
|
||||
if table.rows:
|
||||
print(table)
|
||||
|
||||
except Exception as error:
|
||||
print(f"[red]Error: {error}[/]")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user