feat: add integrated benchmarking system

This commit is contained in:
Philipp Emanuel Weidmann
2026-03-24 18:25:12 +05:30
parent 19cdf7e244
commit 1126332281
5 changed files with 733 additions and 4 deletions
+2 -2
View File
@@ -3,9 +3,11 @@
from pathlib import Path
import numpy as np
import torch
import torch.linalg as LA
import torch.nn.functional as F
from numpy.typing import NDArray
from rich.progress import track
from rich.table import Table
from torch import Tensor
@@ -156,11 +158,9 @@ class Analyzer:
try:
import imageio.v3 as iio # ty:ignore[unresolved-import]
import matplotlib.pyplot as plt # ty:ignore[unresolved-import]
import numpy as np # ty:ignore[unresolved-import]
from geom_median.numpy import ( # ty:ignore[unresolved-import]
compute_geometric_median,
)
from numpy.typing import NDArray # ty:ignore[unresolved-import]
from pacmap import PaCMAP # ty:ignore[unresolved-import]
except ImportError:
print()
+73
View File
@@ -61,6 +61,18 @@ class DatasetSpecification(BaseModel):
)
class BenchmarkSpecification(BaseModel):
    # One benchmark entry: the harness task to run, plus the two strings
    # used when presenting it.
    # (Plain comments are used instead of a class docstring so that the
    # generated model schema is not affected.)

    # Identifier of the task inside the evaluation harness.
    task: str = Field(
        description=(
            "Task ID of the benchmark in the Language Model Evaluation Harness."
        ),
    )

    # Short display name.
    name: str = Field(
        description="Name of the benchmark for presentation purposes.",
    )

    # Longer display text shown alongside the name.
    description: str = Field(
        description="Description of the benchmark for presentation purposes.",
    )
class Settings(BaseSettings):
model: str = Field(description="Hugging Face model ID, or path to model on disk.")
@@ -230,6 +242,67 @@ class Settings(BaseSettings):
description="Directory to save and load study progress to/from.",
)
# Default catalogue of benchmarks. For each entry, `task` is the task ID in
# the Language Model Evaluation Harness, while `name` and `description` are
# presentation strings only.
benchmarks: list[BenchmarkSpecification] = Field(
default=[
BenchmarkSpecification(
task="agieval",
name="AGIEval",
description="A Human-Centric Benchmark for Evaluating Foundation Models",
),
BenchmarkSpecification(
task="bbh",
name="BIG-Bench Hard (BBH)",
description="Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them",
),
BenchmarkSpecification(
task="commonsense_qa",
name="CommonsenseQA",
description="A Question Answering Challenge Targeting Commonsense Knowledge",
),
BenchmarkSpecification(
task="eq_bench",
name="EQ-Bench",
description="An Emotional Intelligence Benchmark for Large Language Models",
),
BenchmarkSpecification(
task="gsm8k",
name="GSM8K",
description="Training Verifiers to Solve Math Word Problems",
),
BenchmarkSpecification(
task="hellaswag",
name="HellaSwag",
description="Can a Machine Really Finish Your Sentence?",
),
BenchmarkSpecification(
task="ifeval",
name="IFEval",
description="Instruction-Following Evaluation for Large Language Models",
),
BenchmarkSpecification(
task="mmlu",
name="MMLU",
description="Measuring Massive Multitask Language Understanding",
),
BenchmarkSpecification(
task="mmlu_pro",
name="MMLU-Pro",
description="A More Robust and Challenging Multi-Task Language Understanding Benchmark",
),
BenchmarkSpecification(
task="piqa",
name="PIQA",
description="Reasoning about Physical Commonsense in Natural Language",
),
BenchmarkSpecification(
task="winogrande",
name="WinoGrande",
description="An Adversarial Winograd Schema Challenge at Scale",
),
],
description="Benchmarks to offer to the user for evaluating abliterated models.",
)
refusal_markers: list[str] = Field(
default=[
"sorry",
+121 -1
View File
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2025-2026 Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors
import logging
import math
import os
import sys
@@ -10,9 +11,13 @@ from dataclasses import asdict
from importlib.metadata import version
from os.path import commonprefix
from pathlib import Path
from typing import Any
import huggingface_hub
import lm_eval
import numpy as np
import optuna
import questionary
import torch
import torch.nn.functional as F
import transformers
@@ -24,6 +29,7 @@ from accelerate.utils import (
is_xpu_available,
)
from huggingface_hub import ModelCard, ModelCardData
from lm_eval.models.huggingface import HFLM
from optuna import Trial, TrialPruned
from optuna.exceptions import ExperimentalWarning
from optuna.samplers import TPESampler
@@ -32,7 +38,8 @@ from optuna.storages.journal import JournalFileBackend, JournalFileOpenLock
from optuna.study import StudyDirection
from optuna.trial import TrialState
from pydantic import ValidationError
from questionary import Choice
from questionary import Choice, Style
from rich.table import Table
from rich.traceback import install
from .analyzer import Analyzer
@@ -225,6 +232,9 @@ def run():
# In my entire career I've never seen a useful warning from that library.
transformers.logging.set_verbosity_error()
# Another library that generates warning spam.
logging.getLogger("lm_eval").setLevel(logging.ERROR)
# We do our own trial logging, so we don't need the INFO messages
# about parameters and results.
optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -752,6 +762,7 @@ def run():
"Save the model to a local folder",
"Upload the model to Hugging Face",
"Chat with the model",
"Benchmark the model",
"Return to the trial selection menu",
],
)
@@ -816,6 +827,8 @@ def run():
"Private",
],
)
if visibility is None:
continue
private = visibility == "Private"
strategy = obtain_merge_strategy(settings)
@@ -913,6 +926,113 @@ def run():
# Ctrl+C/Ctrl+D
break
# Menu branch: run the selected benchmarks on the current model (optionally
# also on the original model) and show the scores in a table.
case "Benchmark the model":
# Each checkbox entry is labelled "<name>: <description>" and carries the
# benchmark specification object itself as its value.
benchmarks = questionary.checkbox(
"Which benchmarks do you want to run?",
[
Choice(
title=f"{benchmark.name}: {benchmark.description}",
value=benchmark,
)
for benchmark in settings.benchmarks
],
style=Style([("highlighted", "reverse")]),
).ask()
# Empty or missing selection: nothing to run, so go on with the
# enclosing loop.
if not benchmarks:
continue
scope = prompt_select(
(
"Do you want to benchmark the original model along with the decensored model? "
"Benchmarking both models allows you to compare the scores, but it takes twice as much time."
),
[
"Benchmark only the decensored model",
"Benchmark both models",
],
)
# NOTE(review): presumably None means the prompt was cancelled --
# confirm against prompt_select.
if scope is None:
continue
benchmark_original_model = scope == "Benchmark both models"
# Hand the in-memory model and tokenizer objects to the harness wrapper.
# NOTE(review): batch_size="auto" is passed to simple_evaluate() below, not
# to HFLM(). With a pre-built LM object the harness may not apply that
# argument, leaving the wrapper at its own default batch size -- confirm
# against the lm-eval documentation.
hflm = HFLM(
pretrained=model.model, # ty:ignore[invalid-argument-type]
tokenizer=model.tokenizer, # ty:ignore[invalid-argument-type]
)
# Two value columns when comparing against the original model, one otherwise.
table = Table()
table.add_column("Benchmark")
table.add_column("Metric")
if benchmark_original_model:
table.add_column("This model", justify="right")
table.add_column("Original model", justify="right")
else:
table.add_column("Value", justify="right")
try:
# Used to decide whether a blank separator row goes before a benchmark's
# first row.
first_benchmark = True
for benchmark in benchmarks:
print(
f"Running benchmark [bold]{benchmark.name}[/]..."
)
# Evaluates the current benchmark's task and returns that task's entry
# from the "results" section of the harness output.
def get_results() -> dict[str, Any]:
results = lm_eval.simple_evaluate(
model=hflm,
tasks=[benchmark.task],
batch_size="auto",
)
return results["results"][benchmark.task]
results = get_results()
if benchmark_original_model:
# Second pass with the adapter disabled.
# NOTE(review): assumes model.model is an adapter-wrapped model, so this
# pass scores the unmodified original model -- confirm.
with model.model.disable_adapter(): # ty:ignore[call-non-callable]
original_results = get_results()
first_row = True
for metric, value in results.items():
# Every entry except "alias" becomes a table row.
if metric != "alias":
# Blank row between benchmarks; the cell count matches the column
# count chosen above.
if first_row and not first_benchmark:
if benchmark_original_model:
table.add_row("", "", "", "")
else:
table.add_row("", "", "")
# Floats (Python or NumPy) are shown with four decimal places; any
# other value is shown via its default string form.
def format_value(value: Any) -> str:
if isinstance(
value,
(float, np.floating),
):
return f"{value:.4f}"
else:
return f"{value}"
# The benchmark name is shown only on the first row of its group.
cells = [
benchmark.name if first_row else "",
metric,
format_value(value),
]
if benchmark_original_model:
# Same metric key, looked up in the original model's results.
cells.append(
format_value(
original_results[metric]
)
)
table.add_row(*cells)
first_row = False
first_benchmark = False
except KeyboardInterrupt:
# Ctrl+C stops the run; rows already added to the table are kept.
pass
# The benchmark run might have been cancelled by the user
# before any benchmark was completed, so we only print results
# if there actually are some.
if table.rows:
print(table)
except Exception as error:
print(f"[red]Error: {error}[/]")