feat: add integrated benchmarking system

2026-03-24 18:25:12 +05:30
parent 19cdf7e244
commit 1126332281
5 changed files with 733 additions and 4 deletions
@@ -3,9 +3,11 @@

 from pathlib import Path

+import numpy as np
 import torch
 import torch.linalg as LA
 import torch.nn.functional as F
+from numpy.typing import NDArray
 from rich.progress import track
 from rich.table import Table
 from torch import Tensor
@@ -156,11 +158,9 @@ class Analyzer:
        try:
            import imageio.v3 as iio  # ty:ignore[unresolved-import]
            import matplotlib.pyplot as plt  # ty:ignore[unresolved-import]
-            import numpy as np  # ty:ignore[unresolved-import]
            from geom_median.numpy import (  # ty:ignore[unresolved-import]
                compute_geometric_median,
            )
-            from numpy.typing import NDArray  # ty:ignore[unresolved-import]
            from pacmap import PaCMAP  # ty:ignore[unresolved-import]
        except ImportError:
            print()
@@ -61,6 +61,18 @@ class DatasetSpecification(BaseModel):
    )


+class BenchmarkSpecification(BaseModel):  # Harness task ID plus display text for one benchmark.
+    task: str = Field(
+        description="Task ID of the benchmark in the Language Model Evaluation Harness."
+    )
+
+    name: str = Field(description="Name of the benchmark for presentation purposes.")
+
+    description: str = Field(
+        description="Description of the benchmark for presentation purposes."
+    )
+
+
 class Settings(BaseSettings):
    model: str = Field(description="Hugging Face model ID, or path to model on disk.")

@@ -230,6 +242,67 @@ class Settings(BaseSettings):
        description="Directory to save and load study progress to/from.",
    )

+    benchmarks: list[BenchmarkSpecification] = Field(
+        default=[
+            BenchmarkSpecification(
+                task="agieval",
+                name="AGIEval",
+                description="A Human-Centric Benchmark for Evaluating Foundation Models",
+            ),
+            BenchmarkSpecification(
+                task="bbh",
+                name="BIG-Bench Hard (BBH)",
+                description="Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them",
+            ),
+            BenchmarkSpecification(
+                task="commonsense_qa",
+                name="CommonsenseQA",
+                description="A Question Answering Challenge Targeting Commonsense Knowledge",
+            ),
+            BenchmarkSpecification(
+                task="eq_bench",
+                name="EQ-Bench",
+                description="An Emotional Intelligence Benchmark for Large Language Models",
+            ),
+            BenchmarkSpecification(
+                task="gsm8k",
+                name="GSM8K",
+                description="Training Verifiers to Solve Math Word Problems",
+            ),
+            BenchmarkSpecification(
+                task="hellaswag",
+                name="HellaSwag",
+                description="Can a Machine Really Finish Your Sentence?",
+            ),
+            BenchmarkSpecification(
+                task="ifeval",
+                name="IFEval",
+                description="Instruction-Following Evaluation for Large Language Models",
+            ),
+            BenchmarkSpecification(
+                task="mmlu",
+                name="MMLU",
+                description="Measuring Massive Multitask Language Understanding",
+            ),
+            BenchmarkSpecification(
+                task="mmlu_pro",
+                name="MMLU-Pro",
+                description="A More Robust and Challenging Multi-Task Language Understanding Benchmark",
+            ),
+            BenchmarkSpecification(
+                task="piqa",
+                name="PIQA",
+                description="Reasoning about Physical Commonsense in Natural Language",
+            ),
+            BenchmarkSpecification(
+                task="winogrande",
+                name="WinoGrande",
+                description="An Adversarial Winograd Schema Challenge at Scale",
+            ),
+        ],
+        description="Benchmarks to offer to the user for evaluating abliterated models.",
+    )
+
    refusal_markers: list[str] = Field(
        default=[
            "sorry",
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # Copyright (C) 2025-2026  Philipp Emanuel Weidmann <pew@worldwidemann.com> + contributors

+import logging
 import math
 import os
 import sys
@@ -10,9 +11,13 @@ from dataclasses import asdict
 from importlib.metadata import version
 from os.path import commonprefix
 from pathlib import Path
+from typing import Any

 import huggingface_hub
+import lm_eval
+import numpy as np
 import optuna
+import questionary
 import torch
 import torch.nn.functional as F
 import transformers
@@ -24,6 +29,7 @@ from accelerate.utils import (
    is_xpu_available,
 )
 from huggingface_hub import ModelCard, ModelCardData
+from lm_eval.models.huggingface import HFLM
 from optuna import Trial, TrialPruned
 from optuna.exceptions import ExperimentalWarning
 from optuna.samplers import TPESampler
@@ -32,7 +38,8 @@ from optuna.storages.journal import JournalFileBackend, JournalFileOpenLock
 from optuna.study import StudyDirection
 from optuna.trial import TrialState
 from pydantic import ValidationError
-from questionary import Choice
+from questionary import Choice, Style
+from rich.table import Table
 from rich.traceback import install

 from .analyzer import Analyzer
@@ -225,6 +232,9 @@ def run():
    # In my entire career I've never seen a useful warning from that library.
    transformers.logging.set_verbosity_error()

+    # Another library that generates warning spam.
+    logging.getLogger("lm_eval").setLevel(logging.ERROR)
+
    # We do our own trial logging, so we don't need the INFO messages
    # about parameters and results.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -752,6 +762,7 @@ def run():
                        "Save the model to a local folder",
                        "Upload the model to Hugging Face",
                        "Chat with the model",
+                        "Benchmark the model",
                        "Return to the trial selection menu",
                    ],
                )
@@ -816,6 +827,8 @@ def run():
                                    "Private",
                                ],
                            )
+                            if visibility is None:
+                                continue
                            private = visibility == "Private"

                            strategy = obtain_merge_strategy(settings)
@@ -913,6 +926,113 @@ def run():
                                    # Ctrl+C/Ctrl+D
                                    break

+                        case "Benchmark the model":
+                            benchmarks = questionary.checkbox(
+                                "Which benchmarks do you want to run?",
+                                [
+                                    Choice(
+                                        title=f"{benchmark.name}: {benchmark.description}",
+                                        value=benchmark,
+                                    )
+                                    for benchmark in settings.benchmarks
+                                ],
+                                style=Style([("highlighted", "reverse")]),
+                            ).ask()
+                            if not benchmarks:
+                                continue
+
+                            scope = prompt_select(
+                                (
+                                    "Do you want to benchmark the original model along with the decensored model? "
+                                    "Benchmarking both models allows you to compare the scores, but it takes twice as much time."
+                                ),
+                                [
+                                    "Benchmark only the decensored model",
+                                    "Benchmark both models",
+                                ],
+                            )
+                            if scope is None:
+                                continue
+                            benchmark_original_model = scope == "Benchmark both models"
+
+                            hflm = HFLM(
+                                pretrained=model.model,  # ty:ignore[invalid-argument-type]
+                                tokenizer=model.tokenizer,  # ty:ignore[invalid-argument-type]
+                            )
+
+                            table = Table()
+                            table.add_column("Benchmark")
+                            table.add_column("Metric")
+                            if benchmark_original_model:
+                                table.add_column("This model", justify="right")
+                                table.add_column("Original model", justify="right")
+                            else:
+                                table.add_column("Value", justify="right")
+
+                            try:
+                                first_benchmark = True
+
+                                for benchmark in benchmarks:
+                                    print(
+                                        f"Running benchmark [bold]{benchmark.name}[/]..."
+                                    )
+
+                                    def get_results() -> dict[str, Any]:  # Evaluates hflm on the current benchmark's task.
+                                        results = lm_eval.simple_evaluate(
+                                            model=hflm,
+                                            tasks=[benchmark.task],
+                                            batch_size="auto",
+                                        )
+                                        return results["results"][benchmark.task]  # Only this task's entry is returned.
+
+                                    results = get_results()
+                                    if benchmark_original_model:
+                                        with model.model.disable_adapter():  # ty:ignore[call-non-callable]
+                                            original_results = get_results()
+
+                                    first_row = True
+
+                                    for metric, value in results.items():
+                                        if metric != "alias":
+                                            if first_row and not first_benchmark:
+                                                if benchmark_original_model:
+                                                    table.add_row("", "", "", "")
+                                                else:
+                                                    table.add_row("", "", "")
+
+                                            def format_value(value: Any) -> str:  # Renders one metric value as table cell text.
+                                                if isinstance(
+                                                    value,
+                                                    (float, np.floating),
+                                                ):
+                                                    return f"{value:.4f}"  # Python and NumPy floats: four decimal places.
+                                                else:
+                                                    return f"{value}"  # Anything else: default string conversion.
+
+                                            cells = [
+                                                benchmark.name if first_row else "",
+                                                metric,
+                                                format_value(value),
+                                            ]
+                                            if benchmark_original_model:
+                                                cells.append(
+                                                    format_value(
+                                                        original_results[metric]
+                                                    )
+                                                )
+                                            table.add_row(*cells)
+
+                                            first_row = False
+                                            first_benchmark = False
+                            except KeyboardInterrupt:
+                                pass
+
+                            # The benchmark run might have been cancelled by the user
+                            # before any benchmark was completed, so we only print results
+                            # if there actually are some.
+                            if table.rows:
+                                print(table)
+
                except Exception as error:
                    print(f"[red]Error: {error}[/]")