Add functionality to evaluate any model relative to the main model

2025-10-24 13:38:03 +05:30
parent e6aba71186
commit cf57a0cfbe
3 changed files with 20 additions and 3 deletions
@@ -23,6 +23,11 @@ class DatasetSpecification(BaseModel):
 class Settings(BaseSettings):
    model: str = Field(description="Hugging Face model ID, or path to model on disk")

+    evaluate_model: str | None = Field(
+        default=None,
+        description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model",
+    )
+
    dtypes: list[str] = Field(
        description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried."
    )
@@ -73,7 +73,10 @@ class Evaluator:
            ** self.settings.kl_score_shape
        )

-        if kl_divergence > self.settings.max_kl_divergence:
+        if (
+            self.settings.evaluate_model is None
+            and kl_divergence > self.settings.max_kl_divergence
+        ):
            print(" [yellow](constraint violation; aborting trial)[/]")
            return kl_score, kl_divergence, self.base_refusals
        else:
@@ -156,6 +156,17 @@ def run():
        settings.batch_size = best_batch_size
        print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")

+    evaluator = Evaluator(settings, model)
+
+    if settings.evaluate_model is not None:
+        print()
+        print(f"Loading model [bold]{settings.evaluate_model}[/]...")
+        settings.model = settings.evaluate_model
+        model.reload_model()
+        print("* Evaluating...")
+        evaluator.get_score()
+        return
+
    print()
    print("Calculating per-layer refusal directions...")
    print("* Obtaining residuals for good prompts...")
@@ -166,8 +177,6 @@ def run():
        bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
    )

-    evaluator = Evaluator(settings, model)
-
    trial_index = 0

    def objective(trial: optuna.Trial):