Add functionality to evaluate any model relative to the main model

2025-10-24 13:38:03 +05:30
parent e6aba71186
commit cf57a0cfbe
3 changed files with 20 additions and 3 deletions
@@ -23,6 +23,11 @@ class DatasetSpecification(BaseModel):
 class Settings(BaseSettings):
    model: str = Field(description="Hugging Face model ID, or path to model on disk")
    evaluate_model: str | None = Field(
        default=None,
        description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model",
    )
    dtypes: list[str] = Field(
        description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried."
    )
@@ -73,7 +73,10 @@ class Evaluator:
            ** self.settings.kl_score_shape
        )
-        if kl_divergence > self.settings.max_kl_divergence:
+        if (
            self.settings.evaluate_model is None
            and kl_divergence > self.settings.max_kl_divergence
        ):
            print(" [yellow](constraint violation; aborting trial)[/]")
            return kl_score, kl_divergence, self.base_refusals
        else:
@@ -156,6 +156,17 @@ def run():
        settings.batch_size = best_batch_size
        print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")
    evaluator = Evaluator(settings, model)
    if settings.evaluate_model is not None:
        print()
        print(f"Loading model [bold]{settings.evaluate_model}[/]...")
        settings.model = settings.evaluate_model
        model.reload_model()
        print("* Evaluating...")
        evaluator.get_score()
        return
    print()
    print("Calculating per-layer refusal directions...")
    print("* Obtaining residuals for good prompts...")
@@ -166,8 +177,6 @@ def run():
        bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
    )
    evaluator = Evaluator(settings, model)
    trial_index = 0
    def objective(trial: optuna.Trial):