Add functionality to evaluate any model relative to the main model

This commit is contained in:
Philipp Emanuel Weidmann
2025-10-24 13:38:03 +05:30
parent e6aba71186
commit cf57a0cfbe
3 changed files with 20 additions and 3 deletions
+5
View File
@@ -23,6 +23,11 @@ class DatasetSpecification(BaseModel):
class Settings(BaseSettings):
model: str = Field(description="Hugging Face model ID, or path to model on disk")
evaluate_model: str | None = Field(
default=None,
description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model",
)
dtypes: list[str] = Field(
description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried."
)
+4 -1
View File
@@ -73,7 +73,10 @@ class Evaluator:
** self.settings.kl_score_shape
)
if kl_divergence > self.settings.max_kl_divergence:
if (
self.settings.evaluate_model is None
and kl_divergence > self.settings.max_kl_divergence
):
print(" [yellow](constraint violation; aborting trial)[/]")
return kl_score, kl_divergence, self.base_refusals
else:
+11 -2
View File
@@ -156,6 +156,17 @@ def run():
settings.batch_size = best_batch_size
print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")
evaluator = Evaluator(settings, model)
if settings.evaluate_model is not None:
print()
print(f"Loading model [bold]{settings.evaluate_model}[/]...")
settings.model = settings.evaluate_model
model.reload_model()
print("* Evaluating...")
evaluator.get_score()
return
print()
print("Calculating per-layer refusal directions...")
print("* Obtaining residuals for good prompts...")
@@ -166,8 +177,6 @@ def run():
bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
)
evaluator = Evaluator(settings, model)
trial_index = 0
def objective(trial: optuna.Trial):