Add functionality to evaluate any model relative to the main model
This commit is contained in:
@@ -23,6 +23,11 @@ class DatasetSpecification(BaseModel):
|
|||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
model: str = Field(description="Hugging Face model ID, or path to model on disk")
|
model: str = Field(description="Hugging Face model ID, or path to model on disk")
|
||||||
|
|
||||||
|
evaluate_model: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model",
|
||||||
|
)
|
||||||
|
|
||||||
dtypes: list[str] = Field(
|
dtypes: list[str] = Field(
|
||||||
description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried."
|
description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -73,7 +73,10 @@ class Evaluator:
|
|||||||
** self.settings.kl_score_shape
|
** self.settings.kl_score_shape
|
||||||
)
|
)
|
||||||
|
|
||||||
if kl_divergence > self.settings.max_kl_divergence:
|
if (
|
||||||
|
self.settings.evaluate_model is None
|
||||||
|
and kl_divergence > self.settings.max_kl_divergence
|
||||||
|
):
|
||||||
print(" [yellow](constraint violation; aborting trial)[/]")
|
print(" [yellow](constraint violation; aborting trial)[/]")
|
||||||
return kl_score, kl_divergence, self.base_refusals
|
return kl_score, kl_divergence, self.base_refusals
|
||||||
else:
|
else:
|
||||||
|
|||||||
+11
-2
@@ -156,6 +156,17 @@ def run():
|
|||||||
settings.batch_size = best_batch_size
|
settings.batch_size = best_batch_size
|
||||||
print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")
|
print(f"* Chosen batch size: [bold]{settings.batch_size}[/]")
|
||||||
|
|
||||||
|
evaluator = Evaluator(settings, model)
|
||||||
|
|
||||||
|
if settings.evaluate_model is not None:
|
||||||
|
print()
|
||||||
|
print(f"Loading model [bold]{settings.evaluate_model}[/]...")
|
||||||
|
settings.model = settings.evaluate_model
|
||||||
|
model.reload_model()
|
||||||
|
print("* Evaluating...")
|
||||||
|
evaluator.get_score()
|
||||||
|
return
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print("Calculating per-layer refusal directions...")
|
print("Calculating per-layer refusal directions...")
|
||||||
print("* Obtaining residuals for good prompts...")
|
print("* Obtaining residuals for good prompts...")
|
||||||
@@ -166,8 +177,6 @@ def run():
|
|||||||
bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
|
bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
|
||||||
)
|
)
|
||||||
|
|
||||||
evaluator = Evaluator(settings, model)
|
|
||||||
|
|
||||||
trial_index = 0
|
trial_index = 0
|
||||||
|
|
||||||
def objective(trial: optuna.Trial):
|
def objective(trial: optuna.Trial):
|
||||||
|
|||||||
Reference in New Issue
Block a user