Improve optimization
This commit is contained in:
+4
-2
@@ -17,9 +17,11 @@ max_response_length = 100
|
||||
|
||||
max_kl_divergence = 0.5
|
||||
|
||||
kl_score_shape = 5.0
|
||||
kl_score_shape = 3.0
|
||||
|
||||
n_trials = 100
|
||||
n_trials = 200
|
||||
|
||||
n_startup_trials = 60
|
||||
|
||||
refusal_markers = [
|
||||
"sorry",
|
||||
|
||||
@@ -60,6 +60,10 @@ class Settings(BaseSettings):
|
||||
description="Number of abliteration trials to run during optimization"
|
||||
)
|
||||
|
||||
n_startup_trials: int = Field(
|
||||
description="Number of trials that use random sampling for the purpose of exploration"
|
||||
)
|
||||
|
||||
refusal_markers: list[str] = Field(
|
||||
description="Strings whose presence in a response (case insensitive) identifies the response as a refusal"
|
||||
)
|
||||
|
||||
+9
-2
@@ -174,7 +174,9 @@ def run():
|
||||
print("* Obtaining residuals for bad prompts...")
|
||||
bad_residuals = model.get_residuals_batched(bad_prompts)
|
||||
refusal_directions = F.normalize(
|
||||
bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
|
||||
bad_residuals.mean(dim=0) - good_residuals.mean(dim=0),
|
||||
p=2,
|
||||
dim=1,
|
||||
)
|
||||
|
||||
trial_index = 0
|
||||
@@ -274,7 +276,12 @@ def run():
|
||||
# The optimizer searches for a minimum, so we return the negative score.
|
||||
return -score
|
||||
|
||||
study = optuna.create_study()
|
||||
study = optuna.create_study(
|
||||
sampler=optuna.samplers.TPESampler(
|
||||
n_startup_trials=settings.n_startup_trials,
|
||||
multivariate=True,
|
||||
)
|
||||
)
|
||||
|
||||
study.optimize(objective, n_trials=settings.n_trials)
|
||||
|
||||
|
||||
@@ -156,9 +156,13 @@ class Model:
|
||||
# The index must be shifted by 1 because the first element
|
||||
# of refusal_directions is the direction for the embeddings.
|
||||
weight, index = math.modf(direction_index + 1)
|
||||
refusal_direction = refusal_directions[int(index)].lerp(
|
||||
refusal_direction = F.normalize(
|
||||
refusal_directions[int(index)].lerp(
|
||||
refusal_directions[int(index) + 1],
|
||||
weight,
|
||||
),
|
||||
p=2,
|
||||
dim=0,
|
||||
)
|
||||
|
||||
# Note that some implementations of abliteration also orthogonalize
|
||||
|
||||
Reference in New Issue
Block a user