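Improve optimization: use a multivariate TPE sampler with configurable startup trials, re-normalize interpolated refusal directions, and retune defaults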

2025-10-31 16:04:28 +05:30
parent a9655c8d31
commit a24e6eba96
4 changed files with 24 additions and 7 deletions
@@ -17,9 +17,11 @@ max_response_length = 100

 max_kl_divergence = 0.5

-kl_score_shape = 5.0
+kl_score_shape = 3.0

-n_trials = 100
+n_trials = 200
+
+n_startup_trials = 60

 refusal_markers = [
    "sorry",
@@ -60,6 +60,10 @@ class Settings(BaseSettings):
        description="Number of abliteration trials to run during optimization"
    )

+    n_startup_trials: int = Field(
+        description="Number of trials that use random sampling for the purpose of exploration"
+    )
+
    refusal_markers: list[str] = Field(
        description="Strings whose presence in a response (case insensitive) identifies the response as a refusal"
    )
@@ -174,7 +174,9 @@ def run():
    print("* Obtaining residuals for bad prompts...")
    bad_residuals = model.get_residuals_batched(bad_prompts)
    refusal_directions = F.normalize(
-        bad_residuals.mean(dim=0) - good_residuals.mean(dim=0), p=2, dim=1
+        bad_residuals.mean(dim=0) - good_residuals.mean(dim=0),
+        p=2,
+        dim=1,
    )

    trial_index = 0
@@ -274,7 +276,12 @@ def run():
        # The optimizer searches for a minimum, so we return the negative score.
        return -score

-    study = optuna.create_study()
+    study = optuna.create_study(
+        sampler=optuna.samplers.TPESampler(
+            n_startup_trials=settings.n_startup_trials,
+            multivariate=True,
+        )
+    )

    study.optimize(objective, n_trials=settings.n_trials)

@@ -156,9 +156,13 @@ class Model:
            # The index must be shifted by 1 because the first element
            # of refusal_directions is the direction for the embeddings.
            weight, index = math.modf(direction_index + 1)
-            refusal_direction = refusal_directions[int(index)].lerp(
+            refusal_direction = F.normalize(
+                refusal_directions[int(index)].lerp(
                    refusal_directions[int(index) + 1],
                    weight,
+                ),
+                p=2,
+                dim=0,
            )

        # Note that some implementations of abliteration also orthogonalize