From 850c21b5347d2f2eb4f9ca9be1bc77732d5a441f Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Sat, 1 Nov 2025 16:57:12 +0530
Subject: [PATCH] Make multivariate TPE work properly

---
 src/heretic/main.py  | 63 ++++++++++++++++++++++++++------------------
 src/heretic/utils.py | 24 ++++++++++++-----
 2 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index bb86938..42b91b4 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -4,6 +4,7 @@
 import math
 import sys
 import time
+import warnings
 from importlib.metadata import version
 from pathlib import Path
 
@@ -27,7 +28,13 @@ from rich.traceback import install
 from .config import Settings
 from .evaluator import Evaluator
 from .model import AbliterationParameters, Model
-from .utils import format_duration, get_readme_intro, load_prompts, print
+from .utils import (
+    format_duration,
+    get_readme_intro,
+    get_trial_parameters,
+    load_prompts,
+    print,
+)
 
 
 def run():
@@ -98,6 +105,9 @@ def run():
     # about parameters and results.
     optuna.logging.set_verbosity(optuna.logging.WARNING)
 
+    # Silence the warning about multivariate TPE being experimental.
+    warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
+
     model = Model(settings)
 
     print()
@@ -195,16 +205,20 @@ def run():
             ],
         )
 
-        if direction_scope == "global":
-            # Discrimination between "harmful" and "harmless" inputs is usually strongest
-            # in layers slightly past the midpoint of the layer stack. See the original
-            # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
-            direction_index = trial.suggest_float(
-                "direction_index",
-                0.4 * (len(model.get_layers()) - 1),
-                0.9 * (len(model.get_layers()) - 1),
-            )
-        else:
+        # Discrimination between "harmful" and "harmless" inputs is usually strongest
+        # in layers slightly past the midpoint of the layer stack. See the original
+        # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
+        #
+        # Note that we always sample this parameter even though we only need it for
+        # the "global" direction scope. The reason is that multivariate TPE doesn't
+        # work with conditional or variable-range parameters.
+        direction_index = trial.suggest_float(
+            "direction_index",
+            0.4 * (len(model.get_layers()) - 1),
+            0.9 * (len(model.get_layers()) - 1),
+        )
+
+        if direction_scope == "per layer":
             direction_index = None
 
         parameters = {}
@@ -223,10 +237,13 @@ def run():
                 0.6 * (len(model.get_layers()) - 1),
                 len(model.get_layers()) - 1,
             )
+            # For sampling purposes, min_weight is expressed as a fraction of max_weight,
+            # again because multivariate TPE doesn't support variable-range parameters.
+            # The value is transformed into the actual min_weight value below.
             min_weight = trial.suggest_float(
                 f"{component}.min_weight",
                 0.0,
-                max_weight,
+                1.0,
             )
             min_weight_distance = trial.suggest_float(
                 f"{component}.min_weight_distance",
@@ -237,20 +254,20 @@ def run():
             parameters[component] = AbliterationParameters(
                 max_weight=max_weight,
                 max_weight_position=max_weight_position,
-                min_weight=min_weight,
+                min_weight=(min_weight * max_weight),
                 min_weight_distance=min_weight_distance,
             )
 
+        trial.set_user_attr("direction_index", direction_index)
+        trial.set_user_attr("parameters", parameters)
+
         print()
         print(
             f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
         )
         print("* Parameters:")
-        for name, value in trial.params.items():
-            if isinstance(value, float):
-                print(f"  * {name} = [bold]{value:.4f}[/]")
-            else:
-                print(f"  * {name} = [bold]{value}[/]")
+        for name, value in get_trial_parameters(trial).items():
+            print(f"  * {name} = [bold]{value}[/]")
         print("* Reloading model...")
         model.reload_model()
         print("* Abliterating...")
@@ -271,7 +288,6 @@ def run():
 
         trial.set_user_attr("kl_divergence", kl_divergence)
         trial.set_user_attr("refusals", refusals)
-        trial.set_user_attr("parameters", parameters)
 
         # The optimizer searches for a minimum, so we return the negative score.
         return -score
@@ -290,11 +306,8 @@ def run():
         f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
     )
     print("* Parameters:")
-    for name, value in study.best_params.items():
-        if isinstance(value, float):
-            print(f"  * {name} = [bold]{value:.4f}[/]")
-        else:
-            print(f"  * {name} = [bold]{value}[/]")
+    for name, value in get_trial_parameters(study.best_trial).items():
+        print(f"  * {name} = [bold]{value}[/]")
     print("* Results:")
     print(
         f"  * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -312,7 +325,7 @@ def run():
     print("* Abliterating...")
     model.abliterate(
         refusal_directions,
-        study.best_params.get("direction_index", None),
+        study.best_trial.user_attrs["direction_index"],
         study.best_trial.user_attrs["parameters"],
     )
 
diff --git a/src/heretic/utils.py b/src/heretic/utils.py
index 38a054f..a107812 100644
--- a/src/heretic/utils.py
+++ b/src/heretic/utils.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>
 
 import gc
+from dataclasses import asdict
 from importlib.metadata import version
 from typing import TypeVar
 
@@ -61,6 +62,21 @@ def empty_cache():
     gc.collect()
 
 
+def get_trial_parameters(trial: optuna.Trial) -> dict[str, str]:
+    params = {}
+
+    direction_index = trial.user_attrs["direction_index"]
+    params["direction_index"] = (
+        "per layer" if (direction_index is None) else f"{direction_index:.4f}"
+    )
+
+    for component, parameters in trial.user_attrs["parameters"].items():
+        for name, value in asdict(parameters).items():
+            params[f"{component}.{name}"] = f"{value:.4f}"
+
+    return params
+
+
 def get_readme_intro(
     settings: Settings,
     study: optuna.Study,
@@ -84,12 +100,8 @@ def get_readme_intro(
 {
         chr(10).join(
             [
-                (
-                    f"| **{name}** | {value:.4f} |"
-                    if isinstance(value, float)
-                    else f"| **{name}** | {value} |"
-                )
-                for name, value in study.best_params.items()
+                f"| **{name}** | {value} |"
+                for name, value in get_trial_parameters(study.best_trial).items()
             ]
         )
     }