From 1496e0a04c3a782d34e1b4be1f637c871c6923be Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Fri, 31 Oct 2025 13:04:45 +0530
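Subject: [PATCH] Dynamically choose between global and per-layer refusal
 directions

Add a categorical "direction_scope" trial parameter. With "global", a
fractional "direction_index" is sampled between 0.4 and 0.9 of the last
layer index, and a single refusal direction, linearly interpolated between
the two adjacent per-layer directions, is applied to every layer. With
"per layer", each layer keeps using its own refusal direction as before.

Also raise the upper bound of the max_weight search range from 1.2 to 1.5,
and print/render non-float parameter values without float formatting.

---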
 src/heretic/main.py  | 40 +++++++++++++++++++++++++++++++++++-----
 src/heretic/model.py | 27 +++++++++++++++++++++++----
 src/heretic/utils.py |  6 +++++-
 3 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index 98da8dc..b96aaeb 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -185,6 +185,26 @@ def run():
         trial_index += 1
         trial.set_user_attr("index", trial_index)
 
+        direction_scope = trial.suggest_categorical(
+            "direction_scope",
+            [
+                "global",
+                "per layer",
+            ],
+        )
+
+        if direction_scope == "global":
+            # Discrimination between "harmful" and "harmless" inputs is usually strongest
+            # in layers slightly past the midpoint of the layer stack. See the original
+            # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
+            direction_index = trial.suggest_float(
+                "direction_index",
+                0.4 * (len(model.get_layers()) - 1),
+                0.9 * (len(model.get_layers()) - 1),
+            )
+        else:
+            direction_index = None
+
         parameters = {}
 
         for component in model.get_abliterable_components():
@@ -194,7 +214,7 @@ def run():
             max_weight = trial.suggest_float(
                 f"{component}.max_weight",
                 0.8,
-                1.2,
+                1.5,
             )
             max_weight_position = trial.suggest_float(
                 f"{component}.max_weight_position",
@@ -225,11 +245,14 @@ def run():
         )
         print("* Parameters:")
         for name, value in trial.params.items():
-            print(f"  * {name} = [bold]{value:.4f}[/]")
+            if isinstance(value, float):
+                print(f"  * {name} = [bold]{value:.4f}[/]")
+            else:
+                print(f"  * {name} = [bold]{value}[/]")
         print("* Reloading model...")
         model.reload_model()
         print("* Abliterating...")
-        model.abliterate(refusal_directions, parameters)
+        model.abliterate(refusal_directions, direction_index, parameters)
         print("* Evaluating...")
         score, kl_divergence, refusals = evaluator.get_score()
 
@@ -261,7 +284,10 @@ def run():
     )
     print("* Parameters:")
     for name, value in study.best_params.items():
-        print(f"  * {name} = [bold]{value:.4f}[/]")
+        if isinstance(value, float):
+            print(f"  * {name} = [bold]{value:.4f}[/]")
+        else:
+            print(f"  * {name} = [bold]{value}[/]")
     print("* Results:")
     print(
         f"  * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -277,7 +303,11 @@ def run():
     print("* Reloading model...")
     model.reload_model()
     print("* Abliterating...")
-    model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
+    model.abliterate(
+        refusal_directions,
+        study.best_params.get("direction_index", None),
+        study.best_trial.user_attrs["parameters"],
+    )
 
     while True:
         print()
diff --git a/src/heretic/model.py b/src/heretic/model.py
index 2042172..6419550 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>
 
+import math
 from contextlib import suppress
 from dataclasses import dataclass
 from typing import Any
@@ -146,8 +147,20 @@ class Model:
     def abliterate(
         self,
         refusal_directions: torch.Tensor,
+        direction_index: float | None,
         parameters: dict[str, AbliterationParameters],
     ):
+        if direction_index is None:
+            refusal_direction = None
+        else:
+            # The index must be shifted by 1 because the first element
+            # of refusal_directions is the direction for the embeddings.
+            weight, index = math.modf(direction_index + 1)
+            refusal_direction = refusal_directions[int(index)].lerp(
+                refusal_directions[int(index) + 1],
+                weight,
+            )
+
         # Note that some implementations of abliteration also orthogonalize
         # the embedding matrix, but it's unclear if that has any benefits.
         for layer_index in range(len(self.get_layers())):
@@ -167,13 +180,19 @@ class Model:
                     params.min_weight - params.max_weight
                 )
 
-                # The index must be shifted by 1 because the first element
-                # of refusal_directions is the direction for the embeddings.
-                refusal_direction = refusal_directions[layer_index + 1]
+                if refusal_direction is None:
+                    # The index must be shifted by 1 because the first element
+                    # of refusal_directions is the direction for the embeddings.
+                    layer_refusal_direction = refusal_directions[layer_index + 1]
+                else:
+                    layer_refusal_direction = refusal_direction
 
                 # Projects any right-multiplied vector(s) onto the subspace
                 # spanned by the refusal direction.
-                projector = torch.outer(refusal_direction, refusal_direction)
+                projector = torch.outer(
+                    layer_refusal_direction,
+                    layer_refusal_direction,
+                )
 
                 for matrix in matrices:
                     # In-place subtraction is safe as we're not using Autograd.
diff --git a/src/heretic/utils.py b/src/heretic/utils.py
index 6cb07fe..38a054f 100644
--- a/src/heretic/utils.py
+++ b/src/heretic/utils.py
@@ -84,7 +84,11 @@ def get_readme_intro(
 {
         chr(10).join(
             [
-                f"| **{name}** | {value:.4f} |"
+                (
+                    f"| **{name}** | {value:.4f} |"
+                    if isinstance(value, float)
+                    else f"| **{name}** | {value} |"
+                )
                 for name, value in study.best_params.items()
             ]
         )