From 1496e0a04c3a782d34e1b4be1f637c871c6923be Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Fri, 31 Oct 2025 13:04:45 +0530 Subject: [PATCH] Dynamically choose between global and per-layer refusal directions --- src/heretic/main.py | 40 +++++++++++++++++++++++++++++++++++----- src/heretic/model.py | 27 +++++++++++++++++++++++---- src/heretic/utils.py | 6 +++++- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/heretic/main.py b/src/heretic/main.py index 98da8dc..b96aaeb 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -185,6 +185,26 @@ def run(): trial_index += 1 trial.set_user_attr("index", trial_index) + direction_scope = trial.suggest_categorical( + "direction_scope", + [ + "global", + "per layer", + ], + ) + + if direction_scope == "global": + # Discrimination between "harmful" and "harmless" inputs is usually strongest + # in layers slightly past the midpoint of the layer stack. See the original + # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis. + direction_index = trial.suggest_float( + "direction_index", + 0.4 * (len(model.get_layers()) - 1), + 0.9 * (len(model.get_layers()) - 1), + ) + else: + direction_index = None + parameters = {} for component in model.get_abliterable_components(): @@ -194,7 +214,7 @@ def run(): max_weight = trial.suggest_float( f"{component}.max_weight", 0.8, - 1.2, + 1.5, ) max_weight_position = trial.suggest_float( f"{component}.max_weight_position", @@ -225,11 +245,14 @@ def run(): ) print("* Parameters:") for name, value in trial.params.items(): - print(f" * {name} = [bold]{value:.4f}[/]") + if isinstance(value, float): + print(f" * {name} = [bold]{value:.4f}[/]") + else: + print(f" * {name} = [bold]{value}[/]") print("* Reloading model...") model.reload_model() print("* Abliterating...") - model.abliterate(refusal_directions, parameters) + model.abliterate(refusal_directions, direction_index, parameters) print("* Evaluating...") score, kl_divergence, refusals = evaluator.get_score() @@ -261,7 +284,10 @@ def run(): ) print("* Parameters:") for name, value in study.best_params.items(): - print(f" * {name} = [bold]{value:.4f}[/]") + if isinstance(value, float): + print(f" * {name} = [bold]{value:.4f}[/]") + else: + print(f" * {name} = [bold]{value}[/]") print("* Results:") print( f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]" @@ -277,7 +303,11 @@ def run(): print("* Reloading model...") model.reload_model() print("* Abliterating...") - model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"]) + model.abliterate( + refusal_directions, + study.best_params.get("direction_index", None), + study.best_trial.user_attrs["parameters"], + ) while True: print() diff --git a/src/heretic/model.py b/src/heretic/model.py index 2042172..6419550 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # Copyright (C) 2025 Philipp Emanuel Weidmann +import math from contextlib import suppress from dataclasses import dataclass from typing import Any @@ -146,8 +147,20 @@ class Model: def abliterate( self, refusal_directions: torch.Tensor, + direction_index: float | None, parameters: dict[str, AbliterationParameters], ): + if direction_index is None: + refusal_direction = None + else: + # The index must be shifted by 1 because the first element + # of refusal_directions is the direction for the embeddings. + weight, index = math.modf(direction_index + 1) + refusal_direction = refusal_directions[int(index)].lerp( + refusal_directions[int(index) + 1], + weight, + ) + # Note that some implementations of abliteration also orthogonalize # the embedding matrix, but it's unclear if that has any benefits. for layer_index in range(len(self.get_layers())): @@ -167,13 +180,19 @@ class Model: params.min_weight - params.max_weight ) - # The index must be shifted by 1 because the first element - # of refusal_directions is the direction for the embeddings. - refusal_direction = refusal_directions[layer_index + 1] + if refusal_direction is None: + # The index must be shifted by 1 because the first element + # of refusal_directions is the direction for the embeddings. + layer_refusal_direction = refusal_directions[layer_index + 1] + else: + layer_refusal_direction = refusal_direction # Projects any right-multiplied vector(s) onto the subspace # spanned by the refusal direction. - projector = torch.outer(refusal_direction, refusal_direction) + projector = torch.outer( + layer_refusal_direction, + layer_refusal_direction, + ) for matrix in matrices: # In-place subtraction is safe as we're not using Autograd. diff --git a/src/heretic/utils.py b/src/heretic/utils.py index 6cb07fe..38a054f 100644 --- a/src/heretic/utils.py +++ b/src/heretic/utils.py @@ -84,7 +84,11 @@ def get_readme_intro( { chr(10).join( [ - f"| **{name}** | {value:.4f} |" + ( + f"| **{name}** | {value:.4f} |" + if isinstance(value, float) + else f"| **{name}** | {value} |" + ) for name, value in study.best_params.items() ] )