Separate abliteration parameters for different layer components

This commit is contained in:
Philipp Emanuel Weidmann
2025-10-22 12:05:28 +05:30
parent ed65d6902b
commit 6359aa44bb
3 changed files with 131 additions and 122 deletions
+38 -51
View File
@@ -26,7 +26,7 @@ from rich.traceback import install
from .config import Settings from .config import Settings
from .evaluator import Evaluator from .evaluator import Evaluator
from .model import Model from .model import AbliterationParameters, Model
from .utils import get_readme_intro, load_prompts, print from .utils import get_readme_intro, load_prompts, print
@@ -175,13 +175,38 @@ def run():
trial_index += 1 trial_index += 1
trial.set_user_attr("index", trial_index) trial.set_user_attr("index", trial_index)
max_weight = trial.suggest_float("max_weight", 0, 1) parameters = {}
max_weight_position = trial.suggest_float(
"max_weight_position", 0, len(model.get_layers()) - 1 for component in model.get_abliterable_components():
# The parameter ranges are based on experiments with various models
# and much wider ranges. They are not set in stone and might have to be
# adjusted for future models.
max_weight = trial.suggest_float(
f"{component}.max_weight",
0.8,
1.2,
)
max_weight_position = trial.suggest_float(
f"{component}.max_weight_position",
0.6 * (len(model.get_layers()) - 1),
len(model.get_layers()) - 1,
)
min_weight = trial.suggest_float(
f"{component}.min_weight",
0.0,
max_weight,
) )
min_weight = trial.suggest_float("min_weight", 0, max_weight)
min_weight_distance = trial.suggest_float( min_weight_distance = trial.suggest_float(
"min_weight_distance", 1, len(model.get_layers()) - 1 f"{component}.min_weight_distance",
1.0,
0.6 * (len(model.get_layers()) - 1),
)
parameters[component] = AbliterationParameters(
max_weight=max_weight,
max_weight_position=max_weight_position,
min_weight=min_weight,
min_weight_distance=min_weight_distance,
) )
print() print()
@@ -189,50 +214,24 @@ def run():
f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..." f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
) )
print("* Parameters:") print("* Parameters:")
print(f" * max_weight = [bold]{max_weight:.4f}[/]") for name, value in trial.params.items():
print(f" * max_weight_position = [bold]{max_weight_position:.4f}[/]") print(f" * {name} = [bold]{value:.4f}[/]")
print(f" * min_weight = [bold]{min_weight:.4f}[/]")
print(f" * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
print("* Reloading model...") print("* Reloading model...")
model.reload_model() model.reload_model()
print("* Abliterating...") print("* Abliterating...")
model.abliterate( model.abliterate(refusal_directions, parameters)
refusal_directions,
max_weight,
max_weight_position,
min_weight,
min_weight_distance,
)
print("* Evaluating...") print("* Evaluating...")
score, kl_divergence, refusals = evaluator.get_score() score, kl_divergence, refusals = evaluator.get_score()
trial.set_user_attr("kl_divergence", kl_divergence) trial.set_user_attr("kl_divergence", kl_divergence)
trial.set_user_attr("refusals", refusals) trial.set_user_attr("refusals", refusals)
trial.set_user_attr("parameters", parameters)
# The optimizer searches for a minimum, so we return the negative score. # The optimizer searches for a minimum, so we return the negative score.
return -score return -score
study = optuna.create_study() study = optuna.create_study()
# Educated guesses for parameter values to get the optimizer started.
for max_weight, max_weight_position, min_weight, min_weight_distance in [
(0.0, 0.0, 0.0, 0.5),
(1.0, 0.5, 0.0, 0.25),
(0.8, 0.7, 0.3, 0.4),
(0.9, 0.3, 0.1, 0.1),
(1.0, 1.0, 1.0, 1.0),
]:
study.enqueue_trial(
{
"max_weight": max_weight,
"max_weight_position": max_weight_position
* (len(model.get_layers()) - 1),
"min_weight": min_weight,
"min_weight_distance": min_weight_distance
* (len(model.get_layers()) - 1),
}
)
study.optimize(objective, n_trials=settings.n_trials) study.optimize(objective, n_trials=settings.n_trials)
print() print()
@@ -240,14 +239,8 @@ def run():
f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:" f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
) )
print("* Parameters:") print("* Parameters:")
print(f" * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]") for name, value in study.best_params.items():
print( print(f" * {name} = [bold]{value:.4f}[/]")
f" * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
)
print(f" * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
print(
f" * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
)
print("* Results:") print("* Results:")
print( print(
f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]" f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -263,13 +256,7 @@ def run():
print("* Reloading model...") print("* Reloading model...")
model.reload_model() model.reload_model()
print("* Abliterating...") print("* Abliterating...")
model.abliterate( model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
refusal_directions,
study.best_params["max_weight"],
study.best_params["max_weight_position"],
study.best_params["min_weight"],
study.best_params["min_weight_distance"],
)
while True: while True:
print() print()
+41 -27
View File
@@ -2,6 +2,7 @@
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com> # Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
from contextlib import suppress from contextlib import suppress
from dataclasses import dataclass
from typing import Any from typing import Any
import torch import torch
@@ -21,6 +22,14 @@ from .config import Settings
from .utils import batchify, empty_cache, print from .utils import batchify, empty_cache, print
@dataclass
class AbliterationParameters:
max_weight: float
max_weight_position: float
min_weight: float
min_weight_distance: float
class Model: class Model:
def __init__(self, settings: Settings): def __init__(self, settings: Settings):
self.settings = settings self.settings = settings
@@ -61,8 +70,10 @@ class Model:
raise Exception("Failed to load model with all configured dtypes.") raise Exception("Failed to load model with all configured dtypes.")
print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers") print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
print("* Abliterable components:")
for component, matrices in self.get_layer_matrices(0).items():
print( print(
f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer" f" * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
) )
def reload_model(self): def reload_model(self):
@@ -86,71 +97,74 @@ class Model:
# Text-only models. # Text-only models.
return self.model.model.layers return self.model.model.layers
def get_layer_matrices(self, layer_index: int) -> list[torch.Tensor]: def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
layer = self.get_layers()[layer_index] layer = self.get_layers()[layer_index]
matrices = [] matrices = {}
def try_add(matrix: Any): def try_add(component: str, matrix: Any):
assert torch.is_tensor(matrix) assert torch.is_tensor(matrix)
matrices.append(matrix)
if component not in matrices:
matrices[component] = []
matrices[component].append(matrix)
# Exceptions aren't suppressed here, because there is currently
# no alternative location for the attention out-projection.
try_add("attn.o_proj", layer.self_attn.o_proj.weight)
# Most dense models. # Most dense models.
if not matrices:
with suppress(Exception): with suppress(Exception):
try_add(layer.mlp.down_proj.weight) try_add("mlp.down_proj", layer.mlp.down_proj.weight)
# Some MoE models (e.g. Qwen3). # Some MoE models (e.g. Qwen3).
if not matrices:
with suppress(Exception): with suppress(Exception):
for expert in layer.mlp.experts: for expert in layer.mlp.experts:
try_add(expert.down_proj.weight) try_add("mlp.down_proj", expert.down_proj.weight)
# Phi-3.5-MoE (and possibly others). # Phi-3.5-MoE (and possibly others).
if not matrices:
with suppress(Exception): with suppress(Exception):
for expert in layer.block_sparse_moe.experts: for expert in layer.block_sparse_moe.experts:
try_add(expert.w2.weight) try_add("mlp.down_proj", expert.w2.weight)
# gpt-oss MoE. # gpt-oss MoE.
if not matrices:
with suppress(Exception): with suppress(Exception):
# The implementation of gpt-oss in Transformers differs from many other MoE models # The implementation of gpt-oss in Transformers differs from many other MoE models
# in that it stores the down-projections for all experts in a single 3D tensor, # in that it stores the down-projections for all experts in a single 3D tensor,
# but thanks to PyTorch's broadcasting magic, it all just works anyway. # but thanks to PyTorch's broadcasting magic, it all just works anyway.
try_add(layer.mlp.experts.down_proj) try_add("mlp.down_proj", layer.mlp.experts.down_proj)
# We need at least one MLP down-projection. # We need at least one MLP down-projection.
assert matrices assert matrices["mlp.down_proj"]
# Exceptions aren't suppressed here, because there is currently
# no alternative location for the attention out-projection.
try_add(layer.self_attn.o_proj.weight)
return matrices return matrices
def get_abliterable_components(self) -> list[str]:
return list(self.get_layer_matrices(0).keys())
def abliterate( def abliterate(
self, self,
refusal_directions: torch.Tensor, refusal_directions: torch.Tensor,
max_weight: float, parameters: dict[str, AbliterationParameters],
max_weight_position: float,
min_weight: float,
min_weight_distance: float,
): ):
# Note that some implementations of abliteration also orthogonalize # Note that some implementations of abliteration also orthogonalize
# the embedding matrix, but it's unclear if that has any benefits. # the embedding matrix, but it's unclear if that has any benefits.
for layer_index in range(len(self.get_layers())): for layer_index in range(len(self.get_layers())):
distance = abs(layer_index - max_weight_position) for component, matrices in self.get_layer_matrices(layer_index).items():
params = parameters[component]
distance = abs(layer_index - params.max_weight_position)
# Don't orthogonalize layers that are more than # Don't orthogonalize layers that are more than
# min_weight_distance away from max_weight_position. # min_weight_distance away from max_weight_position.
if distance > min_weight_distance: if distance > params.min_weight_distance:
continue continue
# Interpolate linearly between max_weight and min_weight # Interpolate linearly between max_weight and min_weight
# over min_weight_distance. # over min_weight_distance.
weight = max_weight + (distance / min_weight_distance) * ( weight = params.max_weight + (distance / params.min_weight_distance) * (
min_weight - max_weight params.min_weight - params.max_weight
) )
# The index must be shifted by 1 because the first element # The index must be shifted by 1 because the first element
@@ -161,7 +175,7 @@ class Model:
# spanned by the refusal direction. # spanned by the refusal direction.
projector = torch.outer(refusal_direction, refusal_direction) projector = torch.outer(refusal_direction, refusal_direction)
for matrix in self.get_layer_matrices(layer_index): for matrix in matrices:
# In-place subtraction is safe as we're not using Autograd. # In-place subtraction is safe as we're not using Autograd.
matrix.sub_(weight * (projector @ matrix)) matrix.sub_(weight * (projector @ matrix))
+17 -9
View File
@@ -54,27 +54,35 @@ def get_readme_intro(
base_refusals: int, base_refusals: int,
bad_prompts: list[str], bad_prompts: list[str],
) -> str: ) -> str:
model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
refusal_percentage = ( refusal_percentage = (
study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100 study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
) )
base_refusal_percentage = base_refusals / len(bad_prompts) * 100 base_refusal_percentage = base_refusals / len(bad_prompts) * 100
return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")} return f"""# This is a decensored version of {
model_link
}, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
## Abliteration parameters ## Abliteration parameters
| Parameter | Value | | Parameter | Value |
| :---------------------- | :--------------------------------------------: | | :-------- | :---: |
| **max_weight** | {study.best_params["max_weight"]:.4f} | {
| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} | chr(10).join(
| **min_weight** | {study.best_params["min_weight"]:.4f} | [
| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} | f"| **{name}** | {value:.4f} |"
for name, value in study.best_params.items()
]
)
}
## Performance ## Performance
| Metric | This model | Original model ([{settings.model}](https://huggingface.co/{settings.model})) | | Metric | This model | Original model ({model_link}) |
| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: | | :----- | :--------: | :---------------------------: |
| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* | | **KL divergence** | {
study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % | | **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
----- -----