Separate abliteration parameters for different layer components

This commit is contained in:
Philipp Emanuel Weidmann
2025-10-22 12:05:28 +05:30
parent ed65d6902b
commit 6359aa44bb
3 changed files with 131 additions and 122 deletions
+41 -54
View File
@@ -26,7 +26,7 @@ from rich.traceback import install
from .config import Settings
from .evaluator import Evaluator
from .model import Model
from .model import AbliterationParameters, Model
from .utils import get_readme_intro, load_prompts, print
@@ -175,64 +175,63 @@ def run():
trial_index += 1
trial.set_user_attr("index", trial_index)
max_weight = trial.suggest_float("max_weight", 0, 1)
max_weight_position = trial.suggest_float(
"max_weight_position", 0, len(model.get_layers()) - 1
)
min_weight = trial.suggest_float("min_weight", 0, max_weight)
min_weight_distance = trial.suggest_float(
"min_weight_distance", 1, len(model.get_layers()) - 1
)
parameters = {}
for component in model.get_abliterable_components():
# The parameter ranges are based on experiments with various models
# and much wider ranges. They are not set in stone and might have to be
# adjusted for future models.
max_weight = trial.suggest_float(
f"{component}.max_weight",
0.8,
1.2,
)
max_weight_position = trial.suggest_float(
f"{component}.max_weight_position",
0.6 * (len(model.get_layers()) - 1),
len(model.get_layers()) - 1,
)
min_weight = trial.suggest_float(
f"{component}.min_weight",
0.0,
max_weight,
)
min_weight_distance = trial.suggest_float(
f"{component}.min_weight_distance",
1.0,
0.6 * (len(model.get_layers()) - 1),
)
parameters[component] = AbliterationParameters(
max_weight=max_weight,
max_weight_position=max_weight_position,
min_weight=min_weight,
min_weight_distance=min_weight_distance,
)
print()
print(
f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
)
print("* Parameters:")
print(f" * max_weight = [bold]{max_weight:.4f}[/]")
print(f" * max_weight_position = [bold]{max_weight_position:.4f}[/]")
print(f" * min_weight = [bold]{min_weight:.4f}[/]")
print(f" * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
for name, value in trial.params.items():
print(f" * {name} = [bold]{value:.4f}[/]")
print("* Reloading model...")
model.reload_model()
print("* Abliterating...")
model.abliterate(
refusal_directions,
max_weight,
max_weight_position,
min_weight,
min_weight_distance,
)
model.abliterate(refusal_directions, parameters)
print("* Evaluating...")
score, kl_divergence, refusals = evaluator.get_score()
trial.set_user_attr("kl_divergence", kl_divergence)
trial.set_user_attr("refusals", refusals)
trial.set_user_attr("parameters", parameters)
# The optimizer searches for a minimum, so we return the negative score.
return -score
study = optuna.create_study()
# Educated guesses for parameter values to get the optimizer started.
for max_weight, max_weight_position, min_weight, min_weight_distance in [
(0.0, 0.0, 0.0, 0.5),
(1.0, 0.5, 0.0, 0.25),
(0.8, 0.7, 0.3, 0.4),
(0.9, 0.3, 0.1, 0.1),
(1.0, 1.0, 1.0, 1.0),
]:
study.enqueue_trial(
{
"max_weight": max_weight,
"max_weight_position": max_weight_position
* (len(model.get_layers()) - 1),
"min_weight": min_weight,
"min_weight_distance": min_weight_distance
* (len(model.get_layers()) - 1),
}
)
study.optimize(objective, n_trials=settings.n_trials)
print()
@@ -240,14 +239,8 @@ def run():
f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
)
print("* Parameters:")
print(f" * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]")
print(
f" * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
)
print(f" * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
print(
f" * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
)
for name, value in study.best_params.items():
print(f" * {name} = [bold]{value:.4f}[/]")
print("* Results:")
print(
f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -263,13 +256,7 @@ def run():
print("* Reloading model...")
model.reload_model()
print("* Abliterating...")
model.abliterate(
refusal_directions,
study.best_params["max_weight"],
study.best_params["max_weight_position"],
study.best_params["min_weight"],
study.best_params["min_weight_distance"],
)
model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
while True:
print()
+71 -57
View File
@@ -2,6 +2,7 @@
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
from contextlib import suppress
from dataclasses import dataclass
from typing import Any
import torch
@@ -21,6 +22,14 @@ from .config import Settings
from .utils import batchify, empty_cache, print
@dataclass
class AbliterationParameters:
    """Abliteration weight profile for a single layer component.

    Describes how strongly the refusal direction is removed from the
    matrices of one component, as a function of the layer index.
    """

    # Weight applied at the layer located at max_weight_position.
    max_weight: float

    # Position of the peak weight, in units of layer indices.
    # It is a float, so it may fall between two layers.
    max_weight_position: float

    # Weight reached at min_weight_distance layers away from the peak.
    min_weight: float

    # Distance (in layers) over which the weight is interpolated linearly
    # from max_weight to min_weight. Layers farther away are left untouched.
    min_weight_distance: float
class Model:
def __init__(self, settings: Settings):
self.settings = settings
@@ -61,9 +70,11 @@ class Model:
raise Exception("Failed to load model with all configured dtypes.")
print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
print(
f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer"
)
print("* Abliterable components:")
for component, matrices in self.get_layer_matrices(0).items():
print(
f" * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
)
def reload_model(self):
dtype = self.model.dtype
@@ -86,84 +97,87 @@ class Model:
# Text-only models.
return self.model.model.layers
def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
    """Collect the abliterable weight tensors of one layer, grouped by component.

    Args:
        layer_index: Index into the sequence returned by ``get_layers()``.

    Returns:
        A dict mapping a component name (``"attn.o_proj"``, ``"mlp.down_proj"``)
        to the list of tensors found for that component in the layer.

    Raises:
        AssertionError: If no MLP down-projection could be located, or if the
            attention out-projection weight is not a tensor.
        AttributeError: If the layer has no ``self_attn.o_proj.weight``.
    """
    layer = self.get_layers()[layer_index]

    matrices: dict[str, list[torch.Tensor]] = {}

    def try_add(component: str, matrix: Any):
        # Raised explicitly instead of using an assert statement: the probing
        # below relies on this failure to reject non-tensor attributes, and
        # assert statements are removed when Python runs with -O.
        if not torch.is_tensor(matrix):
            raise AssertionError(
                f"Expected a tensor for component '{component}', "
                f"got {type(matrix).__name__}."
            )

        matrices.setdefault(component, []).append(matrix)

    # Exceptions aren't suppressed here, because there is currently
    # no alternative location for the attention out-projection.
    try_add("attn.o_proj", layer.self_attn.o_proj.weight)

    # Each probe below targets one known module layout. A layout that is
    # not present fails with an exception, which is deliberately ignored.

    # Most dense models.
    with suppress(Exception):
        try_add("mlp.down_proj", layer.mlp.down_proj.weight)

    # Some MoE models (e.g. Qwen3).
    with suppress(Exception):
        for expert in layer.mlp.experts:
            try_add("mlp.down_proj", expert.down_proj.weight)

    # Phi-3.5-MoE (and possibly others).
    with suppress(Exception):
        for expert in layer.block_sparse_moe.experts:
            try_add("mlp.down_proj", expert.w2.weight)

    # gpt-oss MoE.
    with suppress(Exception):
        # The implementation of gpt-oss in Transformers differs from many other MoE models
        # in that it stores the down-projections for all experts in a single 3D tensor,
        # but thanks to PyTorch's broadcasting magic, it all just works anyway.
        try_add("mlp.down_proj", layer.mlp.experts.down_proj)

    # We need at least one MLP down-projection. Use .get() so that a layer
    # matching none of the layouts reports this error rather than a KeyError.
    if not matrices.get("mlp.down_proj"):
        raise AssertionError(
            f"No MLP down-projection found in layer {layer_index}."
        )

    return matrices
def get_abliterable_components(self) -> list[str]:
    """Return the component names reported by ``get_layer_matrices`` for layer 0."""
    first_layer_matrices = self.get_layer_matrices(0)
    # Iterating a dict yields its keys in insertion order.
    return [component for component in first_layer_matrices]
def abliterate(
    self,
    refusal_directions: torch.Tensor,
    parameters: dict[str, AbliterationParameters],
):
    """Subtract the refusal direction from the model's matrices, in place.

    For every layer and every component returned by ``get_layer_matrices``,
    the component's matrices are modified by
    ``matrix -= weight * (projector @ matrix)``, where ``projector`` is the
    outer product of the layer's refusal direction with itself and ``weight``
    follows the component's parameters.

    Args:
        refusal_directions: Tensor indexed by ``layer_index + 1``; element 0
            is the direction for the embeddings and is not used here.
        parameters: Abliteration parameters per component name. Must contain
            an entry for every component of every layer.

    Raises:
        KeyError: If ``parameters`` lacks an entry for a component.
    """
    # Note that some implementations of abliteration also orthogonalize
    # the embedding matrix, but it's unclear if that has any benefits.
    for layer_index in range(len(self.get_layers())):
        # The projector depends only on the layer, not on the component,
        # so it is built at most once per layer and only if it is needed.
        projector = None

        for component, matrices in self.get_layer_matrices(layer_index).items():
            params = parameters[component]

            distance = abs(layer_index - params.max_weight_position)

            # Don't orthogonalize layers that are more than
            # min_weight_distance away from max_weight_position.
            if distance > params.min_weight_distance:
                continue

            # Interpolate linearly between max_weight and min_weight
            # over min_weight_distance. A zero min_weight_distance can only
            # get here with a zero distance; use max_weight in that case
            # rather than dividing zero by zero.
            if params.min_weight_distance > 0:
                fraction = distance / params.min_weight_distance
            else:
                fraction = 0.0

            weight = params.max_weight + fraction * (
                params.min_weight - params.max_weight
            )

            if projector is None:
                # The index must be shifted by 1 because the first element
                # of refusal_directions is the direction for the embeddings.
                refusal_direction = refusal_directions[layer_index + 1]

                # Projects any right-multiplied vector(s) onto the subspace
                # spanned by the refusal direction.
                projector = torch.outer(refusal_direction, refusal_direction)

            for matrix in matrices:
                # In-place subtraction is safe as we're not using Autograd.
                matrix.sub_(weight * (projector @ matrix))
def get_chat(self, prompt: str) -> list[dict[str, str]]:
return [
+19 -11
View File
@@ -54,28 +54,36 @@ def get_readme_intro(
base_refusals: int,
bad_prompts: list[str],
) -> str:
model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
refusal_percentage = (
study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
)
base_refusal_percentage = base_refusals / len(bad_prompts) * 100
return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
return f"""# This is a decensored version of {
model_link
}, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
## Abliteration parameters
| Parameter | Value |
| :---------------------- | :--------------------------------------------: |
| **max_weight** | {study.best_params["max_weight"]:.4f} |
| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} |
| **min_weight** | {study.best_params["min_weight"]:.4f} |
| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} |
| Parameter | Value |
| :-------- | :---: |
{
chr(10).join(
[
f"| **{name}** | {value:.4f} |"
for name, value in study.best_params.items()
]
)
}
## Performance
| Metric | This model | Original model ([{settings.model}](https://huggingface.co/{settings.model})) |
| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: |
| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
| Metric | This model | Original model ({model_link}) |
| :----- | :--------: | :---------------------------: |
| **KL divergence** | {
study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
-----