Separate abliteration parameters for different layer components
This commit is contained in:
+41
-54
@@ -26,7 +26,7 @@ from rich.traceback import install
|
||||
|
||||
from .config import Settings
|
||||
from .evaluator import Evaluator
|
||||
from .model import Model
|
||||
from .model import AbliterationParameters, Model
|
||||
from .utils import get_readme_intro, load_prompts, print
|
||||
|
||||
|
||||
@@ -175,64 +175,63 @@ def run():
|
||||
trial_index += 1
|
||||
trial.set_user_attr("index", trial_index)
|
||||
|
||||
max_weight = trial.suggest_float("max_weight", 0, 1)
|
||||
max_weight_position = trial.suggest_float(
|
||||
"max_weight_position", 0, len(model.get_layers()) - 1
|
||||
)
|
||||
min_weight = trial.suggest_float("min_weight", 0, max_weight)
|
||||
min_weight_distance = trial.suggest_float(
|
||||
"min_weight_distance", 1, len(model.get_layers()) - 1
|
||||
)
|
||||
parameters = {}
|
||||
|
||||
for component in model.get_abliterable_components():
|
||||
# The parameter ranges are based on experiments with various models
|
||||
# and much wider ranges. They are not set in stone and might have to be
|
||||
# adjusted for future models.
|
||||
max_weight = trial.suggest_float(
|
||||
f"{component}.max_weight",
|
||||
0.8,
|
||||
1.2,
|
||||
)
|
||||
max_weight_position = trial.suggest_float(
|
||||
f"{component}.max_weight_position",
|
||||
0.6 * (len(model.get_layers()) - 1),
|
||||
len(model.get_layers()) - 1,
|
||||
)
|
||||
min_weight = trial.suggest_float(
|
||||
f"{component}.min_weight",
|
||||
0.0,
|
||||
max_weight,
|
||||
)
|
||||
min_weight_distance = trial.suggest_float(
|
||||
f"{component}.min_weight_distance",
|
||||
1.0,
|
||||
0.6 * (len(model.get_layers()) - 1),
|
||||
)
|
||||
|
||||
parameters[component] = AbliterationParameters(
|
||||
max_weight=max_weight,
|
||||
max_weight_position=max_weight_position,
|
||||
min_weight=min_weight,
|
||||
min_weight_distance=min_weight_distance,
|
||||
)
|
||||
|
||||
print()
|
||||
print(
|
||||
f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
|
||||
)
|
||||
print("* Parameters:")
|
||||
print(f" * max_weight = [bold]{max_weight:.4f}[/]")
|
||||
print(f" * max_weight_position = [bold]{max_weight_position:.4f}[/]")
|
||||
print(f" * min_weight = [bold]{min_weight:.4f}[/]")
|
||||
print(f" * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
|
||||
for name, value in trial.params.items():
|
||||
print(f" * {name} = [bold]{value:.4f}[/]")
|
||||
print("* Reloading model...")
|
||||
model.reload_model()
|
||||
print("* Abliterating...")
|
||||
model.abliterate(
|
||||
refusal_directions,
|
||||
max_weight,
|
||||
max_weight_position,
|
||||
min_weight,
|
||||
min_weight_distance,
|
||||
)
|
||||
model.abliterate(refusal_directions, parameters)
|
||||
print("* Evaluating...")
|
||||
score, kl_divergence, refusals = evaluator.get_score()
|
||||
|
||||
trial.set_user_attr("kl_divergence", kl_divergence)
|
||||
trial.set_user_attr("refusals", refusals)
|
||||
trial.set_user_attr("parameters", parameters)
|
||||
|
||||
# The optimizer searches for a minimum, so we return the negative score.
|
||||
return -score
|
||||
|
||||
study = optuna.create_study()
|
||||
|
||||
# Educated guesses for parameter values to get the optimizer started.
|
||||
for max_weight, max_weight_position, min_weight, min_weight_distance in [
|
||||
(0.0, 0.0, 0.0, 0.5),
|
||||
(1.0, 0.5, 0.0, 0.25),
|
||||
(0.8, 0.7, 0.3, 0.4),
|
||||
(0.9, 0.3, 0.1, 0.1),
|
||||
(1.0, 1.0, 1.0, 1.0),
|
||||
]:
|
||||
study.enqueue_trial(
|
||||
{
|
||||
"max_weight": max_weight,
|
||||
"max_weight_position": max_weight_position
|
||||
* (len(model.get_layers()) - 1),
|
||||
"min_weight": min_weight,
|
||||
"min_weight_distance": min_weight_distance
|
||||
* (len(model.get_layers()) - 1),
|
||||
}
|
||||
)
|
||||
|
||||
study.optimize(objective, n_trials=settings.n_trials)
|
||||
|
||||
print()
|
||||
@@ -240,14 +239,8 @@ def run():
|
||||
f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
|
||||
)
|
||||
print("* Parameters:")
|
||||
print(f" * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]")
|
||||
print(
|
||||
f" * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
|
||||
)
|
||||
print(f" * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
|
||||
print(
|
||||
f" * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
|
||||
)
|
||||
for name, value in study.best_params.items():
|
||||
print(f" * {name} = [bold]{value:.4f}[/]")
|
||||
print("* Results:")
|
||||
print(
|
||||
f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
|
||||
@@ -263,13 +256,7 @@ def run():
|
||||
print("* Reloading model...")
|
||||
model.reload_model()
|
||||
print("* Abliterating...")
|
||||
model.abliterate(
|
||||
refusal_directions,
|
||||
study.best_params["max_weight"],
|
||||
study.best_params["max_weight_position"],
|
||||
study.best_params["min_weight"],
|
||||
study.best_params["min_weight_distance"],
|
||||
)
|
||||
model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
|
||||
|
||||
while True:
|
||||
print()
|
||||
|
||||
+71
-57
@@ -2,6 +2,7 @@
|
||||
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
|
||||
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
@@ -21,6 +22,14 @@ from .config import Settings
|
||||
from .utils import batchify, empty_cache, print
|
||||
|
||||
|
||||
@dataclass
class AbliterationParameters:
    """Per-component weight profile used when abliterating a model.

    The abliteration weight peaks at ``max_weight_position`` (a possibly
    fractional layer index) and is interpolated linearly down to
    ``min_weight`` at a distance of ``min_weight_distance`` layers from
    that position. Layers farther away than that are left untouched.
    """

    # Weight applied at the layer located exactly at max_weight_position.
    max_weight: float

    # Layer index (may be fractional) at which max_weight applies.
    max_weight_position: float

    # Weight reached min_weight_distance layers away from the peak.
    min_weight: float

    # Distance, in layers, over which the weight is interpolated;
    # layers beyond this distance are skipped.
    min_weight_distance: float
|
||||
|
||||
|
||||
class Model:
|
||||
def __init__(self, settings: Settings):
|
||||
self.settings = settings
|
||||
@@ -61,9 +70,11 @@ class Model:
|
||||
raise Exception("Failed to load model with all configured dtypes.")
|
||||
|
||||
print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
|
||||
print(
|
||||
f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer"
|
||||
)
|
||||
print("* Abliterable components:")
|
||||
for component, matrices in self.get_layer_matrices(0).items():
|
||||
print(
|
||||
f" * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
|
||||
)
|
||||
|
||||
def reload_model(self):
|
||||
dtype = self.model.dtype
|
||||
@@ -86,84 +97,87 @@ class Model:
|
||||
# Text-only models.
|
||||
return self.model.model.layers
|
||||
|
||||
def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
    """Collect the abliterable weight matrices of one transformer layer.

    Args:
        layer_index: Index into the sequence returned by ``get_layers()``.

    Returns:
        A dict mapping a component name (``"attn.o_proj"`` or
        ``"mlp.down_proj"``) to the list of tensors found for that
        component in the layer.

    Raises:
        AttributeError: If the layer has no ``self_attn.o_proj.weight``.
        AssertionError: If the attention out-projection is not a tensor,
            or if no MLP down-projection could be located.
    """
    layer = self.get_layers()[layer_index]

    matrices: dict[str, list[torch.Tensor]] = {}

    def try_add(component: str, matrix: Any):
        # Validate before touching the dict, so a failed lookup
        # never leaves an empty list behind for the component.
        assert torch.is_tensor(matrix)
        matrices.setdefault(component, []).append(matrix)

    # Exceptions aren't suppressed here, because there is currently
    # no alternative location for the attention out-projection.
    try_add("attn.o_proj", layer.self_attn.o_proj.weight)

    # The lookups below are best-effort probes for different model
    # architectures; a probe that doesn't match the layer's structure
    # raises, and is silently skipped.

    # Most dense models.
    with suppress(Exception):
        try_add("mlp.down_proj", layer.mlp.down_proj.weight)

    # Some MoE models (e.g. Qwen3).
    with suppress(Exception):
        for expert in layer.mlp.experts:
            try_add("mlp.down_proj", expert.down_proj.weight)

    # Phi-3.5-MoE (and possibly others).
    with suppress(Exception):
        for expert in layer.block_sparse_moe.experts:
            try_add("mlp.down_proj", expert.w2.weight)

    # gpt-oss MoE.
    with suppress(Exception):
        # The implementation of gpt-oss in Transformers differs from many other MoE models
        # in that it stores the down-projections for all experts in a single 3D tensor,
        # but thanks to PyTorch's broadcasting magic, it all just works anyway.
        try_add("mlp.down_proj", layer.mlp.experts.down_proj)

    # We need at least one MLP down-projection. The key only exists if one
    # of the probes above succeeded, so use .get() to make a missing
    # component fail the assertion rather than raise KeyError.
    assert matrices.get("mlp.down_proj"), (
        "No MLP down-projection found in layer."
    )

    return matrices
|
||||
|
||||
def get_abliterable_components(self) -> list[str]:
    """Return the component names found in the first layer.

    The names are the keys of ``get_layer_matrices(0)``, in the order
    in which that mapping yields them.
    """
    first_layer_matrices = self.get_layer_matrices(0)
    # Iterating a dict yields its keys in insertion order.
    return [component for component in first_layer_matrices]
|
||||
|
||||
def abliterate(
    self,
    refusal_directions: torch.Tensor,
    parameters: dict[str, "AbliterationParameters"],
):
    """Subtract the refusal direction from the layers' matrices, in place.

    For every layer and every component returned by
    ``get_layer_matrices``, a weight is derived from the component's
    parameters: ``max_weight`` at ``max_weight_position``, interpolated
    linearly to ``min_weight`` at ``min_weight_distance`` layers away.
    Components of layers farther away than that are skipped.

    Args:
        refusal_directions: Tensor indexed by ``layer_index + 1``; the
            first element is the direction for the embeddings.
        parameters: Abliteration parameters, keyed by component name.

    Raises:
        KeyError: If ``parameters`` lacks an entry for a component.
    """
    # Note that some implementations of abliteration also orthogonalize
    # the embedding matrix, but it's unclear if that has any benefits.
    for layer_index in range(len(self.get_layers())):
        # The projector depends only on the layer, so it is built at most
        # once per layer, and only if some component actually needs it.
        projector = None

        for component, matrices in self.get_layer_matrices(layer_index).items():
            params = parameters[component]

            distance = abs(layer_index - params.max_weight_position)

            # Don't orthogonalize layers that are more than
            # min_weight_distance away from max_weight_position.
            if distance > params.min_weight_distance:
                continue

            # Interpolate linearly between max_weight and min_weight
            # over min_weight_distance. A zero min_weight_distance can only
            # be reached here with a zero distance, in which case
            # max_weight applies and division by zero must be avoided.
            if params.min_weight_distance > 0:
                fraction = distance / params.min_weight_distance
            else:
                fraction = 0.0

            weight = params.max_weight + fraction * (
                params.min_weight - params.max_weight
            )

            if projector is None:
                # The index must be shifted by 1 because the first element
                # of refusal_directions is the direction for the embeddings.
                refusal_direction = refusal_directions[layer_index + 1]

                # Projects any right-multiplied vector(s) onto the subspace
                # spanned by the refusal direction.
                projector = torch.outer(refusal_direction, refusal_direction)

            for matrix in matrices:
                # In-place subtraction is safe as we're not using Autograd.
                matrix.sub_(weight * (projector @ matrix))
|
||||
|
||||
def get_chat(self, prompt: str) -> list[dict[str, str]]:
|
||||
return [
|
||||
|
||||
+19
-11
@@ -54,28 +54,36 @@ def get_readme_intro(
|
||||
base_refusals: int,
|
||||
bad_prompts: list[str],
|
||||
) -> str:
|
||||
model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
|
||||
refusal_percentage = (
|
||||
study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
|
||||
)
|
||||
base_refusal_percentage = base_refusals / len(bad_prompts) * 100
|
||||
|
||||
return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
|
||||
return f"""# This is a decensored version of {
|
||||
model_link
|
||||
}, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
|
||||
|
||||
## Abliteration parameters
|
||||
|
||||
| Parameter | Value |
|
||||
| :---------------------- | :--------------------------------------------: |
|
||||
| **max_weight** | {study.best_params["max_weight"]:.4f} |
|
||||
| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} |
|
||||
| **min_weight** | {study.best_params["min_weight"]:.4f} |
|
||||
| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} |
|
||||
| Parameter | Value |
|
||||
| :-------- | :---: |
|
||||
{
|
||||
chr(10).join(
|
||||
[
|
||||
f"| **{name}** | {value:.4f} |"
|
||||
for name, value in study.best_params.items()
|
||||
]
|
||||
)
|
||||
}
|
||||
|
||||
## Performance
|
||||
|
||||
| Metric | This model | Original model ([{settings.model}](https://huggingface.co/{settings.model})) |
|
||||
| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: |
|
||||
| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
|
||||
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
|
||||
| Metric | This model | Original model ({model_link}) |
|
||||
| :----- | :--------: | :---------------------------: |
|
||||
| **KL divergence** | {
|
||||
study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
|
||||
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
|
||||
|
||||
-----
|
||||
|
||||
|
||||
Reference in New Issue
Block a user