Separate abliteration parameters for different layer components
This commit is contained in:
+38
-51
@@ -26,7 +26,7 @@ from rich.traceback import install
|
|||||||
|
|
||||||
from .config import Settings
|
from .config import Settings
|
||||||
from .evaluator import Evaluator
|
from .evaluator import Evaluator
|
||||||
from .model import Model
|
from .model import AbliterationParameters, Model
|
||||||
from .utils import get_readme_intro, load_prompts, print
|
from .utils import get_readme_intro, load_prompts, print
|
||||||
|
|
||||||
|
|
||||||
@@ -175,13 +175,38 @@ def run():
|
|||||||
trial_index += 1
|
trial_index += 1
|
||||||
trial.set_user_attr("index", trial_index)
|
trial.set_user_attr("index", trial_index)
|
||||||
|
|
||||||
max_weight = trial.suggest_float("max_weight", 0, 1)
|
parameters = {}
|
||||||
max_weight_position = trial.suggest_float(
|
|
||||||
"max_weight_position", 0, len(model.get_layers()) - 1
|
for component in model.get_abliterable_components():
|
||||||
|
# The parameter ranges are based on experiments with various models
|
||||||
|
# and much wider ranges. They are not set in stone and might have to be
|
||||||
|
# adjusted for future models.
|
||||||
|
max_weight = trial.suggest_float(
|
||||||
|
f"{component}.max_weight",
|
||||||
|
0.8,
|
||||||
|
1.2,
|
||||||
|
)
|
||||||
|
max_weight_position = trial.suggest_float(
|
||||||
|
f"{component}.max_weight_position",
|
||||||
|
0.6 * (len(model.get_layers()) - 1),
|
||||||
|
len(model.get_layers()) - 1,
|
||||||
|
)
|
||||||
|
min_weight = trial.suggest_float(
|
||||||
|
f"{component}.min_weight",
|
||||||
|
0.0,
|
||||||
|
max_weight,
|
||||||
)
|
)
|
||||||
min_weight = trial.suggest_float("min_weight", 0, max_weight)
|
|
||||||
min_weight_distance = trial.suggest_float(
|
min_weight_distance = trial.suggest_float(
|
||||||
"min_weight_distance", 1, len(model.get_layers()) - 1
|
f"{component}.min_weight_distance",
|
||||||
|
1.0,
|
||||||
|
0.6 * (len(model.get_layers()) - 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
parameters[component] = AbliterationParameters(
|
||||||
|
max_weight=max_weight,
|
||||||
|
max_weight_position=max_weight_position,
|
||||||
|
min_weight=min_weight,
|
||||||
|
min_weight_distance=min_weight_distance,
|
||||||
)
|
)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
@@ -189,50 +214,24 @@ def run():
|
|||||||
f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
|
f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
|
||||||
)
|
)
|
||||||
print("* Parameters:")
|
print("* Parameters:")
|
||||||
print(f" * max_weight = [bold]{max_weight:.4f}[/]")
|
for name, value in trial.params.items():
|
||||||
print(f" * max_weight_position = [bold]{max_weight_position:.4f}[/]")
|
print(f" * {name} = [bold]{value:.4f}[/]")
|
||||||
print(f" * min_weight = [bold]{min_weight:.4f}[/]")
|
|
||||||
print(f" * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
|
|
||||||
print("* Reloading model...")
|
print("* Reloading model...")
|
||||||
model.reload_model()
|
model.reload_model()
|
||||||
print("* Abliterating...")
|
print("* Abliterating...")
|
||||||
model.abliterate(
|
model.abliterate(refusal_directions, parameters)
|
||||||
refusal_directions,
|
|
||||||
max_weight,
|
|
||||||
max_weight_position,
|
|
||||||
min_weight,
|
|
||||||
min_weight_distance,
|
|
||||||
)
|
|
||||||
print("* Evaluating...")
|
print("* Evaluating...")
|
||||||
score, kl_divergence, refusals = evaluator.get_score()
|
score, kl_divergence, refusals = evaluator.get_score()
|
||||||
|
|
||||||
trial.set_user_attr("kl_divergence", kl_divergence)
|
trial.set_user_attr("kl_divergence", kl_divergence)
|
||||||
trial.set_user_attr("refusals", refusals)
|
trial.set_user_attr("refusals", refusals)
|
||||||
|
trial.set_user_attr("parameters", parameters)
|
||||||
|
|
||||||
# The optimizer searches for a minimum, so we return the negative score.
|
# The optimizer searches for a minimum, so we return the negative score.
|
||||||
return -score
|
return -score
|
||||||
|
|
||||||
study = optuna.create_study()
|
study = optuna.create_study()
|
||||||
|
|
||||||
# Educated guesses for parameter values to get the optimizer started.
|
|
||||||
for max_weight, max_weight_position, min_weight, min_weight_distance in [
|
|
||||||
(0.0, 0.0, 0.0, 0.5),
|
|
||||||
(1.0, 0.5, 0.0, 0.25),
|
|
||||||
(0.8, 0.7, 0.3, 0.4),
|
|
||||||
(0.9, 0.3, 0.1, 0.1),
|
|
||||||
(1.0, 1.0, 1.0, 1.0),
|
|
||||||
]:
|
|
||||||
study.enqueue_trial(
|
|
||||||
{
|
|
||||||
"max_weight": max_weight,
|
|
||||||
"max_weight_position": max_weight_position
|
|
||||||
* (len(model.get_layers()) - 1),
|
|
||||||
"min_weight": min_weight,
|
|
||||||
"min_weight_distance": min_weight_distance
|
|
||||||
* (len(model.get_layers()) - 1),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
study.optimize(objective, n_trials=settings.n_trials)
|
study.optimize(objective, n_trials=settings.n_trials)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
@@ -240,14 +239,8 @@ def run():
|
|||||||
f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
|
f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
|
||||||
)
|
)
|
||||||
print("* Parameters:")
|
print("* Parameters:")
|
||||||
print(f" * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]")
|
for name, value in study.best_params.items():
|
||||||
print(
|
print(f" * {name} = [bold]{value:.4f}[/]")
|
||||||
f" * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
|
|
||||||
)
|
|
||||||
print(f" * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
|
|
||||||
print(
|
|
||||||
f" * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
|
|
||||||
)
|
|
||||||
print("* Results:")
|
print("* Results:")
|
||||||
print(
|
print(
|
||||||
f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
|
f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
|
||||||
@@ -263,13 +256,7 @@ def run():
|
|||||||
print("* Reloading model...")
|
print("* Reloading model...")
|
||||||
model.reload_model()
|
model.reload_model()
|
||||||
print("* Abliterating...")
|
print("* Abliterating...")
|
||||||
model.abliterate(
|
model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
|
||||||
refusal_directions,
|
|
||||||
study.best_params["max_weight"],
|
|
||||||
study.best_params["max_weight_position"],
|
|
||||||
study.best_params["min_weight"],
|
|
||||||
study.best_params["min_weight_distance"],
|
|
||||||
)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print()
|
print()
|
||||||
|
|||||||
+41
-27
@@ -2,6 +2,7 @@
|
|||||||
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
|
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
|
||||||
|
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -21,6 +22,14 @@ from .config import Settings
|
|||||||
from .utils import batchify, empty_cache, print
|
from .utils import batchify, empty_cache, print
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class AbliterationParameters:
|
||||||
|
max_weight: float
|
||||||
|
max_weight_position: float
|
||||||
|
min_weight: float
|
||||||
|
min_weight_distance: float
|
||||||
|
|
||||||
|
|
||||||
class Model:
|
class Model:
|
||||||
def __init__(self, settings: Settings):
|
def __init__(self, settings: Settings):
|
||||||
self.settings = settings
|
self.settings = settings
|
||||||
@@ -61,8 +70,10 @@ class Model:
|
|||||||
raise Exception("Failed to load model with all configured dtypes.")
|
raise Exception("Failed to load model with all configured dtypes.")
|
||||||
|
|
||||||
print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
|
print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
|
||||||
|
print("* Abliterable components:")
|
||||||
|
for component, matrices in self.get_layer_matrices(0).items():
|
||||||
print(
|
print(
|
||||||
f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer"
|
f" * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
|
||||||
)
|
)
|
||||||
|
|
||||||
def reload_model(self):
|
def reload_model(self):
|
||||||
@@ -86,71 +97,74 @@ class Model:
|
|||||||
# Text-only models.
|
# Text-only models.
|
||||||
return self.model.model.layers
|
return self.model.model.layers
|
||||||
|
|
||||||
def get_layer_matrices(self, layer_index: int) -> list[torch.Tensor]:
|
def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
|
||||||
layer = self.get_layers()[layer_index]
|
layer = self.get_layers()[layer_index]
|
||||||
|
|
||||||
matrices = []
|
matrices = {}
|
||||||
|
|
||||||
def try_add(matrix: Any):
|
def try_add(component: str, matrix: Any):
|
||||||
assert torch.is_tensor(matrix)
|
assert torch.is_tensor(matrix)
|
||||||
matrices.append(matrix)
|
|
||||||
|
if component not in matrices:
|
||||||
|
matrices[component] = []
|
||||||
|
|
||||||
|
matrices[component].append(matrix)
|
||||||
|
|
||||||
|
# Exceptions aren't suppressed here, because there is currently
|
||||||
|
# no alternative location for the attention out-projection.
|
||||||
|
try_add("attn.o_proj", layer.self_attn.o_proj.weight)
|
||||||
|
|
||||||
# Most dense models.
|
# Most dense models.
|
||||||
if not matrices:
|
|
||||||
with suppress(Exception):
|
with suppress(Exception):
|
||||||
try_add(layer.mlp.down_proj.weight)
|
try_add("mlp.down_proj", layer.mlp.down_proj.weight)
|
||||||
|
|
||||||
# Some MoE models (e.g. Qwen3).
|
# Some MoE models (e.g. Qwen3).
|
||||||
if not matrices:
|
|
||||||
with suppress(Exception):
|
with suppress(Exception):
|
||||||
for expert in layer.mlp.experts:
|
for expert in layer.mlp.experts:
|
||||||
try_add(expert.down_proj.weight)
|
try_add("mlp.down_proj", expert.down_proj.weight)
|
||||||
|
|
||||||
# Phi-3.5-MoE (and possibly others).
|
# Phi-3.5-MoE (and possibly others).
|
||||||
if not matrices:
|
|
||||||
with suppress(Exception):
|
with suppress(Exception):
|
||||||
for expert in layer.block_sparse_moe.experts:
|
for expert in layer.block_sparse_moe.experts:
|
||||||
try_add(expert.w2.weight)
|
try_add("mlp.down_proj", expert.w2.weight)
|
||||||
|
|
||||||
# gpt-oss MoE.
|
# gpt-oss MoE.
|
||||||
if not matrices:
|
|
||||||
with suppress(Exception):
|
with suppress(Exception):
|
||||||
# The implementation of gpt-oss in Transformers differs from many other MoE models
|
# The implementation of gpt-oss in Transformers differs from many other MoE models
|
||||||
# in that it stores the down-projections for all experts in a single 3D tensor,
|
# in that it stores the down-projections for all experts in a single 3D tensor,
|
||||||
# but thanks to PyTorch's broadcasting magic, it all just works anyway.
|
# but thanks to PyTorch's broadcasting magic, it all just works anyway.
|
||||||
try_add(layer.mlp.experts.down_proj)
|
try_add("mlp.down_proj", layer.mlp.experts.down_proj)
|
||||||
|
|
||||||
# We need at least one MLP down-projection.
|
# We need at least one MLP down-projection.
|
||||||
assert matrices
|
assert matrices["mlp.down_proj"]
|
||||||
|
|
||||||
# Exceptions aren't suppressed here, because there is currently
|
|
||||||
# no alternative location for the attention out-projection.
|
|
||||||
try_add(layer.self_attn.o_proj.weight)
|
|
||||||
|
|
||||||
return matrices
|
return matrices
|
||||||
|
|
||||||
|
def get_abliterable_components(self) -> list[str]:
|
||||||
|
return list(self.get_layer_matrices(0).keys())
|
||||||
|
|
||||||
def abliterate(
|
def abliterate(
|
||||||
self,
|
self,
|
||||||
refusal_directions: torch.Tensor,
|
refusal_directions: torch.Tensor,
|
||||||
max_weight: float,
|
parameters: dict[str, AbliterationParameters],
|
||||||
max_weight_position: float,
|
|
||||||
min_weight: float,
|
|
||||||
min_weight_distance: float,
|
|
||||||
):
|
):
|
||||||
# Note that some implementations of abliteration also orthogonalize
|
# Note that some implementations of abliteration also orthogonalize
|
||||||
# the embedding matrix, but it's unclear if that has any benefits.
|
# the embedding matrix, but it's unclear if that has any benefits.
|
||||||
for layer_index in range(len(self.get_layers())):
|
for layer_index in range(len(self.get_layers())):
|
||||||
distance = abs(layer_index - max_weight_position)
|
for component, matrices in self.get_layer_matrices(layer_index).items():
|
||||||
|
params = parameters[component]
|
||||||
|
|
||||||
|
distance = abs(layer_index - params.max_weight_position)
|
||||||
|
|
||||||
# Don't orthogonalize layers that are more than
|
# Don't orthogonalize layers that are more than
|
||||||
# min_weight_distance away from max_weight_position.
|
# min_weight_distance away from max_weight_position.
|
||||||
if distance > min_weight_distance:
|
if distance > params.min_weight_distance:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Interpolate linearly between max_weight and min_weight
|
# Interpolate linearly between max_weight and min_weight
|
||||||
# over min_weight_distance.
|
# over min_weight_distance.
|
||||||
weight = max_weight + (distance / min_weight_distance) * (
|
weight = params.max_weight + (distance / params.min_weight_distance) * (
|
||||||
min_weight - max_weight
|
params.min_weight - params.max_weight
|
||||||
)
|
)
|
||||||
|
|
||||||
# The index must be shifted by 1 because the first element
|
# The index must be shifted by 1 because the first element
|
||||||
@@ -161,7 +175,7 @@ class Model:
|
|||||||
# spanned by the refusal direction.
|
# spanned by the refusal direction.
|
||||||
projector = torch.outer(refusal_direction, refusal_direction)
|
projector = torch.outer(refusal_direction, refusal_direction)
|
||||||
|
|
||||||
for matrix in self.get_layer_matrices(layer_index):
|
for matrix in matrices:
|
||||||
# In-place subtraction is safe as we're not using Autograd.
|
# In-place subtraction is safe as we're not using Autograd.
|
||||||
matrix.sub_(weight * (projector @ matrix))
|
matrix.sub_(weight * (projector @ matrix))
|
||||||
|
|
||||||
|
|||||||
+17
-9
@@ -54,27 +54,35 @@ def get_readme_intro(
|
|||||||
base_refusals: int,
|
base_refusals: int,
|
||||||
bad_prompts: list[str],
|
bad_prompts: list[str],
|
||||||
) -> str:
|
) -> str:
|
||||||
|
model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
|
||||||
refusal_percentage = (
|
refusal_percentage = (
|
||||||
study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
|
study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
|
||||||
)
|
)
|
||||||
base_refusal_percentage = base_refusals / len(bad_prompts) * 100
|
base_refusal_percentage = base_refusals / len(bad_prompts) * 100
|
||||||
|
|
||||||
return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
|
return f"""# This is a decensored version of {
|
||||||
|
model_link
|
||||||
|
}, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
|
||||||
|
|
||||||
## Abliteration parameters
|
## Abliteration parameters
|
||||||
|
|
||||||
| Parameter | Value |
|
| Parameter | Value |
|
||||||
| :---------------------- | :--------------------------------------------: |
|
| :-------- | :---: |
|
||||||
| **max_weight** | {study.best_params["max_weight"]:.4f} |
|
{
|
||||||
| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} |
|
chr(10).join(
|
||||||
| **min_weight** | {study.best_params["min_weight"]:.4f} |
|
[
|
||||||
| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} |
|
f"| **{name}** | {value:.4f} |"
|
||||||
|
for name, value in study.best_params.items()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
## Performance
|
## Performance
|
||||||
|
|
||||||
| Metric | This model | Original model ([{settings.model}](https://huggingface.co/{settings.model})) |
|
| Metric | This model | Original model ({model_link}) |
|
||||||
| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: |
|
| :----- | :--------: | :---------------------------: |
|
||||||
| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
|
| **KL divergence** | {
|
||||||
|
study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
|
||||||
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
|
| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
|
||||||
|
|
||||||
-----
|
-----
|
||||||
|
|||||||
Reference in New Issue
Block a user