Separate abliteration parameters for different layer components

2025-10-22 12:05:28 +05:30
parent ed65d6902b
commit 6359aa44bb
3 changed files with 131 additions and 122 deletions
@@ -26,7 +26,7 @@ from rich.traceback import install
 from .config import Settings
 from .evaluator import Evaluator
-from .model import Model
+from .model import AbliterationParameters, Model
 from .utils import get_readme_intro, load_prompts, print
@@ -175,13 +175,38 @@ def run():
        trial_index += 1
        trial.set_user_attr("index", trial_index)
-        max_weight = trial.suggest_float("max_weight", 0, 1)
+        parameters = {}
-        max_weight_position = trial.suggest_float(
+
-            "max_weight_position", 0, len(model.get_layers()) - 1
+        for component in model.get_abliterable_components():
            # The parameter ranges are based on experiments with various models
            # and much wider ranges. They are not set in stone and might have to be
            # adjusted for future models.
            max_weight = trial.suggest_float(
                f"{component}.max_weight",
                0.8,
                1.2,
            )
            max_weight_position = trial.suggest_float(
                f"{component}.max_weight_position",
                0.6 * (len(model.get_layers()) - 1),
                len(model.get_layers()) - 1,
            )
            min_weight = trial.suggest_float(
                f"{component}.min_weight",
                0.0,
                max_weight,
            )
        min_weight = trial.suggest_float("min_weight", 0, max_weight)
            min_weight_distance = trial.suggest_float(
-            "min_weight_distance", 1, len(model.get_layers()) - 1
+                f"{component}.min_weight_distance",
                1.0,
                0.6 * (len(model.get_layers()) - 1),
            )
            parameters[component] = AbliterationParameters(
                max_weight=max_weight,
                max_weight_position=max_weight_position,
                min_weight=min_weight,
                min_weight_distance=min_weight_distance,
            )
        print()
@@ -189,50 +214,24 @@ def run():
            f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
        )
        print("* Parameters:")
-        print(f"  * max_weight = [bold]{max_weight:.4f}[/]")
+        for name, value in trial.params.items():
-        print(f"  * max_weight_position = [bold]{max_weight_position:.4f}[/]")
+            print(f"  * {name} = [bold]{value:.4f}[/]")
        print(f"  * min_weight = [bold]{min_weight:.4f}[/]")
        print(f"  * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
        print("* Reloading model...")
        model.reload_model()
        print("* Abliterating...")
-        model.abliterate(
+        model.abliterate(refusal_directions, parameters)
            refusal_directions,
            max_weight,
            max_weight_position,
            min_weight,
            min_weight_distance,
        )
        print("* Evaluating...")
        score, kl_divergence, refusals = evaluator.get_score()
        trial.set_user_attr("kl_divergence", kl_divergence)
        trial.set_user_attr("refusals", refusals)
        trial.set_user_attr("parameters", parameters)
        # The optimizer searches for a minimum, so we return the negative score.
        return -score
    study = optuna.create_study()
    # Educated guesses for parameter values to get the optimizer started.
    for max_weight, max_weight_position, min_weight, min_weight_distance in [
        (0.0, 0.0, 0.0, 0.5),
        (1.0, 0.5, 0.0, 0.25),
        (0.8, 0.7, 0.3, 0.4),
        (0.9, 0.3, 0.1, 0.1),
        (1.0, 1.0, 1.0, 1.0),
    ]:
        study.enqueue_trial(
            {
                "max_weight": max_weight,
                "max_weight_position": max_weight_position
                * (len(model.get_layers()) - 1),
                "min_weight": min_weight,
                "min_weight_distance": min_weight_distance
                * (len(model.get_layers()) - 1),
            }
        )
    study.optimize(objective, n_trials=settings.n_trials)
    print()
@@ -240,14 +239,8 @@ def run():
        f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
    )
    print("* Parameters:")
-    print(f"  * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]")
+    for name, value in study.best_params.items():
-    print(
+        print(f"  * {name} = [bold]{value:.4f}[/]")
        f"  * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
    )
    print(f"  * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
    print(
        f"  * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
    )
    print("* Results:")
    print(
        f"  * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -263,13 +256,7 @@ def run():
    print("* Reloading model...")
    model.reload_model()
    print("* Abliterating...")
-    model.abliterate(
+    model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])
        refusal_directions,
        study.best_params["max_weight"],
        study.best_params["max_weight_position"],
        study.best_params["min_weight"],
        study.best_params["min_weight_distance"],
    )
    while True:
        print()
@@ -2,6 +2,7 @@
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>
 from contextlib import suppress
 from dataclasses import dataclass
 from typing import Any
 import torch
@@ -21,6 +22,14 @@ from .config import Settings
 from .utils import batchify, empty_cache, print
@dataclass
 class AbliterationParameters:
    max_weight: float
    max_weight_position: float
    min_weight: float
    min_weight_distance: float
 class Model:
    def __init__(self, settings: Settings):
        self.settings = settings
@@ -61,8 +70,10 @@ class Model:
            raise Exception("Failed to load model with all configured dtypes.")
        print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
        print("* Abliterable components:")
        for component, matrices in self.get_layer_matrices(0).items():
            print(
-            f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer"
+                f"  * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
            )
    def reload_model(self):
@@ -86,71 +97,74 @@ class Model:
        # Text-only models.
        return self.model.model.layers
-    def get_layer_matrices(self, layer_index: int) -> list[torch.Tensor]:
+    def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
        layer = self.get_layers()[layer_index]
-        matrices = []
+        matrices = {}
-        def try_add(matrix: Any):
+        def try_add(component: str, matrix: Any):
            assert torch.is_tensor(matrix)
-            matrices.append(matrix)
+
            if component not in matrices:
                matrices[component] = []
            matrices[component].append(matrix)
        # Exceptions aren't suppressed here, because there is currently
        # no alternative location for the attention out-projection.
        try_add("attn.o_proj", layer.self_attn.o_proj.weight)
        # Most dense models.
        if not matrices:
        with suppress(Exception):
-                try_add(layer.mlp.down_proj.weight)
+            try_add("mlp.down_proj", layer.mlp.down_proj.weight)
        # Some MoE models (e.g. Qwen3).
        if not matrices:
        with suppress(Exception):
            for expert in layer.mlp.experts:
-                    try_add(expert.down_proj.weight)
+                try_add("mlp.down_proj", expert.down_proj.weight)
        # Phi-3.5-MoE (and possibly others).
        if not matrices:
        with suppress(Exception):
            for expert in layer.block_sparse_moe.experts:
-                    try_add(expert.w2.weight)
+                try_add("mlp.down_proj", expert.w2.weight)
        # gpt-oss MoE.
        if not matrices:
        with suppress(Exception):
            # The implementation of gpt-oss in Transformers differs from many other MoE models
            # in that it stores the down-projections for all experts in a single 3D tensor,
            # but thanks to PyTorch's broadcasting magic, it all just works anyway.
-                try_add(layer.mlp.experts.down_proj)
+            try_add("mlp.down_proj", layer.mlp.experts.down_proj)
        # We need at least one MLP down-projection.
-        assert matrices
+        assert matrices["mlp.down_proj"]
        # Exceptions aren't suppressed here, because there is currently
        # no alternative location for the attention out-projection.
        try_add(layer.self_attn.o_proj.weight)
        return matrices
    def get_abliterable_components(self) -> list[str]:
        return list(self.get_layer_matrices(0).keys())
    def abliterate(
        self,
        refusal_directions: torch.Tensor,
-        max_weight: float,
+        parameters: dict[str, AbliterationParameters],
        max_weight_position: float,
        min_weight: float,
        min_weight_distance: float,
    ):
        # Note that some implementations of abliteration also orthogonalize
        # the embedding matrix, but it's unclear if that has any benefits.
        for layer_index in range(len(self.get_layers())):
-            distance = abs(layer_index - max_weight_position)
+            for component, matrices in self.get_layer_matrices(layer_index).items():
                params = parameters[component]
                distance = abs(layer_index - params.max_weight_position)
                # Don't orthogonalize layers that are more than
                # min_weight_distance away from max_weight_position.
-            if distance > min_weight_distance:
+                if distance > params.min_weight_distance:
                    continue
                # Interpolate linearly between max_weight and min_weight
                # over min_weight_distance.
-            weight = max_weight + (distance / min_weight_distance) * (
+                weight = params.max_weight + (distance / params.min_weight_distance) * (
-                min_weight - max_weight
+                    params.min_weight - params.max_weight
                )
                # The index must be shifted by 1 because the first element
@@ -161,7 +175,7 @@ class Model:
                # spanned by the refusal direction.
                projector = torch.outer(refusal_direction, refusal_direction)
-            for matrix in self.get_layer_matrices(layer_index):
+                for matrix in matrices:
                    # In-place subtraction is safe as we're not using Autograd.
                    matrix.sub_(weight * (projector @ matrix))
@@ -54,27 +54,35 @@ def get_readme_intro(
    base_refusals: int,
    bad_prompts: list[str],
 ) -> str:
    model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
    refusal_percentage = (
        study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
    )
    base_refusal_percentage = base_refusals / len(bad_prompts) * 100
-    return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
+    return f"""# This is a decensored version of {
        model_link
    }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
 ## Abliteration parameters
 | Parameter | Value |
-| :---------------------- | :--------------------------------------------: |
+| :-------- | :---: |
-| **max_weight**          | {study.best_params["max_weight"]:.4f}          |
+{
-| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} |
+        chr(10).join(
-| **min_weight**          | {study.best_params["min_weight"]:.4f}          |
+            [
-| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} |
+                f"| **{name}** | {value:.4f} |"
                for name, value in study.best_params.items()
            ]
        )
    }
 ## Performance
-| Metric            | This model                                         | Original model ([{settings.model}](https://huggingface.co/{settings.model})) |
+| Metric | This model | Original model ({model_link}) |
-| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: |
+| :----- | :--------: | :---------------------------: |
-| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)*                                                          |
+| **KL divergence** | {
        study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
 | **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |
 -----