Separate abliteration parameters for different layer components

2025-10-22 12:05:28 +05:30
parent ed65d6902b
commit 6359aa44bb
3 changed files with 131 additions and 122 deletions
@@ -26,7 +26,7 @@ from rich.traceback import install

 from .config import Settings
 from .evaluator import Evaluator
-from .model import Model
+from .model import AbliterationParameters, Model
 from .utils import get_readme_intro, load_prompts, print


@@ -175,64 +175,63 @@ def run():
        trial_index += 1
        trial.set_user_attr("index", trial_index)

-        max_weight = trial.suggest_float("max_weight", 0, 1)
-        max_weight_position = trial.suggest_float(
-            "max_weight_position", 0, len(model.get_layers()) - 1
-        )
-        min_weight = trial.suggest_float("min_weight", 0, max_weight)
-        min_weight_distance = trial.suggest_float(
-            "min_weight_distance", 1, len(model.get_layers()) - 1
-        )
+        parameters = {}
+
+        for component in model.get_abliterable_components():
+            # The parameter ranges are based on experiments with various models
+            # and much wider ranges. They are not set in stone and might have to be
+            # adjusted for future models.
+            max_weight = trial.suggest_float(
+                f"{component}.max_weight",
+                0.8,
+                1.2,
+            )
+            max_weight_position = trial.suggest_float(
+                f"{component}.max_weight_position",
+                0.6 * (len(model.get_layers()) - 1),
+                len(model.get_layers()) - 1,
+            )
+            min_weight = trial.suggest_float(
+                f"{component}.min_weight",
+                0.0,
+                max_weight,
+            )
+            min_weight_distance = trial.suggest_float(
+                f"{component}.min_weight_distance",
+                1.0,
+                0.6 * (len(model.get_layers()) - 1),
+            )
+
+            parameters[component] = AbliterationParameters(
+                max_weight=max_weight,
+                max_weight_position=max_weight_position,
+                min_weight=min_weight,
+                min_weight_distance=min_weight_distance,
+            )

        print()
        print(
            f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..."
        )
        print("* Parameters:")
-        print(f"  * max_weight = [bold]{max_weight:.4f}[/]")
-        print(f"  * max_weight_position = [bold]{max_weight_position:.4f}[/]")
-        print(f"  * min_weight = [bold]{min_weight:.4f}[/]")
-        print(f"  * min_weight_distance = [bold]{min_weight_distance:.4f}[/]")
+        for name, value in trial.params.items():
+            print(f"  * {name} = [bold]{value:.4f}[/]")
        print("* Reloading model...")
        model.reload_model()
        print("* Abliterating...")
-        model.abliterate(
-            refusal_directions,
-            max_weight,
-            max_weight_position,
-            min_weight,
-            min_weight_distance,
-        )
+        model.abliterate(refusal_directions, parameters)
        print("* Evaluating...")
        score, kl_divergence, refusals = evaluator.get_score()

        trial.set_user_attr("kl_divergence", kl_divergence)
        trial.set_user_attr("refusals", refusals)
+        trial.set_user_attr("parameters", parameters)

        # The optimizer searches for a minimum, so we return the negative score.
        return -score

    study = optuna.create_study()

-    # Educated guesses for parameter values to get the optimizer started.
-    for max_weight, max_weight_position, min_weight, min_weight_distance in [
-        (0.0, 0.0, 0.0, 0.5),
-        (1.0, 0.5, 0.0, 0.25),
-        (0.8, 0.7, 0.3, 0.4),
-        (0.9, 0.3, 0.1, 0.1),
-        (1.0, 1.0, 1.0, 1.0),
-    ]:
-        study.enqueue_trial(
-            {
-                "max_weight": max_weight,
-                "max_weight_position": max_weight_position
-                * (len(model.get_layers()) - 1),
-                "min_weight": min_weight,
-                "min_weight_distance": min_weight_distance
-                * (len(model.get_layers()) - 1),
-            }
-        )
-
    study.optimize(objective, n_trials=settings.n_trials)

    print()
@@ -240,14 +239,8 @@ def run():
        f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:"
    )
    print("* Parameters:")
-    print(f"  * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]")
-    print(
-        f"  * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]"
-    )
-    print(f"  * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]")
-    print(
-        f"  * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]"
-    )
+    for name, value in study.best_params.items():
+        print(f"  * {name} = [bold]{value:.4f}[/]")
    print("* Results:")
    print(
        f"  * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]"
@@ -263,13 +256,7 @@ def run():
    print("* Reloading model...")
    model.reload_model()
    print("* Abliterating...")
-    model.abliterate(
-        refusal_directions,
-        study.best_params["max_weight"],
-        study.best_params["max_weight_position"],
-        study.best_params["min_weight"],
-        study.best_params["min_weight_distance"],
-    )
+    model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"])

    while True:
        print()
@@ -2,6 +2,7 @@
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>

 from contextlib import suppress
+from dataclasses import dataclass
 from typing import Any

 import torch
@@ -21,6 +22,14 @@ from .config import Settings
 from .utils import batchify, empty_cache, print


+@dataclass
+class AbliterationParameters:
+    max_weight: float
+    max_weight_position: float
+    min_weight: float
+    min_weight_distance: float
+
+
 class Model:
    def __init__(self, settings: Settings):
        self.settings = settings
@@ -61,9 +70,11 @@ class Model:
            raise Exception("Failed to load model with all configured dtypes.")

        print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers")
-        print(
-            f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer"
-        )
+        print("* Abliterable components:")
+        for component, matrices in self.get_layer_matrices(0).items():
+            print(
+                f"  * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer"
+            )

    def reload_model(self):
        dtype = self.model.dtype
@@ -86,84 +97,87 @@ class Model:
        # Text-only models.
        return self.model.model.layers

-    def get_layer_matrices(self, layer_index: int) -> list[torch.Tensor]:
+    def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]:
        layer = self.get_layers()[layer_index]

-        matrices = []
+        matrices = {}

-        def try_add(matrix: Any):
+        def try_add(component: str, matrix: Any):
            assert torch.is_tensor(matrix)
-            matrices.append(matrix)

-        # Most dense models.
-        if not matrices:
-            with suppress(Exception):
-                try_add(layer.mlp.down_proj.weight)
+            if component not in matrices:
+                matrices[component] = []

-        # Some MoE models (e.g. Qwen3).
-        if not matrices:
-            with suppress(Exception):
-                for expert in layer.mlp.experts:
-                    try_add(expert.down_proj.weight)
-
-        # Phi-3.5-MoE (and possibly others).
-        if not matrices:
-            with suppress(Exception):
-                for expert in layer.block_sparse_moe.experts:
-                    try_add(expert.w2.weight)
-
-        # gpt-oss MoE.
-        if not matrices:
-            with suppress(Exception):
-                # The implementation of gpt-oss in Transformers differs from many other MoE models
-                # in that it stores the down-projections for all experts in a single 3D tensor,
-                # but thanks to PyTorch's broadcasting magic, it all just works anyway.
-                try_add(layer.mlp.experts.down_proj)
-
-        # We need at least one MLP down-projection.
-        assert matrices
+            matrices[component].append(matrix)

        # Exceptions aren't suppressed here, because there is currently
        # no alternative location for the attention out-projection.
-        try_add(layer.self_attn.o_proj.weight)
+        try_add("attn.o_proj", layer.self_attn.o_proj.weight)
+
+        # Most dense models.
+        with suppress(Exception):
+            try_add("mlp.down_proj", layer.mlp.down_proj.weight)
+
+        # Some MoE models (e.g. Qwen3).
+        with suppress(Exception):
+            for expert in layer.mlp.experts:
+                try_add("mlp.down_proj", expert.down_proj.weight)
+
+        # Phi-3.5-MoE (and possibly others).
+        with suppress(Exception):
+            for expert in layer.block_sparse_moe.experts:
+                try_add("mlp.down_proj", expert.w2.weight)
+
+        # gpt-oss MoE.
+        with suppress(Exception):
+            # The implementation of gpt-oss in Transformers differs from many other MoE models
+            # in that it stores the down-projections for all experts in a single 3D tensor,
+            # but thanks to PyTorch's broadcasting magic, it all just works anyway.
+            try_add("mlp.down_proj", layer.mlp.experts.down_proj)
+
+        # We need at least one MLP down-projection; .get() avoids a KeyError when none was added.
+        assert matrices.get("mlp.down_proj"), "No MLP down-projection found"

        return matrices

+    def get_abliterable_components(self) -> list[str]:
+        return list(self.get_layer_matrices(0).keys())
+
    def abliterate(
        self,
        refusal_directions: torch.Tensor,
-        max_weight: float,
-        max_weight_position: float,
-        min_weight: float,
-        min_weight_distance: float,
+        parameters: dict[str, AbliterationParameters],
    ):
        # Note that some implementations of abliteration also orthogonalize
        # the embedding matrix, but it's unclear if that has any benefits.
        for layer_index in range(len(self.get_layers())):
-            distance = abs(layer_index - max_weight_position)
+            for component, matrices in self.get_layer_matrices(layer_index).items():
+                params = parameters[component]

-            # Don't orthogonalize layers that are more than
-            # min_weight_distance away from max_weight_position.
-            if distance > min_weight_distance:
-                continue
+                distance = abs(layer_index - params.max_weight_position)

-            # Interpolate linearly between max_weight and min_weight
-            # over min_weight_distance.
-            weight = max_weight + (distance / min_weight_distance) * (
-                min_weight - max_weight
-            )
+                # Don't orthogonalize layers that are more than
+                # min_weight_distance away from max_weight_position.
+                if distance > params.min_weight_distance:
+                    continue

-            # The index must be shifted by 1 because the first element
-            # of refusal_directions is the direction for the embeddings.
-            refusal_direction = refusal_directions[layer_index + 1]
+                # Interpolate linearly between max_weight and min_weight
+                # over min_weight_distance.
+                weight = params.max_weight + (distance / params.min_weight_distance) * (
+                    params.min_weight - params.max_weight
+                )

-            # Projects any right-multiplied vector(s) onto the subspace
-            # spanned by the refusal direction.
-            projector = torch.outer(refusal_direction, refusal_direction)
+                # The index must be shifted by 1 because the first element
+                # of refusal_directions is the direction for the embeddings.
+                refusal_direction = refusal_directions[layer_index + 1]

-            for matrix in self.get_layer_matrices(layer_index):
-                # In-place subtraction is safe as we're not using Autograd.
-                matrix.sub_(weight * (projector @ matrix))
+                # Projects any right-multiplied vector(s) onto the subspace
+                # spanned by the refusal direction.
+                projector = torch.outer(refusal_direction, refusal_direction)
+
+                for matrix in matrices:
+                    # In-place subtraction is safe as we're not using Autograd.
+                    matrix.sub_(weight * (projector @ matrix))

    def get_chat(self, prompt: str) -> list[dict[str, str]]:
        return [
@@ -54,28 +54,36 @@ def get_readme_intro(
    base_refusals: int,
    bad_prompts: list[str],
 ) -> str:
+    model_link = f"[{settings.model}](https://huggingface.co/{settings.model})"
    refusal_percentage = (
        study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100
    )
    base_refusal_percentage = base_refusals / len(bad_prompts) * 100

-    return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}
+    return f"""# This is a decensored version of {
+        model_link
+    }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")}

 ## Abliteration parameters

-| Parameter               | Value                                          |
-| :---------------------- | :--------------------------------------------: |
-| **max_weight**          | {study.best_params["max_weight"]:.4f}          |
-| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} |
-| **min_weight**          | {study.best_params["min_weight"]:.4f}          |
-| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} |
+| Parameter | Value |
+| :-------- | :---: |
+{
+        chr(10).join(
+            [
+                f"| **{name}** | {value:.4f} |"
+                for name, value in study.best_params.items()
+            ]
+        )
+    }

 ## Performance

-| Metric            | This model                                         | Original model ([{settings.model}](https://huggingface.co/{settings.model})) |
-| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: |
-| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)*                                                          |
-| **Refusals**      | {refusal_percentage:.1f} %                         | {base_refusal_percentage:.1f} %                                              |
+| Metric | This model | Original model ({model_link}) |
+| :----- | :--------: | :---------------------------: |
+| **KL divergence** | {
+        study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* |
+| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % |

 -----