diff --git a/src/heretic/main.py b/src/heretic/main.py index ec8f41c..788e50b 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -26,7 +26,7 @@ from rich.traceback import install from .config import Settings from .evaluator import Evaluator -from .model import Model +from .model import AbliterationParameters, Model from .utils import get_readme_intro, load_prompts, print @@ -175,64 +175,63 @@ def run(): trial_index += 1 trial.set_user_attr("index", trial_index) - max_weight = trial.suggest_float("max_weight", 0, 1) - max_weight_position = trial.suggest_float( - "max_weight_position", 0, len(model.get_layers()) - 1 - ) - min_weight = trial.suggest_float("min_weight", 0, max_weight) - min_weight_distance = trial.suggest_float( - "min_weight_distance", 1, len(model.get_layers()) - 1 - ) + parameters = {} + + for component in model.get_abliterable_components(): + # The parameter ranges are based on experiments with various models + # and much wider ranges. They are not set in stone and might have to be + # adjusted for future models. + max_weight = trial.suggest_float( + f"{component}.max_weight", + 0.8, + 1.2, + ) + max_weight_position = trial.suggest_float( + f"{component}.max_weight_position", + 0.6 * (len(model.get_layers()) - 1), + len(model.get_layers()) - 1, + ) + min_weight = trial.suggest_float( + f"{component}.min_weight", + 0.0, + max_weight, + ) + min_weight_distance = trial.suggest_float( + f"{component}.min_weight_distance", + 1.0, + 0.6 * (len(model.get_layers()) - 1), + ) + + parameters[component] = AbliterationParameters( + max_weight=max_weight, + max_weight_position=max_weight_position, + min_weight=min_weight, + min_weight_distance=min_weight_distance, + ) print() print( f"Running trial [bold]{trial_index}[/] of [bold]{settings.n_trials}[/]..." ) print("* Parameters:") - print(f" * max_weight = [bold]{max_weight:.4f}[/]") - print(f" * max_weight_position = [bold]{max_weight_position:.4f}[/]") - print(f" * min_weight = [bold]{min_weight:.4f}[/]") - print(f" * min_weight_distance = [bold]{min_weight_distance:.4f}[/]") + for name, value in trial.params.items(): + print(f" * {name} = [bold]{value:.4f}[/]") print("* Reloading model...") model.reload_model() print("* Abliterating...") - model.abliterate( - refusal_directions, - max_weight, - max_weight_position, - min_weight, - min_weight_distance, - ) + model.abliterate(refusal_directions, parameters) print("* Evaluating...") score, kl_divergence, refusals = evaluator.get_score() trial.set_user_attr("kl_divergence", kl_divergence) trial.set_user_attr("refusals", refusals) + trial.set_user_attr("parameters", parameters) # The optimizer searches for a minimum, so we return the negative score. return -score study = optuna.create_study() - # Educated guesses for parameter values to get the optimizer started. - for max_weight, max_weight_position, min_weight, min_weight_distance in [ - (0.0, 0.0, 0.0, 0.5), - (1.0, 0.5, 0.0, 0.25), - (0.8, 0.7, 0.3, 0.4), - (0.9, 0.3, 0.1, 0.1), - (1.0, 1.0, 1.0, 1.0), - ]: - study.enqueue_trial( - { - "max_weight": max_weight, - "max_weight_position": max_weight_position - * (len(model.get_layers()) - 1), - "min_weight": min_weight, - "min_weight_distance": min_weight_distance - * (len(model.get_layers()) - 1), - } - ) - study.optimize(objective, n_trials=settings.n_trials) print() @@ -240,14 +239,8 @@ def run(): f"[bold green]Optimization finished![/] Best was trial [bold]{study.best_trial.user_attrs['index']}[/]:" ) print("* Parameters:") - print(f" * max_weight = [bold]{study.best_params['max_weight']:.4f}[/]") - print( - f" * max_weight_position = [bold]{study.best_params['max_weight_position']:.4f}[/]" - ) - print(f" * min_weight = [bold]{study.best_params['min_weight']:.4f}[/]") - print( - f" * min_weight_distance = [bold]{study.best_params['min_weight_distance']:.4f}[/]" - ) + for name, value in study.best_params.items(): + print(f" * {name} = [bold]{value:.4f}[/]") print("* Results:") print( f" * KL divergence: [bold]{study.best_trial.user_attrs['kl_divergence']:.4f}[/]" @@ -263,13 +256,7 @@ def run(): print("* Reloading model...") model.reload_model() print("* Abliterating...") - model.abliterate( - refusal_directions, - study.best_params["max_weight"], - study.best_params["max_weight_position"], - study.best_params["min_weight"], - study.best_params["min_weight_distance"], - ) + model.abliterate(refusal_directions, study.best_trial.user_attrs["parameters"]) while True: print() diff --git a/src/heretic/model.py b/src/heretic/model.py index 4655411..e926ac8 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -2,6 +2,7 @@ # Copyright (C) 2025 Philipp Emanuel Weidmann from contextlib import suppress +from dataclasses import dataclass from typing import Any import torch @@ -21,6 +22,14 @@ from .config import Settings from .utils import batchify, empty_cache, print +@dataclass +class AbliterationParameters: + max_weight: float + max_weight_position: float + min_weight: float + min_weight_distance: float + + class Model: def __init__(self, settings: Settings): self.settings = settings @@ -61,9 +70,11 @@ class Model: raise Exception("Failed to load model with all configured dtypes.") print(f"* Transformer model with [bold]{len(self.get_layers())}[/] layers") - print( - f"* [bold]{len(self.get_layer_matrices(0))}[/] abliterable matrices per layer" - ) + print("* Abliterable components:") + for component, matrices in self.get_layer_matrices(0).items(): + print( + f" * [bold]{component}[/]: [bold]{len(matrices)}[/] matrices per layer" + ) def reload_model(self): dtype = self.model.dtype @@ -86,84 +97,87 @@ class Model: # Text-only models. return self.model.model.layers - def get_layer_matrices(self, layer_index: int) -> list[torch.Tensor]: + def get_layer_matrices(self, layer_index: int) -> dict[str, list[torch.Tensor]]: layer = self.get_layers()[layer_index] - matrices = [] + matrices = {} - def try_add(matrix: Any): + def try_add(component: str, matrix: Any): assert torch.is_tensor(matrix) - matrices.append(matrix) - # Most dense models. - if not matrices: - with suppress(Exception): - try_add(layer.mlp.down_proj.weight) + if component not in matrices: + matrices[component] = [] - # Some MoE models (e.g. Qwen3). - if not matrices: - with suppress(Exception): - for expert in layer.mlp.experts: - try_add(expert.down_proj.weight) - - # Phi-3.5-MoE (and possibly others). - if not matrices: - with suppress(Exception): - for expert in layer.block_sparse_moe.experts: - try_add(expert.w2.weight) - - # gpt-oss MoE. - if not matrices: - with suppress(Exception): - # The implementation of gpt-oss in Transformers differs from many other MoE models - # in that it stores the down-projections for all experts in a single 3D tensor, - # but thanks to PyTorch's broadcasting magic, it all just works anyway. - try_add(layer.mlp.experts.down_proj) - - # We need at least one MLP down-projection. - assert matrices + matrices[component].append(matrix) # Exceptions aren't suppressed here, because there is currently # no alternative location for the attention out-projection. - try_add(layer.self_attn.o_proj.weight) + try_add("attn.o_proj", layer.self_attn.o_proj.weight) + + # Most dense models. + with suppress(Exception): + try_add("mlp.down_proj", layer.mlp.down_proj.weight) + + # Some MoE models (e.g. Qwen3). + with suppress(Exception): + for expert in layer.mlp.experts: + try_add("mlp.down_proj", expert.down_proj.weight) + + # Phi-3.5-MoE (and possibly others). + with suppress(Exception): + for expert in layer.block_sparse_moe.experts: + try_add("mlp.down_proj", expert.w2.weight) + + # gpt-oss MoE. + with suppress(Exception): + # The implementation of gpt-oss in Transformers differs from many other MoE models + # in that it stores the down-projections for all experts in a single 3D tensor, + # but thanks to PyTorch's broadcasting magic, it all just works anyway. + try_add("mlp.down_proj", layer.mlp.experts.down_proj) + + # We need at least one MLP down-projection. + assert matrices["mlp.down_proj"] return matrices + def get_abliterable_components(self) -> list[str]: + return list(self.get_layer_matrices(0).keys()) + def abliterate( self, refusal_directions: torch.Tensor, - max_weight: float, - max_weight_position: float, - min_weight: float, - min_weight_distance: float, + parameters: dict[str, AbliterationParameters], ): # Note that some implementations of abliteration also orthogonalize # the embedding matrix, but it's unclear if that has any benefits. for layer_index in range(len(self.get_layers())): - distance = abs(layer_index - max_weight_position) + for component, matrices in self.get_layer_matrices(layer_index).items(): + params = parameters[component] - # Don't orthogonalize layers that are more than - # min_weight_distance away from max_weight_position. - if distance > min_weight_distance: - continue + distance = abs(layer_index - params.max_weight_position) - # Interpolate linearly between max_weight and min_weight - # over min_weight_distance. - weight = max_weight + (distance / min_weight_distance) * ( - min_weight - max_weight - ) + # Don't orthogonalize layers that are more than + # min_weight_distance away from max_weight_position. + if distance > params.min_weight_distance: + continue - # The index must be shifted by 1 because the first element - # of refusal_directions is the direction for the embeddings. - refusal_direction = refusal_directions[layer_index + 1] + # Interpolate linearly between max_weight and min_weight + # over min_weight_distance. + weight = params.max_weight + (distance / params.min_weight_distance) * ( + params.min_weight - params.max_weight + ) - # Projects any right-multiplied vector(s) onto the subspace - # spanned by the refusal direction. - projector = torch.outer(refusal_direction, refusal_direction) + # The index must be shifted by 1 because the first element + # of refusal_directions is the direction for the embeddings. + refusal_direction = refusal_directions[layer_index + 1] - for matrix in self.get_layer_matrices(layer_index): - # In-place subtraction is safe as we're not using Autograd. - matrix.sub_(weight * (projector @ matrix)) + # Projects any right-multiplied vector(s) onto the subspace + # spanned by the refusal direction. + projector = torch.outer(refusal_direction, refusal_direction) + + for matrix in matrices: + # In-place subtraction is safe as we're not using Autograd. + matrix.sub_(weight * (projector @ matrix)) def get_chat(self, prompt: str) -> list[dict[str, str]]: return [ diff --git a/src/heretic/utils.py b/src/heretic/utils.py index 65edcc1..6f0fdfb 100644 --- a/src/heretic/utils.py +++ b/src/heretic/utils.py @@ -54,28 +54,36 @@ def get_readme_intro( base_refusals: int, bad_prompts: list[str], ) -> str: + model_link = f"[{settings.model}](https://huggingface.co/{settings.model})" refusal_percentage = ( study.best_trial.user_attrs["refusals"] / len(bad_prompts) * 100 ) base_refusal_percentage = base_refusals / len(bad_prompts) * 100 - return f"""# This is a decensored version of [{settings.model}](https://huggingface.co/{settings.model}), made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")} + return f"""# This is a decensored version of { + model_link + }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic")} ## Abliteration parameters -| Parameter | Value | -| :---------------------- | :--------------------------------------------: | -| **max_weight** | {study.best_params["max_weight"]:.4f} | -| **max_weight_position** | {study.best_params["max_weight_position"]:.4f} | -| **min_weight** | {study.best_params["min_weight"]:.4f} | -| **min_weight_distance** | {study.best_params["min_weight_distance"]:.4f} | +| Parameter | Value | +| :-------- | :---: | +{ + chr(10).join( + [ + f"| **{name}** | {value:.4f} |" + for name, value in study.best_params.items() + ] + ) + } ## Performance -| Metric | This model | Original model ([{settings.model}](https://huggingface.co/{settings.model})) | -| :---------------- | :------------------------------------------------: | :--------------------------------------------------------------------------: | -| **KL divergence** | {study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* | -| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % | +| Metric | This model | Original model ({model_link}) | +| :----- | :--------: | :---------------------------: | +| **KL divergence** | { + study.best_trial.user_attrs["kl_divergence"]:.4f} | 0 *(by definition)* | +| **Refusals** | {refusal_percentage:.1f} % | {base_refusal_percentage:.1f} % | -----