From 83cbf0612ac36d43a985b795640e2cf2cdd1c34c Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Sat, 22 Nov 2025 13:18:54 +0530 Subject: [PATCH] Add option to print refusal geometry --- config.default.toml | 3 +++ src/heretic/config.py | 5 +++++ src/heretic/main.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/config.default.toml b/config.default.toml index fdfedb9..5193e22 100644 --- a/config.default.toml +++ b/config.default.toml @@ -24,6 +24,9 @@ max_batch_size = 128 # Maximum number of tokens to generate for each response. max_response_length = 100 +# Whether to print detailed information about residuals and refusal directions after calculating them. +print_refusal_geometry = false + # Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models. # This is used to ensure balanced co-optimization of KL divergence and refusal count. kl_divergence_scale = 1.0 diff --git a/src/heretic/config.py b/src/heretic/config.py index c99248b..efd2354 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -61,6 +61,11 @@ class Settings(BaseSettings): description="Maximum number of tokens to generate for each response.", ) + print_refusal_geometry: bool = Field( + default=False, + description="Whether to print detailed information about residuals and refusal directions after calculating them.", + ) + kl_divergence_scale: float = Field( default=1.0, description=( diff --git a/src/heretic/main.py b/src/heretic/main.py index f11ae22..84969c3 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -13,6 +13,7 @@ import huggingface_hub import optuna import questionary import torch +import torch.linalg as LA import torch.nn.functional as F import transformers from accelerate.utils import ( @@ -29,6 +30,7 @@ from optuna.samplers import TPESampler from optuna.study import StudyDirection from pydantic import ValidationError from questionary import Choice, Style +from rich.table import Table from rich.traceback import install from .config import Settings @@ -204,6 +206,49 @@ def run(): p=2, dim=1, ) + + if settings.print_refusal_geometry: + table = Table() + table.add_column("Layer", justify="right") + table.add_column("S(g,b)", justify="right") + table.add_column("S(g,r)", justify="right") + table.add_column("S(b,r)", justify="right") + table.add_column("|g|", justify="right") + table.add_column("|b|", justify="right") + table.add_column("|r|", justify="right") + + g = good_residuals.mean(dim=0) + b = bad_residuals.mean(dim=0) + r = b - g + + g_b_similarities = F.cosine_similarity(g, b, dim=-1) + g_r_similarities = F.cosine_similarity(g, r, dim=-1) + b_r_similarities = F.cosine_similarity(b, r, dim=-1) + + g_norms = LA.vector_norm(g, dim=-1) + b_norms = LA.vector_norm(b, dim=-1) + r_norms = LA.vector_norm(r, dim=-1) + + for layer_index in range(len(model.get_layers()) + 1): + table.add_row( + "embed" if layer_index == 0 else str(layer_index), + f"{g_b_similarities[layer_index].item():.4f}", + f"{g_r_similarities[layer_index].item():.4f}", + f"{b_r_similarities[layer_index].item():.4f}", + f"{g_norms[layer_index].item():.2f}", + f"{b_norms[layer_index].item():.2f}", + f"{r_norms[layer_index].item():.2f}", + ) + + print() + print("[bold]Refusal Geometry[/]") + print(table) + print("[bold]g[/] = mean residual vector for good prompts") + print("[bold]b[/] = mean residual vector for bad prompts") + print("[bold]r[/] = refusal direction (i.e., [bold]b - g[/])") + print("[bold]S(x,y)[/] = cosine similarity of [bold]x[/] and [bold]y[/]") + print("[bold]|x|[/] = L2 norm of [bold]x[/]") + # We don't need the residuals after computing refusal directions. del good_residuals, bad_residuals empty_cache()