Add option to print refusal geometry

This commit is contained in:
Philipp Emanuel Weidmann
2025-11-22 13:18:54 +05:30
parent c35f3031f8
commit 83cbf0612a
3 changed files with 53 additions and 0 deletions
+3
View File
@@ -24,6 +24,9 @@ max_batch_size = 128
# Maximum number of tokens to generate for each response. # Maximum number of tokens to generate for each response.
max_response_length = 100 max_response_length = 100
# Whether to print detailed information about residuals and refusal directions after calculating them.
print_refusal_geometry = false
# Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models. # Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models.
# This is used to ensure balanced co-optimization of KL divergence and refusal count. # This is used to ensure balanced co-optimization of KL divergence and refusal count.
kl_divergence_scale = 1.0 kl_divergence_scale = 1.0
+5
View File
@@ -61,6 +61,11 @@ class Settings(BaseSettings):
description="Maximum number of tokens to generate for each response.", description="Maximum number of tokens to generate for each response.",
) )
print_refusal_geometry: bool = Field(
default=False,
description="Whether to print detailed information about residuals and refusal directions after calculating them.",
)
kl_divergence_scale: float = Field( kl_divergence_scale: float = Field(
default=1.0, default=1.0,
description=( description=(
+45
View File
@@ -13,6 +13,7 @@ import huggingface_hub
import optuna import optuna
import questionary import questionary
import torch import torch
import torch.linalg as LA
import torch.nn.functional as F import torch.nn.functional as F
import transformers import transformers
from accelerate.utils import ( from accelerate.utils import (
@@ -29,6 +30,7 @@ from optuna.samplers import TPESampler
from optuna.study import StudyDirection from optuna.study import StudyDirection
from pydantic import ValidationError from pydantic import ValidationError
from questionary import Choice, Style from questionary import Choice, Style
from rich.table import Table
from rich.traceback import install from rich.traceback import install
from .config import Settings from .config import Settings
@@ -204,6 +206,49 @@ def run():
p=2, p=2,
dim=1, dim=1,
) )
if settings.print_refusal_geometry:
    # Optional diagnostic: print a per-layer table comparing the mean residual
    # of the good prompts, the mean residual of the bad prompts, and their
    # difference (the refusal direction).

    # Averaging over dim 0 leaves one vector per row; the table below labels
    # row 0 "embed" and every later row with its index.
    # NOTE(review): assumes the residuals are laid out as
    # (prompt, layer, component) -- confirm against where they are computed.
    g = good_residuals.mean(dim=0)
    b = bad_residuals.mean(dim=0)
    r = b - g

    # Convert each statistic to a plain Python list once, instead of calling
    # .item() on a tensor element for every single table cell.
    similarity_columns = [
        F.cosine_similarity(x, y, dim=-1).tolist()
        for x, y in ((g, b), (g, r), (b, r))
    ]
    norm_columns = [LA.vector_norm(x, dim=-1).tolist() for x in (g, b, r)]

    table = Table()
    for header in ("Layer", "S(g,b)", "S(g,r)", "S(b,r)", "|g|", "|b|", "|r|"):
        table.add_column(header, justify="right")

    # One row more than the number of model layers (row 0 is "embed").
    row_count = len(model.get_layers()) + 1
    rows = zip(range(row_count), zip(*similarity_columns), zip(*norm_columns))

    for layer_index, similarities, norms in rows:
        table.add_row(
            "embed" if layer_index == 0 else str(layer_index),
            # Similarities are shown with 4 decimals, norms with 2.
            *(f"{value:.4f}" for value in similarities),
            *(f"{value:.2f}" for value in norms),
        )

    print()
    print("[bold]Refusal Geometry[/]")
    print(table)
    # Legend for the column headers used above.
    print("[bold]g[/] = mean residual vector for good prompts")
    print("[bold]b[/] = mean residual vector for bad prompts")
    print("[bold]r[/] = refusal direction (i.e., [bold]b - g[/])")
    print("[bold]S(x,y)[/] = cosine similarity of [bold]x[/] and [bold]y[/]")
    print("[bold]|x|[/] = L2 norm of [bold]x[/]")
# We don't need the residuals after computing refusal directions. # We don't need the residuals after computing refusal directions.
del good_residuals, bad_residuals del good_residuals, bad_residuals
empty_cache() empty_cache()