Add option to print refusal geometry
This commit is contained in:
@@ -24,6 +24,9 @@ max_batch_size = 128
|
||||
# Maximum number of tokens to generate for each response.
|
||||
max_response_length = 100
|
||||
|
||||
# Whether to print detailed information about residuals and refusal directions after calculating them.
|
||||
print_refusal_geometry = false
|
||||
|
||||
# Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models.
|
||||
# This is used to ensure balanced co-optimization of KL divergence and refusal count.
|
||||
kl_divergence_scale = 1.0
|
||||
|
||||
@@ -61,6 +61,11 @@ class Settings(BaseSettings):
|
||||
description="Maximum number of tokens to generate for each response.",
|
||||
)
|
||||
|
||||
print_refusal_geometry: bool = Field(
|
||||
default=False,
|
||||
description="Whether to print detailed information about residuals and refusal directions after calculating them.",
|
||||
)
|
||||
|
||||
kl_divergence_scale: float = Field(
|
||||
default=1.0,
|
||||
description=(
|
||||
|
||||
@@ -13,6 +13,7 @@ import huggingface_hub
|
||||
import optuna
|
||||
import questionary
|
||||
import torch
|
||||
import torch.linalg as LA
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from accelerate.utils import (
|
||||
@@ -29,6 +30,7 @@ from optuna.samplers import TPESampler
|
||||
from optuna.study import StudyDirection
|
||||
from pydantic import ValidationError
|
||||
from questionary import Choice, Style
|
||||
from rich.table import Table
|
||||
from rich.traceback import install
|
||||
|
||||
from .config import Settings
|
||||
@@ -204,6 +206,49 @@ def run():
|
||||
p=2,
|
||||
dim=1,
|
||||
)
|
||||
|
||||
if settings.print_refusal_geometry:
|
||||
table = Table()
|
||||
table.add_column("Layer", justify="right")
|
||||
table.add_column("S(g,b)", justify="right")
|
||||
table.add_column("S(g,r)", justify="right")
|
||||
table.add_column("S(b,r)", justify="right")
|
||||
table.add_column("|g|", justify="right")
|
||||
table.add_column("|b|", justify="right")
|
||||
table.add_column("|r|", justify="right")
|
||||
|
||||
g = good_residuals.mean(dim=0)
|
||||
b = bad_residuals.mean(dim=0)
|
||||
r = b - g
|
||||
|
||||
g_b_similarities = F.cosine_similarity(g, b, dim=-1)
|
||||
g_r_similarities = F.cosine_similarity(g, r, dim=-1)
|
||||
b_r_similarities = F.cosine_similarity(b, r, dim=-1)
|
||||
|
||||
g_norms = LA.vector_norm(g, dim=-1)
|
||||
b_norms = LA.vector_norm(b, dim=-1)
|
||||
r_norms = LA.vector_norm(r, dim=-1)
|
||||
|
||||
for layer_index in range(len(model.get_layers()) + 1):
|
||||
table.add_row(
|
||||
"embed" if layer_index == 0 else str(layer_index),
|
||||
f"{g_b_similarities[layer_index].item():.4f}",
|
||||
f"{g_r_similarities[layer_index].item():.4f}",
|
||||
f"{b_r_similarities[layer_index].item():.4f}",
|
||||
f"{g_norms[layer_index].item():.2f}",
|
||||
f"{b_norms[layer_index].item():.2f}",
|
||||
f"{r_norms[layer_index].item():.2f}",
|
||||
)
|
||||
|
||||
print()
|
||||
print("[bold]Refusal Geometry[/]")
|
||||
print(table)
|
||||
print("[bold]g[/] = mean residual vector for good prompts")
|
||||
print("[bold]b[/] = mean residual vector for bad prompts")
|
||||
print("[bold]r[/] = refusal direction (i.e., [bold]b - g[/])")
|
||||
print("[bold]S(x,y)[/] = cosine similarity of [bold]x[/] and [bold]y[/]")
|
||||
print("[bold]|x|[/] = L2 norm of [bold]x[/]")
|
||||
|
||||
# We don't need the residuals after computing refusal directions.
|
||||
del good_residuals, bad_residuals
|
||||
empty_cache()
|
||||
|
||||
Reference in New Issue
Block a user