feat: add option to plot residual vectors

This commit is contained in:
Philipp Emanuel Weidmann
2025-12-04 14:22:29 +05:30
parent d836fb2da9
commit eeb28b28c1
6 changed files with 1170 additions and 49 deletions
+18 -2
View File
@@ -24,8 +24,20 @@ max_batch_size = 128
# Maximum number of tokens to generate for each response.
max_response_length = 100
# Whether to print detailed information about residuals and refusal directions after calculating them.
# NOTE(review): this key and `print_residual_geometry` below carry near-identical descriptions and look like
# the old and new names of a single option — confirm which name the config loader reads.
print_refusal_geometry = false
# Whether to print detailed information about residuals and refusal directions.
print_residual_geometry = false
# Whether to generate plots showing PaCMAP projections of residual vectors.
# Plotting is switched off in this configuration.
plot_residuals = false
# Base path to save plots of residual vectors to.
# NOTE(review): "plots" is a relative path; what it is resolved against (presumably the current working
# directory) is not visible here — confirm against the code that writes the plots.
residual_plot_path = "plots"
# Title placed above plots of residual vectors.
# Written as a TOML literal (single-quoted) string so the embedded double quotes need no escaping.
residual_plot_title = 'PaCMAP Projection of Residual Vectors for "Harmless" and "Harmful" Prompts'
# Matplotlib style sheet to use for plots of residual vectors.
# "dark_background" is one of Matplotlib's built-in style sheet names.
residual_plot_style = "dark_background"
# Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models.
# This is used to ensure balanced co-optimization of KL divergence and refusal count.
@@ -81,12 +93,16 @@ system_prompt = "You are a helpful assistant."
dataset = "mlabonne/harmless_alpaca"
split = "train[:400]"
column = "text"
residual_plot_label = '"Harmless" prompts'
residual_plot_color = "royalblue"
# Dataset of prompts that tend to result in refusals (used for calculating refusal directions).
[bad_prompts]
dataset = "mlabonne/harmful_behaviors"
split = "train[:400]"
column = "text"
# Label and color associated with this dataset in plots of residual vectors.
# NOTE(review): presumably the legend label and the marker color (a Matplotlib named color) — confirm
# against the plotting code.
# The label is a TOML literal (single-quoted) string so its embedded double quotes need no escaping.
residual_plot_label = '"Harmful" prompts'
residual_plot_color = "darkorange"
# Dataset of prompts that tend to not result in refusals (used for evaluating model performance).
[good_evaluation_prompts]