feat: add option to plot residual vectors
This commit is contained in:
+18
-2
@@ -24,8 +24,20 @@ max_batch_size = 128
|
||||
# Maximum number of tokens to generate for each response.
|
||||
max_response_length = 100
|
||||
|
||||
# Whether to print detailed information about residuals and refusal directions after calculating them.
|
||||
print_refusal_geometry = false
|
||||
# Whether to print detailed information about residuals and refusal directions.
|
||||
print_residual_geometry = false
|
||||
|
||||
# Whether to generate plots showing PaCMAP projections of residual vectors.
|
||||
plot_residuals = false
|
||||
|
||||
# Base path where plots of residual vectors are saved.
|
||||
residual_plot_path = "plots"
|
||||
|
||||
# Title placed above plots of residual vectors.
|
||||
residual_plot_title = 'PaCMAP Projection of Residual Vectors for "Harmless" and "Harmful" Prompts'
|
||||
|
||||
# Matplotlib style sheet to use for plots of residual vectors.
|
||||
residual_plot_style = "dark_background"
|
||||
|
||||
# Assumed "typical" value of the Kullback-Leibler divergence from the original model for abliterated models.
|
||||
# This is used to ensure balanced co-optimization of KL divergence and refusal count.
|
||||
@@ -81,12 +93,16 @@ system_prompt = "You are a helpful assistant."
|
||||
dataset = "mlabonne/harmless_alpaca"
|
||||
split = "train[:400]"
|
||||
column = "text"
|
||||
residual_plot_label = '"Harmless" prompts'
|
||||
residual_plot_color = "royalblue"
|
||||
|
||||
# Dataset of prompts that tend to result in refusals (used for calculating refusal directions).
|
||||
[bad_prompts]
|
||||
dataset = "mlabonne/harmful_behaviors"
|
||||
split = "train[:400]"
|
||||
column = "text"
|
||||
residual_plot_label = '"Harmful" prompts'
|
||||
residual_plot_color = "darkorange"
|
||||
|
||||
# Dataset of prompts that tend to not result in refusals (used for evaluating model performance).
|
||||
[good_evaluation_prompts]
|
||||
|
||||
Reference in New Issue
Block a user