Implement Magnitude-Preserving Orthogonal Ablation (#52)
* feat: add support for winsorizing the residuals Adds setting winsorization_quantile, expressed as the quantile to clamp to. - If set to a value below 1, the residuals obtained from evaluating the first token of the good and bad prompts are winsorized - that is, values outside the given quantile are clamped. Note that winsorization_quantile = 0.95 corresponds to a 90% winsorization. * feat: implement magnitude-preserving orthogonal ablation Adds boolean setting orthogonalize_direction: - When enabled, only the component of the refusal directions that is orthogonal to the harmless direction is subtracted during abliteration. Adds enum-valued setting row_normalization: - 'none': No normalization. - 'pre': Row-normalize the weight matrix before computing the LoRA adapter. - 'full': Like 'pre', but re-normalizes to preserve original row magnitudes. * prefer 'good' and 'bad' over 'harmless' and 'harmful' * clarify how winsorization is applied * store and reuse full peft_config * remove unneeded cast * make LoRA rank configurable for full normalization * explain why the singular values are split across the components
This commit is contained in:
@@ -34,6 +34,22 @@ max_batch_size = 128
|
||||
# Maximum number of tokens to generate for each response.
|
||||
max_response_length = 100
|
||||
|
||||
# Whether to adjust the refusal directions so that only the component that is
|
||||
# orthogonal to the good direction is subtracted during abliteration.
|
||||
orthogonalize_direction = false
|
||||
|
||||
# How to apply row normalization of the weights. Options:
|
||||
# 'none' (no normalization),
|
||||
# 'pre' (compute LoRA adapter relative to row-normalized weights),
|
||||
# 'full' (like 'pre', but re-normalizes to preserve original row magnitudes).
|
||||
row_normalization = "none"
|
||||
|
||||
# The rank of the LoRA adapter to use when 'full' row normalization is used.
|
||||
# Row magnitude preservation is approximate due to non-linear effects,
|
||||
# and this determines the rank of that approximation. Higher ranks produce
|
||||
# larger output files and may slow down evaluation.
|
||||
full_normalization_lora_rank = 3
|
||||
|
||||
# Whether to print prompt/response pairs when counting refusals.
|
||||
print_responses = false
|
||||
|
||||
@@ -60,6 +76,11 @@ kl_divergence_scale = 1.0
|
||||
# This helps prevent the sampler from extensively exploring parameter combinations that "do nothing".
|
||||
kl_divergence_target = 0.01
|
||||
|
||||
# The symmetric winsorization to apply to each layer of the per-prompt residuals,
|
||||
# expressed as the quantile to clamp to (between 0 and 1). The default of 1.0 disables winsorization.
|
||||
# Example: winsorization_quantile = 0.95 applies a 90% winsorization.
|
||||
winsorization_quantile = 1.0
|
||||
|
||||
# Number of abliteration trials to run during optimization.
|
||||
n_trials = 200
|
||||
|
||||
|
||||
Reference in New Issue
Block a user