Implement Magnitude-Preserving Orthogonal Ablation (#52)

* feat: add support for winsorizing the residuals Adds setting winsorization_quantile, expressed as the quantile to clamp to. - If set to a value below 1, the residuals obtained from evaluating the first token of the good and bad prompts are winsorized - that is, values outside the given quantile are clamped. Note that winsorization_quantile = 0.95 corresponds to a 90% winsorization. * feat: implement magnitude-preserving orthogonal ablation Adds boolean setting orthogonalize_direction: - When enabled, only the component of the refusal directions that is orthogonal to the harmless direction is subtracted during abliteration. Adds enum-valued setting row_normalization: - 'none': No normalization. - 'pre': Row-normalize the weight matrix before computing the LoRA adapter. - 'full': Like 'pre', but re-normalizes to preserve original row magnitudes. * prefer 'good' and 'bad' over 'harmless' and 'harmful' * clarify how winsorization is applied * store and reuse full peft_config * remove unneeded cast * make LoRA rank configurable for full normalization * explain why the singular values are split across the components
2026-02-02 12:35:19 +01:00
parent 42f5a9b553
commit 3525b1ac22
4 changed files with 147 additions and 23 deletions
@@ -34,6 +34,22 @@ max_batch_size = 128
 # Maximum number of tokens to generate for each response.
 max_response_length = 100

+# Whether to adjust the refusal directions so that only the component that is
+# orthogonal to the good direction is subtracted during abliteration.
+orthogonalize_direction = false
+
+# How to apply row normalization of the weights. Options:
+# 'none' (no normalization),
+# 'pre' (compute LoRA adapter relative to row-normalized weights),
+# 'full' (like 'pre', but re-normalizes to preserve original row magnitudes).
+row_normalization = "none"
+
+# The rank of the LoRA adapter to use when 'full' row normalization is used.
+# Row magnitude preservation is approximate due to non-linear effects,
+# and this determines the rank of that approximation. Higher ranks produce
+# larger output files and may slow down evaluation.
+full_normalization_lora_rank = 3
+
 # Whether to print prompt/response pairs when counting refusals.
 print_responses = false

@@ -60,6 +76,11 @@ kl_divergence_scale = 1.0
 # This helps prevent the sampler from extensively exploring parameter combinations that "do nothing".
 kl_divergence_target = 0.01

+# The symmetric winsorization to apply to each layer of the per-prompt residuals,
+# expressed as the quantile to clamp to (between 0 and 1). The default of 1.0 disables it.
+# Example: winsorization_quantile = 0.95 applies a 90% winsorization.
+winsorization_quantile = 1.0
+
 # Number of abliteration trials to run during optimization.
 n_trials = 200