diff --git a/config.default.toml b/config.default.toml index e4af86f..abfa0fc 100644 --- a/config.default.toml +++ b/config.default.toml @@ -1,4 +1,5 @@ -# Copy this file to config.toml and edit the configuration to your liking. +# Rename this file to config.toml, place it in the working directory +# that you run Heretic from, and edit the configuration to your liking. # List of PyTorch dtypes to try when loading model tensors. # If loading with a dtype fails, the next dtype in the list will be tried. @@ -77,9 +78,11 @@ row_normalization = "none" # larger output files and may slow down evaluation. full_normalization_lora_rank = 3 -# The symmetric winsorization to apply to each layer of the per-prompt residuals, +# The symmetric winsorization to apply to the per-prompt, per-layer residual vectors, # expressed as the quantile to clamp to (between 0 and 1). Disabled by default. -# Example: winsorization_quantile = 0.95 applies a 95% winsorization. +# This can tame so-called "massive activations" that occur in some models. +# Example: winsorization_quantile = 0.95 computes the 0.95-quantile of the absolute values +# of the components, then clamps the magnitudes of all components to that quantile. winsorization_quantile = 1.0 # Number of abliteration trials to run during optimization. diff --git a/config.noslop.toml b/config.noslop.toml index 71980cd..0eae39b 100644 --- a/config.noslop.toml +++ b/config.noslop.toml @@ -1,4 +1,5 @@ -# Copy this file to config.toml and edit the configuration to your liking. +# Rename this file to config.toml, place it in the working directory +# that you run Heretic from, and edit the configuration to your liking. max_response_length = 300 diff --git a/src/heretic/config.py b/src/heretic/config.py index 39c3931..8ed3f80 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -207,9 +207,11 @@ class Settings(BaseSettings): winsorization_quantile: float = Field( default=1.0, description=( - "The symmetric winsorization to apply to each layer of the per-prompt residuals, " + "The symmetric winsorization to apply to the per-prompt, per-layer residual vectors, " "expressed as the quantile to clamp to (between 0 and 1). Disabled by default. " - "Example: winsorization_quantile = 0.95 applies a 95% winsorization." + 'This can tame so-called "massive activations" that occur in some models. ' + "Example: winsorization_quantile = 0.95 computes the 0.95-quantile of the absolute values " + "of the components, then clamps the magnitudes of all components to that quantile." ), )