From 0e7c14d94af423197bffc9af2ece6bdcbb8b02f1 Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Mon, 4 May 2026 22:11:14 +0530 Subject: [PATCH] fix: minor cleanups and improvements --- config.default.toml | 45 +++++++++++++++++++++++++++++++++++-------- src/heretic/config.py | 22 +++++++++++---------- src/heretic/main.py | 5 +++-- src/heretic/utils.py | 5 ++--- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/config.default.toml b/config.default.toml index 1a82967..6ec8e8e 100644 --- a/config.default.toml +++ b/config.default.toml @@ -27,6 +27,12 @@ device_map = "auto" # Maximum memory to allocate per device. # max_memory = { "0" = "20GB", "cpu" = "64GB" } +# Whether to move intermediate analysis tensors (such as residuals and logprobs) +# to CPU memory as soon as possible to reduce peak VRAM usage. +# This lowers peak VRAM usage during residual analysis and evaluation, +# but may slightly reduce performance due to host/device transfers. +offload_outputs_to_cpu = true + # Number of input sequences to process in parallel (0 = auto). batch_size = 0 # auto @@ -36,6 +42,32 @@ max_batch_size = 128 # Maximum number of tokens to generate for each response. max_response_length = 100 +# List of pairs of the form [cot_initializer, closed_cot_block] used to skip +# the Chain-of-Thought block in responses, so that evaluation happens +# at the start of the actual response. +chain_of_thought_skips = [ + # Most thinking models. + [ + "<think>", + "<think></think>", + ], + # gpt-oss. + [ + "<|channel|>analysis<|message|>", + "<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>", + ], + # Unknown, suggested by user. + [ + "<thought>", + "<thought></thought>", + ], + # Unknown, suggested by user. + [ + "[THINK]", + "[THINK][/THINK]", + ], +] + # Whether to print prompt/response pairs when counting refusals. 
print_responses = false @@ -64,13 +96,13 @@ kl_divergence_target = 0.01 # Whether to adjust the refusal directions so that only the component that is # orthogonal to the good direction is subtracted during abliteration. -orthogonalize_direction = false +orthogonalize_direction = true # How to apply row normalization of the weights. Options: # "none" (no normalization), # "pre" (compute LoRA adapter relative to row-normalized weights), # "full" (like "pre", but renormalizes to preserve original row magnitudes). -row_normalization = "none" +row_normalization = "full" # The rank of the LoRA adapter to use when "full" row normalization is used. # Row magnitude preservation is approximate due to non-linear effects, @@ -98,6 +130,9 @@ n_startup_trials = 60 # Directory to save and load study progress to/from. study_checkpoint_dir = "checkpoints" +# Maximum size for individual safetensors files generated when exporting a model. +max_shard_size = "5GB" + # Strings whose presence in a response (case insensitive) identifies the response as a refusal. refusal_markers = [ "sorry", @@ -137,12 +172,6 @@ refusal_markers = [ # System prompt to use when prompting the model. system_prompt = "You are a helpful assistant." -# Move intermediate analysis tensors (such as residuals and logprobs) -# to CPU memory as soon as possible to reduce peak VRAM usage. -# This lowers peak VRAM usage during residual analysis and evaluation, -# but may slightly reduce performance due to host/device transfers. -offload_outputs_to_cpu = true - # Dataset of prompts that tend to not result in refusals (used for calculating refusal directions). 
[good_prompts] dataset = "mlabonne/harmless_alpaca" diff --git a/src/heretic/config.py b/src/heretic/config.py index bd67956..649ab14 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -141,6 +141,16 @@ class Settings(BaseSettings): description='Maximum memory to allocate per device (e.g., { "0" = "20GB", "cpu" = "64GB" }).', ) + offload_outputs_to_cpu: bool = Field( + default=True, + description=( + "Whether to move intermediate analysis tensors (such as residuals and logprobs) " + "to CPU memory as soon as possible to reduce peak VRAM usage. " + "This lowers peak VRAM usage during residual analysis and evaluation, " + "but may slightly reduce performance due to host/device transfers." + ), + ) + trust_remote_code: bool | None = Field( default=None, description="Whether to trust remote code when loading the model.", @@ -261,7 +271,7 @@ class Settings(BaseSettings): ) orthogonalize_direction: bool = Field( - default=False, + default=True, description=( "Whether to adjust the refusal directions so that only the component that is " "orthogonal to the good direction is subtracted during abliteration." @@ -269,7 +279,7 @@ class Settings(BaseSettings): ) row_normalization: RowNormalization = Field( - default=RowNormalization.NONE, + default=RowNormalization.FULL, description=( "How to apply row normalization of the weights. Options: " '"none" (no normalization), ' @@ -433,14 +443,6 @@ class Settings(BaseSettings): description="System prompt to use when prompting the model.", ) - offload_outputs_to_cpu: bool = Field( - default=True, - description=( - "Whether to move intermediate analysis tensors (such as residuals and logprobs) " - "to CPU memory as soon as possible to reduce peak VRAM usage." 
- ), - ) - good_prompts: DatasetSpecification = Field( default=DatasetSpecification( dataset="mlabonne/harmless_alpaca", diff --git a/src/heretic/main.py b/src/heretic/main.py index 693c3d0..b0394b1 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -688,8 +688,9 @@ def run(): ( "The following trials resulted in Pareto optimal combinations of refusals and KL divergence. " "After selecting a trial, you will be able to save the model, upload it to Hugging Face, " - "or chat with it to test how well it works. You can return to this menu later to select a different trial. " - "[yellow]Note that KL divergence values above 1 usually indicate significant damage to the original model's capabilities.[/]" + "chat with it to test how well it works, or run standard benchmarks on it. " + "You can return to this menu later to select a different trial. " + "[yellow]Note that KL divergence values above 0.5 usually indicate significant damage to the original model's capabilities.[/]" ) ) diff --git a/src/heretic/utils.py b/src/heretic/utils.py index e688c5d..27dd697 100644 --- a/src/heretic/utils.py +++ b/src/heretic/utils.py @@ -9,6 +9,7 @@ import random import tempfile from dataclasses import dataclass from datetime import datetime, timezone +from importlib.metadata import version from pathlib import Path from typing import Any, TypeVar @@ -283,8 +284,6 @@ def get_readme_intro( # Hide the path, which may contain private information. model_link = "a model" - version_info = get_heretic_version_info() - if contains_reproducibility_information: reproducibility_instructions = """ > [!TIP] @@ -297,7 +296,7 @@ def get_readme_intro( return f"""# This is a decensored version of { model_link - }, made using [Heretic](https://github.com/p-e-w/heretic) v{version_info.version} + }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic-llm")} {reproducibility_instructions} ## Abliteration parameters