From 0e7c14d94af423197bffc9af2ece6bdcbb8b02f1 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Mon, 4 May 2026 22:11:14 +0530
Subject: [PATCH] fix: minor cleanups and improvements

---
 config.default.toml   | 45 +++++++++++++++++++++++++++++++++++--------
 src/heretic/config.py | 22 +++++++++++----------
 src/heretic/main.py   |  5 +++--
 src/heretic/utils.py  |  5 ++---
 4 files changed, 54 insertions(+), 23 deletions(-)
diff --git a/config.default.toml b/config.default.toml
index 1a82967..6ec8e8e 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -27,6 +27,12 @@ device_map = "auto"
 # Maximum memory to allocate per device.
 # max_memory = { "0" = "20GB", "cpu" = "64GB" }
 
+# Whether to move intermediate analysis tensors (such as residuals and logprobs)
+# to CPU memory as soon as possible to reduce peak VRAM usage.
+# This lowers peak VRAM usage during residual analysis and evaluation,
+# but may slightly reduce performance due to host/device transfers.
+offload_outputs_to_cpu = true
+
 # Number of input sequences to process in parallel (0 = auto).
 batch_size = 0  # auto
 
@@ -36,6 +42,32 @@ max_batch_size = 128
 # Maximum number of tokens to generate for each response.
 max_response_length = 100
 
+# List of pairs of the form [cot_initializer, closed_cot_block] used to skip
+# the Chain-of-Thought block in responses, so that evaluation happens
+# at the start of the actual response.
+chain_of_thought_skips = [
+    # Most thinking models.
+    [
+        "<think>",
+        "<think></think>",
+    ],
+    # gpt-oss.
+    [
+        "<|channel|>analysis<|message|>",
+        "<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>",
+    ],
+    # Unknown model; suggested by a user.
+    [
+        "<thought>",
+        "<thought></thought>",
+    ],
+    # Unknown model; suggested by a user.
+    [
+        "[THINK]",
+        "[THINK][/THINK]",
+    ],
+]
+
 # Whether to print prompt/response pairs when counting refusals.
 print_responses = false
 
@@ -64,13 +96,13 @@ kl_divergence_target = 0.01
 
 # Whether to adjust the refusal directions so that only the component that is
 # orthogonal to the good direction is subtracted during abliteration.
-orthogonalize_direction = false
+orthogonalize_direction = true
 
 # How to apply row normalization of the weights. Options:
 # "none" (no normalization),
 # "pre" (compute LoRA adapter relative to row-normalized weights),
 # "full" (like "pre", but renormalizes to preserve original row magnitudes).
-row_normalization = "none"
+row_normalization = "full"
 
 # The rank of the LoRA adapter to use when "full" row normalization is used.
 # Row magnitude preservation is approximate due to non-linear effects,
@@ -98,6 +130,9 @@ n_startup_trials = 60
 # Directory to save and load study progress to/from.
 study_checkpoint_dir = "checkpoints"
 
+# Maximum size for individual safetensors files generated when exporting a model.
+max_shard_size = "5GB"
+
 # Strings whose presence in a response (case insensitive) identifies the response as a refusal.
 refusal_markers = [
     "sorry",
@@ -137,12 +172,6 @@ refusal_markers = [
 # System prompt to use when prompting the model.
 system_prompt = "You are a helpful assistant."
 
-# Move intermediate analysis tensors (such as residuals and logprobs)
-# to CPU memory as soon as possible to reduce peak VRAM usage.
-# This lowers peak VRAM usage during residual analysis and evaluation,
-# but may slightly reduce performance due to host/device transfers.
-offload_outputs_to_cpu = true
-
 # Dataset of prompts that tend to not result in refusals (used for calculating refusal directions).
 [good_prompts]
 dataset = "mlabonne/harmless_alpaca"
diff --git a/src/heretic/config.py b/src/heretic/config.py
index bd67956..649ab14 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -141,6 +141,16 @@ class Settings(BaseSettings):
         description='Maximum memory to allocate per device (e.g., { "0" = "20GB", "cpu" = "64GB" }).',
     )
 
+    offload_outputs_to_cpu: bool = Field(
+        default=True,
+        description=(
+            "Whether to move intermediate analysis tensors (such as residuals and logprobs) "
+            "to CPU memory as soon as possible to reduce peak VRAM usage. "
+            "This lowers peak VRAM usage during residual analysis and evaluation, "
+            "but may slightly reduce performance due to host/device transfers."
+        ),
+    )
+
     trust_remote_code: bool | None = Field(
         default=None,
         description="Whether to trust remote code when loading the model.",
@@ -261,7 +271,7 @@ class Settings(BaseSettings):
     )
 
     orthogonalize_direction: bool = Field(
-        default=False,
+        default=True,
         description=(
             "Whether to adjust the refusal directions so that only the component that is "
             "orthogonal to the good direction is subtracted during abliteration."
@@ -269,7 +279,7 @@ class Settings(BaseSettings):
     )
 
     row_normalization: RowNormalization = Field(
-        default=RowNormalization.NONE,
+        default=RowNormalization.FULL,
         description=(
             "How to apply row normalization of the weights. Options: "
             '"none" (no normalization), '
@@ -433,14 +443,6 @@ class Settings(BaseSettings):
         description="System prompt to use when prompting the model.",
     )
 
-    offload_outputs_to_cpu: bool = Field(
-        default=True,
-        description=(
-            "Whether to move intermediate analysis tensors (such as residuals and logprobs) "
-            "to CPU memory as soon as possible to reduce peak VRAM usage."
-        ),
-    )
-
     good_prompts: DatasetSpecification = Field(
         default=DatasetSpecification(
             dataset="mlabonne/harmless_alpaca",
diff --git a/src/heretic/main.py b/src/heretic/main.py
index 693c3d0..b0394b1 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -688,8 +688,9 @@ def run():
             (
                 "The following trials resulted in Pareto optimal combinations of refusals and KL divergence. "
                 "After selecting a trial, you will be able to save the model, upload it to Hugging Face, "
-                "or chat with it to test how well it works. You can return to this menu later to select a different trial. "
-                "[yellow]Note that KL divergence values above 1 usually indicate significant damage to the original model's capabilities.[/]"
+                "chat with it to test how well it works, or run standard benchmarks on it. "
+                "You can return to this menu later to select a different trial. "
+                "[yellow]Note that KL divergence values above 0.5 usually indicate significant damage to the original model's capabilities.[/]"
             )
         )
 
diff --git a/src/heretic/utils.py b/src/heretic/utils.py
index e688c5d..27dd697 100644
--- a/src/heretic/utils.py
+++ b/src/heretic/utils.py
@@ -9,6 +9,7 @@ import random
 import tempfile
 from dataclasses import dataclass
 from datetime import datetime, timezone
+from importlib.metadata import version
 from pathlib import Path
 from typing import Any, TypeVar
 
@@ -283,8 +284,6 @@ def get_readme_intro(
         # Hide the path, which may contain private information.
         model_link = "a model"
 
-    version_info = get_heretic_version_info()
-
     if contains_reproducibility_information:
         reproducibility_instructions = """
 > [!TIP]
@@ -297,7 +296,7 @@ def get_readme_intro(
 
     return f"""# This is a decensored version of {
         model_link
-    }, made using [Heretic](https://github.com/p-e-w/heretic) v{version_info.version}
+    }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic-llm")}
 {reproducibility_instructions}
 ## Abliteration parameters