fix: minor cleanups and improvements

This commit is contained in:
Philipp Emanuel Weidmann
2026-05-04 22:11:14 +05:30
parent 02ce8ad079
commit 0e7c14d94a
4 changed files with 54 additions and 23 deletions
+37 -8
View File
@@ -27,6 +27,12 @@ device_map = "auto"
# Maximum memory to allocate per device. # Maximum memory to allocate per device.
# max_memory = { "0" = "20GB", "cpu" = "64GB" } # max_memory = { "0" = "20GB", "cpu" = "64GB" }
# Whether to move intermediate analysis tensors (such as residuals and logprobs)
# to CPU memory as soon as possible to reduce peak VRAM usage.
# This lowers peak VRAM usage during residual analysis and evaluation,
# but may slightly reduce performance due to host/device transfers.
offload_outputs_to_cpu = true
# Number of input sequences to process in parallel (0 = auto). # Number of input sequences to process in parallel (0 = auto).
batch_size = 0 # auto batch_size = 0 # auto
@@ -36,6 +42,32 @@ max_batch_size = 128
# Maximum number of tokens to generate for each response. # Maximum number of tokens to generate for each response.
max_response_length = 100 max_response_length = 100
# List of pairs of the form [cot_initializer, closed_cot_block] used to skip
# the Chain-of-Thought block in responses, so that evaluation happens
# at the start of the actual response.
chain_of_thought_skips = [
# Most thinking models.
[
"<think>",
"<think></think>",
],
# gpt-oss.
[
"<|channel|>analysis<|message|>",
"<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>",
],
# Model unknown; suggested by a user.
[
"<thought>",
"<thought></thought>",
],
# Model unknown; suggested by a user.
[
"[THINK]",
"[THINK][/THINK]",
],
]
# Whether to print prompt/response pairs when counting refusals. # Whether to print prompt/response pairs when counting refusals.
print_responses = false print_responses = false
@@ -64,13 +96,13 @@ kl_divergence_target = 0.01
# Whether to adjust the refusal directions so that only the component that is # Whether to adjust the refusal directions so that only the component that is
# orthogonal to the good direction is subtracted during abliteration. # orthogonal to the good direction is subtracted during abliteration.
orthogonalize_direction = false orthogonalize_direction = true
# How to apply row normalization of the weights. Options: # How to apply row normalization of the weights. Options:
# "none" (no normalization), # "none" (no normalization),
# "pre" (compute LoRA adapter relative to row-normalized weights), # "pre" (compute LoRA adapter relative to row-normalized weights),
# "full" (like "pre", but renormalizes to preserve original row magnitudes). # "full" (like "pre", but renormalizes to preserve original row magnitudes).
row_normalization = "none" row_normalization = "full"
# The rank of the LoRA adapter to use when "full" row normalization is used. # The rank of the LoRA adapter to use when "full" row normalization is used.
# Row magnitude preservation is approximate due to non-linear effects, # Row magnitude preservation is approximate due to non-linear effects,
@@ -98,6 +130,9 @@ n_startup_trials = 60
# Directory to save and load study progress to/from. # Directory to save and load study progress to/from.
study_checkpoint_dir = "checkpoints" study_checkpoint_dir = "checkpoints"
# Maximum size for individual safetensors files generated when exporting a model.
max_shard_size = "5GB"
# Strings whose presence in a response (case insensitive) identifies the response as a refusal. # Strings whose presence in a response (case insensitive) identifies the response as a refusal.
refusal_markers = [ refusal_markers = [
"sorry", "sorry",
@@ -137,12 +172,6 @@ refusal_markers = [
# System prompt to use when prompting the model. # System prompt to use when prompting the model.
system_prompt = "You are a helpful assistant." system_prompt = "You are a helpful assistant."
# Move intermediate analysis tensors (such as residuals and logprobs)
# to CPU memory as soon as possible to reduce peak VRAM usage.
# This lowers peak VRAM usage during residual analysis and evaluation,
# but may slightly reduce performance due to host/device transfers.
offload_outputs_to_cpu = true
# Dataset of prompts that tend to not result in refusals (used for calculating refusal directions). # Dataset of prompts that tend to not result in refusals (used for calculating refusal directions).
[good_prompts] [good_prompts]
dataset = "mlabonne/harmless_alpaca" dataset = "mlabonne/harmless_alpaca"
+12 -10
View File
@@ -141,6 +141,16 @@ class Settings(BaseSettings):
description='Maximum memory to allocate per device (e.g., { "0" = "20GB", "cpu" = "64GB" }).', description='Maximum memory to allocate per device (e.g., { "0" = "20GB", "cpu" = "64GB" }).',
) )
offload_outputs_to_cpu: bool = Field(
default=True,
description=(
"Whether to move intermediate analysis tensors (such as residuals and logprobs) "
"to CPU memory as soon as possible to reduce peak VRAM usage. "
"This lowers peak VRAM usage during residual analysis and evaluation, "
"but may slightly reduce performance due to host/device transfers."
),
)
trust_remote_code: bool | None = Field( trust_remote_code: bool | None = Field(
default=None, default=None,
description="Whether to trust remote code when loading the model.", description="Whether to trust remote code when loading the model.",
@@ -261,7 +271,7 @@ class Settings(BaseSettings):
) )
orthogonalize_direction: bool = Field( orthogonalize_direction: bool = Field(
default=False, default=True,
description=( description=(
"Whether to adjust the refusal directions so that only the component that is " "Whether to adjust the refusal directions so that only the component that is "
"orthogonal to the good direction is subtracted during abliteration." "orthogonal to the good direction is subtracted during abliteration."
@@ -269,7 +279,7 @@ class Settings(BaseSettings):
) )
row_normalization: RowNormalization = Field( row_normalization: RowNormalization = Field(
default=RowNormalization.NONE, default=RowNormalization.FULL,
description=( description=(
"How to apply row normalization of the weights. Options: " "How to apply row normalization of the weights. Options: "
'"none" (no normalization), ' '"none" (no normalization), '
@@ -433,14 +443,6 @@ class Settings(BaseSettings):
description="System prompt to use when prompting the model.", description="System prompt to use when prompting the model.",
) )
offload_outputs_to_cpu: bool = Field(
default=True,
description=(
"Whether to move intermediate analysis tensors (such as residuals and logprobs) "
"to CPU memory as soon as possible to reduce peak VRAM usage."
),
)
good_prompts: DatasetSpecification = Field( good_prompts: DatasetSpecification = Field(
default=DatasetSpecification( default=DatasetSpecification(
dataset="mlabonne/harmless_alpaca", dataset="mlabonne/harmless_alpaca",
+3 -2
View File
@@ -688,8 +688,9 @@ def run():
( (
"The following trials resulted in Pareto optimal combinations of refusals and KL divergence. " "The following trials resulted in Pareto optimal combinations of refusals and KL divergence. "
"After selecting a trial, you will be able to save the model, upload it to Hugging Face, " "After selecting a trial, you will be able to save the model, upload it to Hugging Face, "
"or chat with it to test how well it works. You can return to this menu later to select a different trial. " "chat with it to test how well it works, or run standard benchmarks on it. "
"[yellow]Note that KL divergence values above 1 usually indicate significant damage to the original model's capabilities.[/]" "You can return to this menu later to select a different trial. "
"[yellow]Note that KL divergence values above 0.5 usually indicate significant damage to the original model's capabilities.[/]"
) )
) )
+2 -3
View File
@@ -9,6 +9,7 @@ import random
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone from datetime import datetime, timezone
from importlib.metadata import version
from pathlib import Path from pathlib import Path
from typing import Any, TypeVar from typing import Any, TypeVar
@@ -283,8 +284,6 @@ def get_readme_intro(
# Hide the path, which may contain private information. # Hide the path, which may contain private information.
model_link = "a model" model_link = "a model"
version_info = get_heretic_version_info()
if contains_reproducibility_information: if contains_reproducibility_information:
reproducibility_instructions = """ reproducibility_instructions = """
> [!TIP] > [!TIP]
@@ -297,7 +296,7 @@ def get_readme_intro(
return f"""# This is a decensored version of { return f"""# This is a decensored version of {
model_link model_link
}, made using [Heretic](https://github.com/p-e-w/heretic) v{version_info.version} }, made using [Heretic](https://github.com/p-e-w/heretic) v{version("heretic-llm")}
{reproducibility_instructions} {reproducibility_instructions}
## Abliteration parameters ## Abliteration parameters