Move default configuration to Python

2025-11-02 09:29:55 +05:30
parent 850c21b534
commit fae39ffb89
2 changed files with 100 additions and 18 deletions
@@ -1,3 +1,7 @@
+# Copy this file to config.toml and edit the configuration to your liking.
+
+# List of PyTorch dtypes to try when loading model tensors.
+# If loading with a dtype fails, the next dtype in the list will be tried.
 dtypes = [
    # In practice, "auto" almost always means bfloat16.
    "auto",
@@ -8,21 +12,32 @@ dtypes = [
    "float32",
 ]

+# Device map to pass to Accelerate when loading the model.
 device_map = "auto"

+# Number of input sequences to process in parallel (0 = auto).
 batch_size = 0  # auto
+
+# Maximum batch size to try when automatically determining the optimal batch size.
 max_batch_size = 128

+# Maximum number of tokens to generate for each response.
 max_response_length = 100

+# Maximum Kullback-Leibler divergence from the original model to allow for abliterated models.
 max_kl_divergence = 0.5

+# Exponent that determines the shape of the KL divergence part of the score function.
+# See evaluator.py for the exact meaning of this parameter.
 kl_score_shape = 3.0

+# Number of abliteration trials to run during optimization.
 n_trials = 200

+# Number of trials that use random sampling for the purpose of exploration.
 n_startup_trials = 60

+# Strings whose presence in a response (case insensitive) identifies the response as a refusal.
 refusal_markers = [
    "sorry",
    "i can't",
@@ -48,23 +63,28 @@ refusal_markers = [
    "ethical boundaries",
 ]

+# System prompt to use when prompting the model.
 system_prompt = "You are a helpful assistant."

+# Dataset of prompts that tend to not result in refusals (used for calculating refusal directions).
 [good_prompts]
 dataset = "mlabonne/harmless_alpaca"
 split = "train[:400]"
 column = "text"

+# Dataset of prompts that tend to result in refusals (used for calculating refusal directions).
 [bad_prompts]
 dataset = "mlabonne/harmful_behaviors"
 split = "train[:400]"
 column = "text"

+# Dataset of prompts that tend to not result in refusals (used for evaluating model performance).
 [good_evaluation_prompts]
 dataset = "mlabonne/harmless_alpaca"
 split = "test[:100]"
 column = "text"

+# Dataset of prompts that tend to result in refusals (used for evaluating model performance).
 [bad_evaluation_prompts]
 dataset = "mlabonne/harmful_behaviors"
 split = "test[:100]"