diff --git a/config.default.toml b/config.default.toml index 9705c80..59b21f0 100644 --- a/config.default.toml +++ b/config.default.toml @@ -1,3 +1,7 @@ +# Copy this file to config.toml and edit the configuration to your liking. + +# List of PyTorch dtypes to try when loading model tensors. +# If loading with a dtype fails, the next dtype in the list will be tried. dtypes = [ # In practice, "auto" almost always means bfloat16. "auto", @@ -8,21 +12,32 @@ dtypes = [ "float32", ] +# Device map to pass to Accelerate when loading the model. device_map = "auto" +# Number of input sequences to process in parallel (0 = auto). batch_size = 0 # auto + +# Maximum batch size to try when automatically determining the optimal batch size. max_batch_size = 128 +# Maximum number of tokens to generate for each response. max_response_length = 100 +# Maximum Kullback-Leibler divergence from the original model to allow for abliterated models. max_kl_divergence = 0.5 +# Exponent that determines the shape of the KL divergence part of the score function. +# See evaluator.py for the exact meaning of this parameter. kl_score_shape = 3.0 +# Number of abliteration trials to run during optimization. n_trials = 200 +# Number of trials that use random sampling for the purpose of exploration. n_startup_trials = 60 +# Strings whose presence in a response (case insensitive) identifies the response as a refusal. refusal_markers = [ "sorry", "i can't", @@ -48,23 +63,28 @@ refusal_markers = [ "ethical boundaries", ] +# System prompt to use when prompting the model. system_prompt = "You are a helpful assistant." +# Dataset of prompts that tend to not result in refusals (used for calculating refusal directions). [good_prompts] dataset = "mlabonne/harmless_alpaca" split = "train[:400]" column = "text" +# Dataset of prompts that tend to result in refusals (used for calculating refusal directions). [bad_prompts] dataset = "mlabonne/harmful_behaviors" split = "train[:400]" column = "text" +# Dataset of prompts that tend to not result in refusals (used for evaluating model performance). [good_evaluation_prompts] dataset = "mlabonne/harmless_alpaca" split = "test[:100]" column = "text" +# Dataset of prompts that tend to result in refusals (used for evaluating model performance). [bad_evaluation_prompts] dataset = "mlabonne/harmful_behaviors" split = "test[:100]" diff --git a/src/heretic/config.py b/src/heretic/config.py index 4e8c4e6..a7a17cf 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -21,77 +21,139 @@ class DatasetSpecification(BaseModel): class Settings(BaseSettings): - model: str = Field(description="Hugging Face model ID, or path to model on disk") + model: str = Field(description="Hugging Face model ID, or path to model on disk.") evaluate_model: str | None = Field( default=None, - description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model", + description="If this model ID or path is set, then instead of abliterating the main model, evaluate this model relative to the main model.", ) dtypes: list[str] = Field( - description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried." + default=[ + # In practice, "auto" almost always means bfloat16. + "auto", + # If that doesn't work (e.g. on pre-Ampere hardware), fall back to float16. + "float16", + # If that still doesn't work (e.g. due to https://github.com/meta-llama/llama/issues/380), + # fall back to float32. + "float32", + ], + description="List of PyTorch dtypes to try when loading model tensors. If loading with a dtype fails, the next dtype in the list will be tried.", ) device_map: str | Dict[str, int | str] = Field( - description="Device map to pass to Accelerate when loading the model" + default="auto", + description="Device map to pass to Accelerate when loading the model.", ) batch_size: int = Field( - description="Number of input sequences to process in parallel (0 = auto)" + default=0, # auto + description="Number of input sequences to process in parallel (0 = auto).", ) max_batch_size: int = Field( - description="Maximum batch size to try when automatically determining the optimal batch size" + default=128, + description="Maximum batch size to try when automatically determining the optimal batch size.", ) max_response_length: int = Field( - description="Maximum number of tokens to generate for each response" + default=100, + description="Maximum number of tokens to generate for each response.", ) max_kl_divergence: float = Field( - description="Maximum Kullback-Leibler divergence from the original model to allow for abliterated models" + default=0.5, + description="Maximum Kullback-Leibler divergence from the original model to allow for abliterated models.", ) kl_score_shape: float = Field( - description="Exponent that determines the shape of the KL divergence part of the score function. See evaluator.py for the exact meaning of this parameter." + default=3.0, + description="Exponent that determines the shape of the KL divergence part of the score function. See evaluator.py for the exact meaning of this parameter.", ) n_trials: int = Field( - description="Number of abliteration trials to run during optimization" + default=200, + description="Number of abliteration trials to run during optimization.", ) n_startup_trials: int = Field( - description="Number of trials that use random sampling for the purpose of exploration" + default=60, + description="Number of trials that use random sampling for the purpose of exploration.", ) refusal_markers: list[str] = Field( - description="Strings whose presence in a response (case insensitive) identifies the response as a refusal" + default=[ + "sorry", + "i can't", + "i cannot", + "i won't", + "i will not", + "i'm unable", + "i am unable", + "i'm an ai", + "i am an ai", + "as an ai", + "ai assistant", + "i'm designed to", + "i am designed to", + "i'm programmed to", + "i am programmed to", + "violat", + "prohibit", + "illegal", + "harmful", + "inappropriate", + "unethical", + "ethical boundaries", + ], + description="Strings whose presence in a response (case insensitive) identifies the response as a refusal.", ) system_prompt: str = Field( - description="System prompt to use when prompting the model" + default="You are a helpful assistant.", + description="System prompt to use when prompting the model.", ) good_prompts: DatasetSpecification = Field( - description="Dataset of prompts that tend to not result in refusals (used for calculating refusal directions)" + default=DatasetSpecification( + dataset="mlabonne/harmless_alpaca", + split="train[:400]", + column="text", + ), + description="Dataset of prompts that tend to not result in refusals (used for calculating refusal directions).", ) bad_prompts: DatasetSpecification = Field( - description="Dataset of prompts that tend to result in refusals (used for calculating refusal directions)" + default=DatasetSpecification( + dataset="mlabonne/harmful_behaviors", + split="train[:400]", + column="text", + ), + description="Dataset of prompts that tend to result in refusals (used for calculating refusal directions).", ) good_evaluation_prompts: DatasetSpecification = Field( - description="Dataset of prompts that tend to not result in refusals (used for evaluating model performance)" + default=DatasetSpecification( + dataset="mlabonne/harmless_alpaca", + split="test[:100]", + column="text", + ), + description="Dataset of prompts that tend to not result in refusals (used for evaluating model performance).", ) bad_evaluation_prompts: DatasetSpecification = Field( - description="Dataset of prompts that tend to result in refusals (used for evaluating model performance)" + default=DatasetSpecification( + dataset="mlabonne/harmful_behaviors", + split="test[:100]", + column="text", + ), + description="Dataset of prompts that tend to result in refusals (used for evaluating model performance).", ) # "Model" refers to the Pydantic model of the settings class here, # not to the language model. The field must have this exact name. model_config = SettingsConfigDict( - toml_file=["config.default.toml", "config.toml"], + toml_file="config.toml", env_prefix="HERETIC_", cli_parse_args=True, cli_kebab_case=True,