fix: improve the reproducibility system (#303)

* fix: various cleanups and improvements for the reproducibility system

* fix: save only essential settings

* fix: improve model commit handling

* feat: make including system information optional

* fix: improve formatting of reproducibility README

* fix: fix remaining issues
This commit is contained in:
Philipp Emanuel Weidmann
2026-04-23 19:08:18 +05:30
committed by GitHub
parent c4d6a62aad
commit 513e3acc72
6 changed files with 467 additions and 337 deletions
-4
View File
@@ -150,7 +150,6 @@ split = "train[:400]"
column = "text"
residual_plot_label = '"Harmless" prompts'
residual_plot_color = "royalblue"
commit = ""
# Dataset of prompts that tend to result in refusals (used for calculating refusal directions).
[bad_prompts]
@@ -159,18 +158,15 @@ split = "train[:400]"
column = "text"
residual_plot_label = '"Harmful" prompts'
residual_plot_color = "darkorange"
commit = ""
# Dataset of prompts that tend to not result in refusals (used for evaluating model performance).
[good_evaluation_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "test[:100]"
column = "text"
commit = ""
# Dataset of prompts that tend to result in refusals (used for evaluating model performance).
[bad_evaluation_prompts]
dataset = "mlabonne/harmful_behaviors"
split = "test[:100]"
column = "text"
commit = ""