From 515a7b9eb502c42e7d59736702b05e70ef735249 Mon Sep 17 00:00:00 2001 From: cpagac <150854443+cpagac@users.noreply.github.com> Date: Fri, 13 Mar 2026 00:51:23 -0500 Subject: [PATCH] fix: prevent div-by-zero in evaluator when base_refusals is 0 (#225) * fix: prevent div-by-zero in evaluator when base_refusals is 0 When a model refuses no prompts from the start, base_refusals is 0. Return refusals directly in that case so ablations that introduce new refusals are still penalized correctly. * fix: cast refusals to float for type consistency Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/heretic/evaluator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/heretic/evaluator.py b/src/heretic/evaluator.py index f2a8a25..eced014 100644 --- a/src/heretic/evaluator.py +++ b/src/heretic/evaluator.py @@ -110,7 +110,9 @@ class Evaluator: kl_divergence_scale = self.settings.kl_divergence_scale kl_divergence_target = self.settings.kl_divergence_target - refusals_score = refusals / self.base_refusals + refusals_score = ( + refusals / self.base_refusals if self.base_refusals > 0 else float(refusals) + ) if kl_divergence >= kl_divergence_target: kld_score = kl_divergence / kl_divergence_scale