From 02a5237a0213f631370751550f283034042888f9 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Sat, 27 Dec 2025 14:48:29 +0530
Subject: [PATCH] feat: add option to print prompt/response pairs

---
 config.default.toml      |  3 +++
 src/heretic/config.py    |  5 +++++
 src/heretic/evaluator.py | 29 +++++++++++++++++++++++++++--
 src/heretic/model.py     | 10 ++++++++--
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/config.default.toml b/config.default.toml
index 61705f6..8a5efce 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -34,6 +34,9 @@ max_batch_size = 128
 # Maximum number of tokens to generate for each response.
 max_response_length = 100
 
+# Whether to print prompt/response pairs when counting refusals.
+print_responses = false
+
 # Whether to print detailed information about residuals and refusal directions.
 print_residual_geometry = false
 
diff --git a/src/heretic/config.py b/src/heretic/config.py
index 2f02a65..fd2cf62 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -97,6 +97,11 @@ class Settings(BaseSettings):
         description="Maximum number of tokens to generate for each response.",
     )
 
+    print_responses: bool = Field(
+        default=False,
+        description="Whether to print prompt/response pairs when counting refusals.",
+    )
+
     print_residual_geometry: bool = Field(
         default=False,
         description="Whether to print detailed information about residuals and refusal directions.",
diff --git a/src/heretic/evaluator.py b/src/heretic/evaluator.py
index 7306cbf..a3457a3 100644
--- a/src/heretic/evaluator.py
+++ b/src/heretic/evaluator.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>
 
 import torch.nn.functional as F
+from torch import Tensor
 
 from .config import Settings
 from .model import Model
@@ -9,6 +10,13 @@ from .utils import load_prompts, print
 
 
 class Evaluator:
+    settings: Settings
+    model: Model
+    good_prompts: list[str]
+    bad_prompts: list[str]
+    base_logprobs: Tensor
+    base_refusals: int
+
     def __init__(self, settings: Settings, model: Model):
         self.settings = settings
         self.model = model
@@ -57,9 +65,26 @@ class Evaluator:
         return False
 
     def count_refusals(self) -> int:
+        """Count refusals among the bad-prompt responses, optionally printing them."""
+        refused = 0
         responses = self.model.get_responses_batched(self.bad_prompts)
-        refusals = [response for response in responses if self.is_refusal(response)]
-        return len(refusals)
+
+        for prompt, response in zip(self.bad_prompts, responses):
+            refusal = self.is_refusal(response)
+            refused += 1 if refusal else 0
+
+            if not self.settings.print_responses:
+                continue
+            # Refusals are rendered in red, all other responses in green.
+            color = "red" if refusal else "green"
+            print()
+            print(f"[bold]Prompt:[/] {prompt}")
+            print(f"[bold]Response:[/] [{color}]{response}[/]")
+
+        if self.settings.print_responses:
+            print()
+
+        return refused
 
     def get_score(self) -> tuple[tuple[float, float], float, int]:
         print("  * Obtaining first-token probability distributions...")
diff --git a/src/heretic/model.py b/src/heretic/model.py
index df93609..5058c14 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -512,13 +512,19 @@ class Model:
             max_new_tokens=self.settings.max_response_length,
         )
 
-        # Return only the newly generated part.
-        return self.tokenizer.batch_decode(
+        responses = self.tokenizer.batch_decode(
+            # Extract the newly generated part.
             # This cast is valid because the input_ids property is a Tensor
             # if the tokenizer is invoked with return_tensors="pt", as above.
             outputs[:, cast(Tensor, inputs["input_ids"]).shape[1] :]
         )
 
+        return [
+            # Strip out pad tokens from batch generation.
+            response.replace(self.tokenizer.pad_token, "")
+            for response in responses
+        ]
+
     def get_responses_batched(self, prompts: list[str]) -> list[str]:
         responses = []