From 02a5237a0213f631370751550f283034042888f9 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann
Date: Sat, 27 Dec 2025 14:48:29 +0530
Subject: [PATCH] feat: add option to print prompt/response pairs

---
 config.default.toml      |  3 +++
 src/heretic/config.py    |  5 +++++
 src/heretic/evaluator.py | 29 +++++++++++++++++++++++++++--
 src/heretic/model.py     | 10 ++++++++--
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/config.default.toml b/config.default.toml
index 61705f6..8a5efce 100644
--- a/config.default.toml
+++ b/config.default.toml
@@ -34,6 +34,9 @@ max_batch_size = 128
 # Maximum number of tokens to generate for each response.
 max_response_length = 100
 
+# Whether to print prompt/response pairs when counting refusals.
+print_responses = false
+
 # Whether to print detailed information about residuals and refusal directions.
 print_residual_geometry = false
 
diff --git a/src/heretic/config.py b/src/heretic/config.py
index 2f02a65..fd2cf62 100644
--- a/src/heretic/config.py
+++ b/src/heretic/config.py
@@ -97,6 +97,11 @@ class Settings(BaseSettings):
         description="Maximum number of tokens to generate for each response.",
     )
 
+    print_responses: bool = Field(
+        default=False,
+        description="Whether to print prompt/response pairs when counting refusals.",
+    )
+
     print_residual_geometry: bool = Field(
         default=False,
         description="Whether to print detailed information about residuals and refusal directions.",
diff --git a/src/heretic/evaluator.py b/src/heretic/evaluator.py
index 7306cbf..a3457a3 100644
--- a/src/heretic/evaluator.py
+++ b/src/heretic/evaluator.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2025 Philipp Emanuel Weidmann
 
 import torch.nn.functional as F
+from torch import Tensor
 
 from .config import Settings
 from .model import Model
@@ -9,6 +10,13 @@ from .utils import load_prompts, print
 
 
 class Evaluator:
+    settings: Settings
+    model: Model
+    good_prompts: list[str]
+    bad_prompts: list[str]
+    base_logprobs: Tensor
+    base_refusals: int
+
     def __init__(self, settings: Settings, model: Model):
         self.settings = settings
         self.model = model
@@ -57,9 +65,26 @@ class Evaluator:
         return False
 
     def count_refusals(self) -> int:
+        refusal_count = 0
+
         responses = self.model.get_responses_batched(self.bad_prompts)
-        refusals = [response for response in responses if self.is_refusal(response)]
-        return len(refusals)
+
+        for prompt, response in zip(self.bad_prompts, responses):
+            is_refusal = self.is_refusal(response)
+            if is_refusal:
+                refusal_count += 1
+
+            if self.settings.print_responses:
+                print()
+                print(f"[bold]Prompt:[/] {prompt}")
+                print(
+                    f"[bold]Response:[/] [{'red' if is_refusal else 'green'}]{response}[/]"
+                )
+
+        if self.settings.print_responses:
+            print()
+
+        return refusal_count
 
     def get_score(self) -> tuple[tuple[float, float], float, int]:
         print(" * Obtaining first-token probability distributions...")
diff --git a/src/heretic/model.py b/src/heretic/model.py
index df93609..5058c14 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -512,13 +512,19 @@ class Model:
             max_new_tokens=self.settings.max_response_length,
         )
 
-        # Return only the newly generated part.
-        return self.tokenizer.batch_decode(
+        responses = self.tokenizer.batch_decode(
+            # Extract the newly generated part.
             # This cast is valid because the input_ids property is a Tensor
             # if the tokenizer is invoked with return_tensors="pt", as above.
             outputs[:, cast(Tensor, inputs["input_ids"]).shape[1] :]
         )
 
+        return [
+            # Strip out pad tokens from batch generation.
+            response.replace(self.tokenizer.pad_token, "")
+            for response in responses
+        ]
+
     def get_responses_batched(self, prompts: list[str]) -> list[str]:
         responses = []
 