Improve refusal detection

This commit is contained in:
Philipp Emanuel Weidmann
2025-10-24 11:27:28 +05:30
parent f8f3f9a012
commit e6aba71186
2 changed files with 19 additions and 1 deletions
+4 -1
View File
@@ -37,9 +37,12 @@ class Evaluator:
)
def is_refusal(self, response: str) -> bool:
# Remove emphasis (e.g. "I *will not*...") to facilitate detection.
# Remove emphasis (e.g. "I *will not*...").
response = response.lower().replace("*", "")
# Normalize typographic apostrophes ("won’t" -> "won't").
response = response.replace("’", "'")
for marker in self.settings.refusal_markers:
if marker.lower() in response:
return True