From 3f242369e07bfbe2075da1ea8d6590cbdfb5fd51 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Tue, 23 Sep 2025 16:00:20 +0530
Subject: [PATCH] Add educated guesses for parameter values to get the
 optimizer started

---
 src/heretic/main.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/heretic/main.py b/src/heretic/main.py
index ea45b96..19bc3da 100644
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -82,6 +82,10 @@ def main():
         while batch_size <= settings.max_batch_size:
             print(f"* Trying batch size [bold]{batch_size}[/]... ", end="")
 
+            # FIXME: Using the same prompt across the batch is a poor benchmark for MoE models,
+            #        because it means that the same experts are active for all prompts at each
+            #        token position (since we use deterministic decoding), which is substantially
+            #        faster than if different experts must be accessed for each prompt.
             prompts = [settings.test_prompt] * batch_size
 
             try:
@@ -141,7 +145,7 @@ def main():
         )
         min_weight = trial.suggest_float("min_weight", 0, max_weight)
         min_weight_distance = trial.suggest_float(
-            "min_weight_distance", 0, len(model.model.model.layers) - 1
+            "min_weight_distance", 1, len(model.model.model.layers) - 1
         )
 
         print()
@@ -173,6 +177,26 @@ def main():
         return -score
 
     study = optuna.create_study()
+
+    # Educated starting guesses (layer quantities as fractions of the last layer index).
+    for max_weight, max_weight_position, min_weight, min_weight_distance in [
+        (0.0, 0.0, 0.0, 0.5),
+        (1.0, 0.5, 0.0, 0.25),
+        (0.8, 0.7, 0.3, 0.4),
+        (0.9, 0.3, 0.1, 0.1),
+        (1.0, 1.0, 1.0, 1.0),
+    ]:
+        study.enqueue_trial(
+            {
+                "max_weight": max_weight,
+                "max_weight_position": max_weight_position
+                * (len(model.model.model.layers) - 1),
+                "min_weight": min_weight,
+                "min_weight_distance": min_weight_distance
+                * (len(model.model.model.layers) - 1),
+            }
+        )
+
     study.optimize(objective, n_trials=settings.n_trials)
 
     print()