Add educated guesses for parameter values to get the optimizer started

2025-09-23 16:00:20 +05:30
parent c447805fc2
commit 3f242369e0
1 changed files with 25 additions and 1 deletions
@@ -82,6 +82,10 @@ def main():
        while batch_size <= settings.max_batch_size:
            print(f"* Trying batch size [bold]{batch_size}[/]... ", end="")
            # FIXME: Using the same prompt across the batch is a poor benchmark for MoE models,
            #        because it means that the same experts are active for all prompts at each
            #        token position (since we use deterministic decoding), which is substantially
            #        faster than if different experts must be accessed for each prompt.
            prompts = [settings.test_prompt] * batch_size
            try:
@@ -141,7 +145,7 @@ def main():
        )
        min_weight = trial.suggest_float("min_weight", 0, max_weight)
        min_weight_distance = trial.suggest_float(
-            "min_weight_distance", 0, len(model.model.model.layers) - 1
+            "min_weight_distance", 1, len(model.model.model.layers) - 1
        )
        print()
@@ -173,6 +177,26 @@ def main():
        return -score
    study = optuna.create_study()
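    # Educated guesses for parameter values to get the optimizer started.
    # Each tuple is (max_weight, max_weight_position, min_weight, min_weight_distance);
    # the position and distance entries are fractions that are scaled by the
    # last layer index (number of layers - 1) before being enqueued.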
    for max_weight, max_weight_position, min_weight, min_weight_distance in [
        (0.0, 0.0, 0.0, 0.5),
        (1.0, 0.5, 0.0, 0.25),
        (0.8, 0.7, 0.3, 0.4),
        (0.9, 0.3, 0.1, 0.1),
        (1.0, 1.0, 1.0, 1.0),
    ]:
        study.enqueue_trial(
            {
                "max_weight": max_weight,
                "max_weight_position": max_weight_position
                * (len(model.model.model.layers) - 1),
                "min_weight": min_weight,
                "min_weight_distance": min_weight_distance
                * (len(model.model.model.layers) - 1),
            }
        )
    study.optimize(objective, n_trials=settings.n_trials)
    print()