From ed65d6902b6b583b231c9e9e5eee570b13728f84 Mon Sep 17 00:00:00 2001 From: Philipp Emanuel Weidmann Date: Wed, 15 Oct 2025 17:51:39 +0530 Subject: [PATCH] Support gpt-oss MoE --- src/heretic/model.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/heretic/model.py b/src/heretic/model.py index ae24837..4655411 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -112,6 +112,14 @@ class Model: for expert in layer.block_sparse_moe.experts: try_add(expert.w2.weight) + # gpt-oss MoE. + if not matrices: + with suppress(Exception): + # The implementation of gpt-oss in Transformers differs from many other MoE models + # in that it stores the down-projections for all experts in a single 3D tensor, + # but thanks to PyTorch's broadcasting magic, it all just works anyway. + try_add(layer.mlp.experts.down_proj) + # We need at least one MLP down-projection. assert matrices