From ed65d6902b6b583b231c9e9e5eee570b13728f84 Mon Sep 17 00:00:00 2001
From: Philipp Emanuel Weidmann <pew@worldwidemann.com>
Date: Wed, 15 Oct 2025 17:51:39 +0530
Subject: [PATCH] Support gpt-oss MoE

---
 src/heretic/model.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/heretic/model.py b/src/heretic/model.py
index ae24837..4655411 100644
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -112,6 +112,14 @@ class Model:
                 for expert in layer.block_sparse_moe.experts:
                     try_add(expert.w2.weight)
 
+        # gpt-oss MoE.
+        if not matrices:
+            with suppress(Exception):
+                # Unlike many other MoE models, the Transformers implementation of gpt-oss
+                # stores the down-projections of all experts in a single 3D tensor. It is added
+                # as-is, relying on PyTorch broadcasting to handle it like the 2D matrices.
+                try_add(layer.mlp.experts.down_proj)
+
         # We need at least one MLP down-projection.
         assert matrices