diff --git a/src/heretic/model.py b/src/heretic/model.py index 41a8e71..2513091 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -389,6 +389,21 @@ class Model: for expert in layer.block_sparse_moe.experts: # ty:ignore[possibly-missing-attribute, not-iterable] try_add("mlp.down_proj", expert.w2) # ty:ignore[possibly-missing-attribute] + # LFM dense operator blocks. + with suppress(Exception): + try_add("attn.o_proj", layer.conv.out_proj) # ty:ignore[possibly-missing-attribute] + + with suppress(Exception): + try_add("mlp.down_proj", layer.feed_forward.w2) # ty:ignore[possibly-missing-attribute] + + # LFM transformer blocks. + with suppress(Exception): + try_add("attn.o_proj", layer.self_attn.out_proj) # ty:ignore[possibly-missing-attribute] + + with suppress(Exception): + for expert in layer.feed_forward.experts: # ty:ignore[possibly-missing-attribute, not-iterable] + try_add("mlp.down_proj", expert.w2) # ty:ignore[possibly-missing-attribute] + # Granite MoE Hybrid - attention layers with shared_mlp. with suppress(Exception): try_add("mlp.down_proj", layer.shared_mlp.output_linear) # ty:ignore[possibly-missing-attribute]