Changes from all commits: 64 commits by manuelcandales, Jan 31 to Feb 5, 2026.
19 changes: 14 additions & 5 deletions .ci/scripts/export_model_artifact.sh
@@ -26,14 +26,16 @@ Arguments:
quant_name Quantization type (optional, default: non-quantized)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only
- quantized-int4-tile-packed (CUDA only)
- quantized-int4-weight-only (CUDA only)
- quantized-int4-metal (Metal only)
- quantized-8da4w (XNNPACK only)

output_dir Output directory for artifacts (optional, default: current directory)

Examples:
export_model_artifact.sh metal "openai/whisper-small"
export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
@@ -131,18 +133,25 @@ case "$QUANT_NAME" in
;;
quantized-int4-tile-packed)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
;;
quantized-int4-weight-only)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
echo "Error: Metal backend does not support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear_encoder 4w"
;;
quantized-int4-metal)
if [ "$DEVICE" != "metal" ]; then
echo "Error: Quantization '$QUANT_NAME' only supported on Metal backend"
exit 1
fi
EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
;;
quantized-8da4w)
if [ "$DEVICE" != "xnnpack" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack device"
@@ -152,7 +161,7 @@ case "$QUANT_NAME" in
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-8da4w"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only, quantized-int4-metal, quantized-8da4w"
exit 1
;;
esac
15 changes: 13 additions & 2 deletions .github/workflows/metal.yml
@@ -41,11 +41,10 @@ jobs:
set -eux

echo "::group::Setup ExecuTorch"
PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
echo "::endgroup::"

echo "::group::Build Metal Runtime"
${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --update-ao
${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
echo "::endgroup::"

@@ -73,6 +72,12 @@ jobs:
name: "parakeet-tdt"
quant:
- "non-quantized"
# Only test int4 quantization with parakeet-tdt
include:
- model:
repo: "nvidia"
name: "parakeet-tdt"
quant: "quantized-int4-metal"
with:
runner: macos-m2-stable
python-version: '3.11'
@@ -123,6 +128,12 @@ jobs:
name: "parakeet-tdt"
quant:
- "non-quantized"
# Only test int4 quantization with parakeet-tdt
include:
- model:
repo: "nvidia"
name: "parakeet-tdt"
quant: "quantized-int4-metal"
with:
runner: macos-m2-stable
python-version: '3.11'
8 changes: 6 additions & 2 deletions backends/apple/metal/metal_backend.py
@@ -44,8 +44,12 @@ def get_decomposition_table(cls) -> Dict[Any, Any]:

@classmethod
def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]:
"""Return Metal-specific passes (currently none)"""
return []
"""Return Metal-specific passes"""
from executorch.backends.apple.metal.passes.decompose_linear_pass import (
DecomposeLinearPass,
)

return [DecomposeLinearPass()]

@classmethod
def get_aoti_compile_options(
11 changes: 11 additions & 0 deletions backends/apple/metal/passes/__init__.py
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.backends.apple.metal.passes.decompose_linear_pass import ( # noqa: F401
DecomposeLinearPass,
)

__all__ = ["DecomposeLinearPass"]
111 changes: 111 additions & 0 deletions backends/apple/metal/passes/decompose_linear_pass.py
@@ -0,0 +1,111 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass, PassResult


class DecomposeLinearPass(ExportPass):
"""
Decompose aten.linear into matmul + add to avoid addmm.

For 2D inputs, we unsqueeze to 3D before decomposition to force the matmul
code path instead of addmm. The C++ implementation of aten.linear directly
calls addmm for 2D inputs with bias, which would require implementing
aoti_torch_mps_addmm_out. By unsqueezing to 3D, we force the matmul path,
then squeeze back to 2D.
"""

def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
Review comment (Contributor): Looks like meta['val'] is not being preserved?

Reply (manuelcandales, Feb 5, 2026): addressing in #17237

Review comment (Contributor): Can we just override call_operator(self, op, args, kwargs, meta) so that it eliminates all the boilerplate code?

Reply (manuelcandales, Feb 5, 2026): addressing in #17237
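
For context, a minimal sketch of what the suggested call_operator override might look like is shown below. This is illustrative only and not part of the PR: it assumes ExportPass dispatches each operator through call_operator(op, args, kwargs, meta) and that replacement ops are created via super().call_operator, the class name is hypothetical, and the 2D unsqueeze/squeeze handling from the pass above is omitted for brevity.

```python
# Illustrative sketch only, not part of this PR. Assumes ExportPass calls
# call_operator(op, args, kwargs, meta) per operator and that replacement ops
# are inserted via super().call_operator; meta is reused for intermediate
# nodes purely for brevity.
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass


class DecomposeLinearViaCallOperator(ExportPass):
    def call_operator(self, op, args, kwargs, meta):
        if op != exir_ops.edge.aten.linear.default:
            return super().call_operator(op, args, kwargs, meta)

        x, weight = args[0], args[1]
        bias = args[2] if len(args) > 2 else None

        # linear(x, W, b) -> matmul(x, W.T) (+ b); 2D handling omitted here.
        weight_t = super().call_operator(
            exir_ops.edge.aten.t.default, (weight,), {}, meta
        )
        out = super().call_operator(
            exir_ops.edge.aten.matmul.default, (x, weight_t), {}, meta
        )
        if bias is not None:
            out = super().call_operator(
                exir_ops.edge.aten.add.Tensor, (out, bias), {}, meta
            )
        return out
```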

modified = False
graph = graph_module.graph

for node in graph.nodes:
# Check if this is a linear operation
is_linear = False

if node.op == "call_function":
# Match both edge dialect and core aten linear operators
if node.target == exir_ops.edge.aten.linear.default:
is_linear = True
elif node.target == torch.ops.aten.linear.default:
is_linear = True

if is_linear:
# Get input, weight, and bias arguments
input_node = node.args[0]
weight_node = node.args[1]
bias_node = node.args[2] if len(node.args) > 2 else None

with graph.inserting_before(node):
# Determine which ops to use based on the input operator
target_str = str(node.target)

if "executorch_exir_dialects_edge" in target_str:
# Use edge dialect operators
t_op = exir_ops.edge.aten.t.default
matmul_op = exir_ops.edge.aten.matmul.default
add_op = exir_ops.edge.aten.add.Tensor
unsqueeze_op = exir_ops.edge.aten.unsqueeze.default
squeeze_op = exir_ops.edge.aten.squeeze.dims
Copilot AI comment on lines +53 to +54 (Feb 5, 2026): The pass uses incorrect operator names for edge dialect squeeze and unsqueeze operations. It should use exir_ops.edge.aten.squeeze_copy.dims instead of exir_ops.edge.aten.squeeze.dims, and exir_ops.edge.aten.unsqueeze_copy.default instead of exir_ops.edge.aten.unsqueeze.default. The edge dialect typically uses _copy variants of view operations.

Suggested change:
- unsqueeze_op = exir_ops.edge.aten.unsqueeze.default
- squeeze_op = exir_ops.edge.aten.squeeze.dims
+ unsqueeze_op = exir_ops.edge.aten.unsqueeze_copy.default
+ squeeze_op = exir_ops.edge.aten.squeeze_copy.dims
else:
# Use core aten operators
t_op = torch.ops.aten.t.default
matmul_op = torch.ops.aten.matmul.default
add_op = torch.ops.aten.add.Tensor
unsqueeze_op = torch.ops.aten.unsqueeze.default
squeeze_op = torch.ops.aten.squeeze.dims

# Check if input is 2D
needs_unsqueeze = False
if hasattr(input_node, "meta") and "val" in input_node.meta:
if len(input_node.meta["val"].shape) == 2:
needs_unsqueeze = True
Copilot AI comment on lines +65 to +67 (Feb 5, 2026): The pass assumes that all input nodes have metadata with a 'val' attribute to determine dimensionality. If metadata is missing or incomplete, the pass will not unsqueeze 2D inputs, which could result in the addmm code path being taken instead of matmul. This defeats the purpose of the pass. Consider adding a fallback or error handling when metadata is not available, or document this limitation clearly.

Suggested change:
- if hasattr(input_node, "meta") and "val" in input_node.meta:
-     if len(input_node.meta["val"].shape) == 2:
-         needs_unsqueeze = True
+ if hasattr(input_node, "meta"):
+     val_meta = input_node.meta.get("val", None)
+     if val_meta is not None and hasattr(val_meta, "shape"):
+         if len(val_meta.shape) == 2:
+             needs_unsqueeze = True
+     else:
+         raise RuntimeError(
+             "DecomposeLinearPass requires input_node.meta['val'] with a 'shape' "
+             "attribute to determine input dimensionality."
+         )

# Unsqueeze 2D input to 3D: (M, K) -> (1, M, K)
current_input = input_node
if needs_unsqueeze:
current_input = graph.call_function(
Review comment (Contributor): Use call_operator and pass in meta.

Reply (manuelcandales, Feb 5, 2026): addressing in #17237

unsqueeze_op,
args=(input_node, 0),
)

# Decompose linear: matmul(input, weight.T) + bias
weight_t = graph.call_function(
t_op,
args=(weight_node,),
)

matmul_result = graph.call_function(
matmul_op,
args=(current_input, weight_t),
)

if bias_node is not None:
result = graph.call_function(
add_op,
args=(matmul_result, bias_node),
)
else:
result = matmul_result

# Squeeze 3D output back to 2D: (1, M, N) -> (M, N)
if needs_unsqueeze:
result = graph.call_function(
squeeze_op,
args=(result, [0]),
)

# Replace all uses of the linear node with the decomposed result
node.replace_all_uses_with(result)
graph.erase_node(node)
modified = True

if modified:
graph_module.recompile()

return PassResult(graph_module, modified)
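
As a sanity check of the rewrite this pass performs, the following standalone snippet (illustrative only, not part of the PR) verifies that the unsqueeze, matmul, add, squeeze path matches aten.linear for a 2D input; the shapes mirror the LinearWithBias test module added below.

```python
import torch

# Illustrative check, not part of this PR: the decomposition produced by
# DecomposeLinearPass should match aten.linear for a 2D input.
x = torch.randn(127, 7)
weight = torch.randn(101, 7)
bias = torch.randn(101)

reference = torch.nn.functional.linear(x, weight, bias)

# (M, K) -> (1, M, K), matmul against W.T, add bias, then squeeze back to (M, N).
decomposed = (x.unsqueeze(0) @ weight.t() + bias).squeeze(0)

torch.testing.assert_close(decomposed, reference)
```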
16 changes: 16 additions & 0 deletions backends/apple/metal/tests/test_modules.py
@@ -208,6 +208,22 @@ def forward(self, x: torch.Tensor):
}


# -------------------------------------------------------------------------
class LinearWithBias(nn.Module):
def __init__(self):
super().__init__()
self.linear = nn.Linear(7, 101, bias=True)

def forward(self, x: torch.Tensor):
return self.linear(x)


MODULE_REGISTRY["linear_bias"] = {
"model_class": LinearWithBias,
"input_shapes": [(127, 7)],
"description": "Simple linear layer model bias",
Copilot AI comment (Feb 5, 2026): The description string is incomplete and grammatically incorrect. It should be "Simple linear layer model with bias" to be consistent with the "linear_nobias" description above.

Suggested change:
- "description": "Simple linear layer model bias",
+ "description": "Simple linear layer model with bias",


Copilot AI comment on lines +225 to +226 (Feb 5, 2026): Missing closing brace in the MODULE_REGISTRY dictionary definition. The dictionary is opened but not closed, which will cause a syntax error.

Suggested change:
+ }
# -------------------------------------------------------------------------
class LinearNoBiasInt4(nn.Module):
def __init__(self):
41 changes: 33 additions & 8 deletions examples/models/parakeet/README.md
@@ -39,23 +39,24 @@ The export script supports quantizing encoder and decoder linear layers using [t

| Argument | Description |
|----------|-------------|
| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
| `--qlinear_encoder` | Quantization config for encoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
| `--qlinear_encoder_group_size` | Group size for encoder linear quantization (default: 32) |
| `--qlinear_encoder_packing_format` | Packing format for encoder: `tile_packed_to_4d` |
| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w` |
| `--qlinear` | Quantization config for decoder linear layers: `4w`, `8w`, `8da4w`, `8da8w`, `fpa4w` |
| `--qlinear_group_size` | Group size for decoder linear quantization (default: 32) |
| `--qlinear_packing_format` | Packing format for decoder: `tile_packed_to_4d` |
| `--qembedding` | Quantization config for decoder embedding layer: `4w`, `8w` |
| `--qembedding_group_size` | Group size for embedding quantization (default: 0 = per-axis) |

#### Quantization Configs

| Config | Description |
|--------|-------------|
| `4w` | 4-bit weight only quantization |
| `8w` | 8-bit weight only quantization |
| `8da4w` | 8-bit dynamic activation, 4-bit weight |
| `8da8w` | 8-bit dynamic activation, 8-bit weight |
| Config | Description | Backends |
|--------|-------------|----------|
| `4w` | 4-bit weight only quantization | CUDA |
| `8w` | 8-bit weight only quantization | CUDA |
| `8da4w` | 8-bit dynamic activation, 4-bit weight | CUDA |
| `8da8w` | 8-bit dynamic activation, 8-bit weight | CUDA |
| `fpa4w` | Floating point activation, 4-bit weight | Metal |

#### Example: Dynamic Quantization for XNNPACK

@@ -86,6 +87,30 @@ python export_parakeet_tdt.py \

**Note:** The `tile_packed_to_4d` packing format is optimized for CUDA.

#### Example: Metal 4-bit Quantization

```bash
python export_parakeet_tdt.py \
--backend metal \
--qlinear_encoder fpa4w \
--qlinear_encoder_group_size 32 \
--qlinear fpa4w \
--qlinear_group_size 32 \
--output-dir ./parakeet_metal_quantized
```

**Note:** Metal 4-bit quantization requires torchao built with experimental MPS (Metal) ops.

You can install torchao with Metal support from the `ao` repo:
```bash
USE_CPP=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 pip install . --no-build-isolation
```

Alternatively, you can build torchao with Metal support while installing ExecuTorch:
```bash
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
```

### Metal Export (macOS)

```bash
10 changes: 8 additions & 2 deletions examples/models/parakeet/export_parakeet_tdt.py
@@ -622,7 +622,7 @@ def main():
parser.add_argument(
"--qlinear",
type=str,
choices=["4w", "8w", "8da4w", "8da8w"],
choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
help="Quantization config for decoder linear layers",
)
parser.add_argument(
@@ -642,7 +642,7 @@
parser.add_argument(
"--qlinear_encoder",
type=str,
choices=["4w", "8w", "8da4w", "8da8w"],
choices=["4w", "8w", "8da4w", "8da8w", "fpa4w"],
help="Quantization config for encoder linear layers",
)
parser.add_argument(
@@ -678,6 +678,12 @@ def main():
if args.dtype == "fp16":
parser.error("fp16 is not yet supported")

# Validate fpa4w quantization requires Metal backend
if args.qlinear == "fpa4w" and args.backend != "metal":
parser.error("--qlinear=fpa4w can only be used with --backend=metal")
if args.qlinear_encoder == "fpa4w" and args.backend != "metal":
parser.error("--qlinear_encoder=fpa4w can only be used with --backend=metal")

os.makedirs(args.output_dir, exist_ok=True)

print("Extracting tokenizer...")