Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions include/tvm/relax/attrs/vision.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,28 @@ struct ROIAlignAttrs : public AttrsNodeReflAdapter<ROIAlignAttrs> {
TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.ROIAlignAttrs", ROIAlignAttrs, BaseAttrsNode);
}; // struct ROIAlignAttrs

/*! \brief Attributes for multibox_transform_loc (SSD / TFLite-style box decode). */
struct MultiboxTransformLocAttrs : public AttrsNodeReflAdapter<MultiboxTransformLocAttrs> {
// Clip decoded (ymin, xmin, ymax, xmax) coordinates to [0, 1].
bool clip;
// Post-softmax cutoff: scores strictly below this value are zeroed.
double threshold;
// (x, y, w, h) scales applied to the box encodings; for TFLite these are
// 1/x_scale, 1/y_scale, 1/w_scale, 1/h_scale.
ffi::Array<double> variances;
// When false, output scores for class index 0 (background) are forced to 0.
bool keep_background;

// Register the fields with the FFI reflection machinery so they are visible
// from Python as relax.attrs.MultiboxTransformLocAttrs.
static void RegisterReflection() {
namespace refl = tvm::ffi::reflection;
refl::ObjectDef<MultiboxTransformLocAttrs>()
.def_ro("clip", &MultiboxTransformLocAttrs::clip, "Clip decoded ymin,xmin,ymax,xmax to [0,1].")
.def_ro("threshold", &MultiboxTransformLocAttrs::threshold,
"After softmax, zero scores strictly below this value.")
.def_ro("variances", &MultiboxTransformLocAttrs::variances,
"(x,y,w,h) scales = TFLite 1/x_scale,1/y_scale,1/w_scale,1/h_scale on encodings.")
.def_ro("keep_background", &MultiboxTransformLocAttrs::keep_background,
"If false, force output scores[:,0,:] to 0 (background class).");
}
TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.MultiboxTransformLocAttrs",
MultiboxTransformLocAttrs, BaseAttrsNode);
}; // struct MultiboxTransformLocAttrs

} // namespace relax
} // namespace tvm

Expand Down
12 changes: 6 additions & 6 deletions python/tvm/relax/frontend/tflite/tflite_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3205,9 +3205,10 @@ def convert_dequantize(self, op):
def convert_detection_postprocess(self, op):
"""Convert TFLite_Detection_PostProcess"""
raise NotImplementedError(
"DETECTION_POSTPROCESS requires vision ops (multibox_transform_loc, "
"non_max_suppression, get_valid_counts) not yet available in Relax. "
"See https://github.com/apache/tvm/issues/XXXX"
"DETECTION_POSTPROCESS is not wired in this frontend yet: it still needs "
"Relax NMS / get_valid_counts / related vision helpers (see dead code below). "
"relax.vision.multibox_transform_loc exists; tracking: "
"https://github.com/apache/tvm/issues/18928"
)
flexbuffer = op.CustomOptionsAsNumpy().tobytes()
custom_options = FlexBufferDecoder(flexbuffer).decode()
Expand Down Expand Up @@ -3340,9 +3341,8 @@ def convert_nms_v5(self, op):
"""Convert TFLite NonMaxSuppressionV5"""
# https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/non-max-suppression-v5
raise NotImplementedError(
"NON_MAX_SUPPRESSION_V5 requires vision ops (get_valid_counts, "
"non_max_suppression) not yet available in Relax. "
"See https://github.com/apache/tvm/issues/XXXX"
"NON_MAX_SUPPRESSION_V5 is not wired in this frontend yet (needs get_valid_counts, "
"non_max_suppression, etc.). Tracking: https://github.com/apache/tvm/issues/18928"
)

input_tensors = self.get_input_tensors(op)
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/relax/op/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
tanh,
trunc,
)
from .vision import all_class_non_max_suppression, roi_align
from .vision import all_class_non_max_suppression, multibox_transform_loc, roi_align


def _register_op_make():
Expand Down
5 changes: 5 additions & 0 deletions python/tvm/relax/op/op_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,11 @@ class ROIAlignAttrs(Attrs):
"""Attributes for vision.roi_align"""


@tvm_ffi.register_object("relax.attrs.MultiboxTransformLocAttrs")
class MultiboxTransformLocAttrs(Attrs):
    """Attributes for vision.multibox_transform_loc.

    Python handle for the C++ ``relax.attrs.MultiboxTransformLocAttrs`` node
    (fields ``clip``, ``threshold``, ``variances``, ``keep_background``).
    """


@tvm_ffi.register_object("relax.attrs.Conv1DAttrs")
class Conv1DAttrs(Attrs):
"""Attributes for nn.conv1d"""
Expand Down
1 change: 1 addition & 0 deletions python/tvm/relax/op/vision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@
# under the License.
"""VISION operators."""

from .multibox_transform_loc import *
from .nms import *
from .roi_align import *
77 changes: 77 additions & 0 deletions python/tvm/relax/op/vision/multibox_transform_loc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Multibox location transform for object detection."""

from . import _ffi_api


def multibox_transform_loc(
    cls_pred,
    loc_pred,
    anchor,
    clip=False,
    threshold=0.0,
    variances=(1.0, 1.0, 1.0, 1.0),
    keep_background=True,
):
    """Decode SSD/TFLite-style anchor boxes and post-process class scores.

    Given per-anchor class logits, box encodings and priors, produce a tuple
    ``(boxes, scores)``: ``boxes`` is ``[B, N, 4]`` in
    ``(ymin, xmin, ymax, xmax)`` order, and ``scores`` is the ``[B, C, N]``
    softmax of ``cls_pred`` after threshold masking and (optionally) zeroing
    the background class.  Box decoding follows TFLite
    ``DecodeCenterSizeBoxes``; the expected tensor layouts match
    ``tflite_frontend.convert_detection_postprocess`` (loc reorder yxhw→xywh,
    anchor ltrb).

    Parameters
    ----------
    cls_pred : relax.Expr
        Class logits with shape ``[B, C, N]`` (pre-softmax).
    loc_pred : relax.Expr
        Box encodings with shape ``[B, 4*N]``, ``(x, y, w, h)`` per anchor
        after reorder (see above).
    anchor : relax.Expr
        Priors with shape ``[1, N, 4]`` as ``(left, top, right, bottom)``.
    clip : bool
        When True, clip ``ymin, xmin, ymax, xmax`` to ``[0, 1]``.
    threshold : float
        After softmax, multiply scores by the mask ``(score >= threshold)``.
    variances : tuple of 4 floats
        ``(x, y, w, h)`` = TFLite ``1/x_scale, 1/y_scale, 1/w_scale,
        1/h_scale``.
    keep_background : bool
        When False, output scores at class index 0 are set to zero.

    Returns
    -------
    result : relax.Expr
        Tuple ``(boxes, scores)`` as described above.

    Notes
    -----
    **Shape/dtype (checked in ``FInferStructInfo`` when static):**

    - ``cls_pred``: 3-D; ``loc_pred``: 2-D; ``anchor``: 3-D.
    - ``cls_pred``, ``loc_pred``, ``anchor`` dtypes must match.
    - ``N = cls_pred.shape[2]``; ``loc_pred.shape[1] == 4*N``;
      ``anchor.shape == [1, N, 4]``.
    - ``loc_pred.shape[1]`` must be divisible by 4.
    - ``cls_pred.shape[0]`` must equal ``loc_pred.shape[0]`` (batch).
    """
    # The packed FFI entry point takes everything positionally.
    packed_args = (cls_pred, loc_pred, anchor, clip, threshold, variances, keep_background)
    return _ffi_api.multibox_transform_loc(*packed_args)
24 changes: 24 additions & 0 deletions python/tvm/relax/transform/legalize_ops/vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,27 @@ def _roi_align(bb: BlockBuilder, call: Call) -> Expr:
aligned=call.attrs.aligned,
layout=call.attrs.layout,
)


@register_legalize("relax.vision.multibox_transform_loc")
def _multibox_transform_loc(bb: BlockBuilder, call: Call) -> Expr:
    """Lower relax.vision.multibox_transform_loc to its TOPI TE implementation."""
    attrs = call.attrs
    # Attrs store variances as an FFI array of doubles; TOPI wants plain floats.
    scales = tuple(float(v) for v in attrs.variances)
    cls_pred, loc_pred, anchor = call.args

    def _compute(cls_t, loc_t, anchor_t):
        # Variances are positional in the TOPI signature; the remaining
        # knobs are keyword-only for clarity.
        return topi.vision.multibox_transform_loc(
            cls_t,
            loc_t,
            anchor_t,
            scales,
            clip=attrs.clip,
            threshold=attrs.threshold,
            keep_background=attrs.keep_background,
        )

    return bb.call_te(
        _compute,
        cls_pred,
        loc_pred,
        anchor,
        primfunc_name_hint="multibox_transform_loc",
    )
72 changes: 72 additions & 0 deletions python/tvm/topi/testing/multibox_transform_loc_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
"""Numpy reference for multibox_transform_loc."""

import numpy as np


def _softmax(x, axis):
    """Numerically stable softmax along *axis* (shift by the per-slice max)."""
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numer = np.exp(shifted)
    return numer / numer.sum(axis=axis, keepdims=True)


def multibox_transform_loc_python(
    cls_pred,
    loc_pred,
    anchor,
    variances,
    clip=False,
    threshold=0.0,
    keep_background=True,
):
    """Reference implementation aligned with ``topi.vision.multibox_transform_loc``.

    Fully vectorized over the batch and anchor axes (the original used an
    O(B*N) Python double loop with identical math).

    Parameters
    ----------
    cls_pred : np.ndarray
        ``[B, C, N]`` class logits (pre-softmax).
    loc_pred : np.ndarray
        ``[B, 4*N]`` per-anchor encodings, ``(x, y, w, h)`` per anchor.
    anchor : np.ndarray
        ``[1, N, 4]`` priors as ``(left, top, right, bottom)``.
    variances : sequence of 4 floats
        ``(x, y, w, h)`` scales applied to the encodings.
    clip : bool
        When True, clip decoded coordinates to ``[0, 1]``.
    threshold : float
        Zero softmax scores strictly below this value (applied only when > 0).
    keep_background : bool
        When False, zero all scores for class index 0.

    Returns
    -------
    boxes : np.ndarray
        ``[B, N, 4]`` float32 in ``(ymin, xmin, ymax, xmax)`` order.
    scores : np.ndarray
        ``[B, C, N]`` float32 post-processed softmax scores.
    """
    B, C, N = cls_pred.shape
    loc = loc_pred.reshape(B, N, 4)

    # Softmax over the class axis, computed in float64 for stability,
    # then cast back to float32 (matches the TE implementation's contract).
    logits = cls_pred.astype("float64")
    logits -= np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(logits)
    scores = (exp_logits / np.sum(exp_logits, axis=1, keepdims=True)).astype(np.float32)
    if threshold > 0.0:
        # Mask (not drop): entries strictly below the cutoff become 0.
        scores = np.where(scores >= threshold, scores, 0.0).astype(np.float32)
    if not keep_background:
        scores = scores.copy()
        scores[:, 0, :] = 0.0

    vx, vy, vw, vh = variances
    # Anchors are ltrb; convert to center/size once for all N anchors.
    left, top, right, bottom = (anchor[0, :, i] for i in range(4))  # each [N]
    ay = (top + bottom) * 0.5
    ax = (left + right) * 0.5
    ah = bottom - top
    aw = right - left
    # Encodings are (x, y, w, h) per anchor; each component is [B, N] and
    # broadcasts against the [N] anchor terms.
    ex, ey, ew, eh = (loc[..., i] for i in range(4))
    ycenter = ey * vy * ah + ay
    xcenter = ex * vx * aw + ax
    half_h = 0.5 * np.exp(eh * vh) * ah
    half_w = 0.5 * np.exp(ew * vw) * aw
    boxes = np.stack(
        [ycenter - half_h, xcenter - half_w, ycenter + half_h, xcenter + half_w],
        axis=-1,
    )
    if clip:
        boxes = np.clip(boxes, 0.0, 1.0)
    return boxes.astype(np.float32), scores
1 change: 1 addition & 0 deletions python/tvm/topi/vision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@
# under the License.
"""Vision operators."""

from .multibox_transform_loc import *
from .nms import *
from .roi_align import *
121 changes: 121 additions & 0 deletions python/tvm/topi/vision/multibox_transform_loc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
"""Multibox location transform (SSD / TFLite DetectionPostProcess decode)."""

import tvm
from tvm import te, topi


def multibox_transform_loc(
    cls_pred,
    loc_pred,
    anchor,
    variances,
    clip=False,
    threshold=0.0,
    keep_background=True,
):
    """TFLite ``DecodeCenterSizeBoxes``-style decode + softmax score post-process.

    Inputs must match Relax op contracts: ``cls_pred [B,C,N]``, ``loc_pred [B,4*N]``,
    ``anchor [1,N,4]`` ltrb; per-anchor loc order ``(x,y,w,h)`` after yxhw→xywh reorder.

    Parameters
    ----------
    cls_pred : te.Tensor
        ``[B, C, N]`` logits.
    loc_pred : te.Tensor
        ``[B, 4*N]`` encodings ``(x,y,w,h)`` per anchor.
    anchor : te.Tensor
        ``[1, N, 4]`` ``(left, top, right, bottom)``.
    variances : tuple of 4 float
        ``(x,y,w,h)`` = ``1/x_scale, 1/y_scale, 1/w_scale, 1/h_scale`` (TFLite).
    clip : bool
        Clip ``ymin,xmin,ymax,xmax`` to ``[0,1]``.
    threshold : float
        After softmax: ``scores *= (scores >= threshold)``.
    keep_background : bool
        If False: ``scores[:,0,:] = 0``.

    Returns
    -------
    boxes : te.Tensor
        ``[B, N, 4]`` as ``(ymin,xmin,ymax,xmax)``.
    scores : te.Tensor
        ``[B, C, N]`` softmax, then threshold mask and optional background zero.
    """
    dtype = cls_pred.dtype
    B = cls_pred.shape[0]
    num_anchors = cls_pred.shape[2]
    loc_reshaped = topi.reshape(loc_pred, [B, num_anchors, 4])

    # FIX: the original referenced `tvm.tirx`, which does not exist; the
    # correct module is `tvm.tir`, so every call would have raised
    # AttributeError at trace time.
    vx = tvm.tir.const(float(variances[0]), dtype)
    vy = tvm.tir.const(float(variances[1]), dtype)
    vw = tvm.tir.const(float(variances[2]), dtype)
    vh = tvm.tir.const(float(variances[3]), dtype)
    half = tvm.tir.const(0.5, dtype)
    zero = tvm.tir.const(0.0, dtype)
    one = tvm.tir.const(1.0, dtype)
    th = tvm.tir.const(float(threshold), dtype)

    def decode_bbox(b, a, k):
        # Anchor is (left, top, right, bottom); convert to center/size form.
        l = anchor[0, a, 0]
        t = anchor[0, a, 1]
        r = anchor[0, a, 2]
        br = anchor[0, a, 3]
        ay = (t + br) * half
        ax = (l + r) * half
        ah = br - t
        aw = r - l
        # Encodings are (x, y, w, h) per anchor after the frontend reorder.
        ex = loc_reshaped[b, a, 0]
        ey = loc_reshaped[b, a, 1]
        ew = loc_reshaped[b, a, 2]
        eh = loc_reshaped[b, a, 3]
        ycenter = ey * vy * ah + ay
        xcenter = ex * vx * aw + ax
        half_h = half * te.exp(eh * vh) * ah
        half_w = half * te.exp(ew * vw) * aw
        ymin = ycenter - half_h
        xmin = xcenter - half_w
        ymax = ycenter + half_h
        xmax = xcenter + half_w
        if clip:
            # `clip` is a Python-level constant, so the clamp is baked into
            # the generated expression at compile time.
            ymin = te.max(zero, te.min(one, ymin))
            xmin = te.max(zero, te.min(one, xmin))
            ymax = te.max(zero, te.min(one, ymax))
            xmax = te.max(zero, te.min(one, xmax))
        # Last-axis layout: (ymin, xmin, ymax, xmax).
        return tvm.tir.Select(
            k == 0,
            ymin,
            tvm.tir.Select(k == 1, xmin, tvm.tir.Select(k == 2, ymax, xmax)),
        )

    boxes = te.compute((B, num_anchors, 4), decode_bbox, name="multibox_boxes")

    # Scores: softmax over classes, then zero entries below the threshold,
    # then (optionally) zero the background class at index 0.
    scores = topi.nn.softmax(cls_pred, axis=1)
    mask = topi.cast(topi.greater_equal(scores, th), dtype)
    scores = scores * mask
    if not keep_background:

        def zero_bg(b, c, n):
            s = scores[b, c, n]
            return te.if_then_else(c == 0, zero, s)

        scores = te.compute(scores.shape, zero_bg, name="multibox_scores_bg")

    return [boxes, scores]
Loading
Loading