pytorch · cad-rlc · Dec 19, 2025 · Dec 19, 2025 · Dec 19, 2025 · Dec 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -29,7 +29,6 @@ build-profiling/
 *.model
 *.etdump
 tokenizer.json
-*.pte
 *.ptd
 !test_bpe_tokenizer.bin
 !test_tiktoken_tokenizer.model

@@ -132,13 +132,21 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
 
-if(EXECUTORCH_OPTIMIZE_SIZE)
-  # -Os: Optimize for size.
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Os")
+if(NOT EXECUTORCH_BUILD_CADENCE)
+  if(EXECUTORCH_OPTIMIZE_SIZE)
+    # -Os: Optimize for size. Appended so it overrides any earlier -O level.
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Os")
+  else()
+    # -O2: Moderate opt. Prepended, so an existing -O level still wins.
+    set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}")
+  endif()
 else()
-  # -O2: Moderate opt.
-  set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}")
+  set(CMAKE_CXX_FLAGS_RELEASE
+    "-O3 -mcoproc -mlongcalls -LNO:simd -flto -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0")
+  set(CMAKE_C_FLAGS_RELEASE
+    "-O3 -mcoproc -mlongcalls -LNO:simd -flto -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0")
 endif()
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
 
 if(EXECUTORCH_BUILD_TESTS)
   include(CTest)

@@ -90,7 +90,12 @@ elseif(EXECUTORCH_FUSION_G3_OPT)
   )
 elseif(EXECUTORCH_VISION_OPT)
   set(TARGET_DIR vision)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
+  add_compile_definitions(DRAM0_BUFF_SIZE=${DRAM0_BUFF_SIZE} DRAM1_BUFF_SIZE=${DRAM1_BUFF_SIZE})
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/../generic/kernels)
 else()
   set(TARGET_DIR generic)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)

diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml
@@ -339,22 +339,22 @@
     - arg_meta: null
       kernel_name: impl::generic::quantized_matmul_asym8uxasym8u_asym8u_out
 
-- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: impl::generic::im2row_out
 
-- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: impl::generic::im2row_per_tensor_out
 
-- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: impl::generic::quantized_conv2d_nchw_per_tensor_out
 
-- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out

diff --git a/backends/cadence/aot/functions_vision.yaml b/backends/cadence/aot/functions_vision.yaml
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
@@ -16,7 +16,7 @@
 from torch.library import impl, Library
 
 m = Library("cadence", "IMPL", "CompositeExplicitAutograd")
-torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib")
+torch.ops.load_library("./pip-out/lib.linux-x86_64-cpython-311/executorch/kernels/quantized/libquantized_ops_aot_lib.so")
 
 # Registry to track all ops with reference implementations
 _REGISTERED_REF_IMPLEMENTATIONS: set[str] = set()

@@ -58,6 +58,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/delinearize_index.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp"
@@ -80,6 +81,7 @@ target_include_directories(
 add_library(
   custom_ops
   "quantized_linear_out.cpp"
+  "quantized_add_out.cpp"
   "quantized_conv2d_nchw_out.cpp"
   "quantized_conv2d_nhwc_out.cpp"
   "quantized_relu_out.cpp"

diff --git a/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp
@@ -280,7 +280,6 @@ void quantized_conv2d_nchw_per_tensor_out(
     int64_t output_zero_point,
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
-    bool channel_last,
     Tensor& out) {
   quantized_conv2d_nchw(
       input,

diff --git a/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp
@@ -267,7 +267,6 @@ void quantized_conv2d_nhwc_per_tensor_out(
     int64_t output_zero_point,
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
-    bool channel_last,
     Tensor& out) {
   quantized_conv2d_nhwc(
       input,

diff --git a/backends/cadence/vision/kernels/kernels.cpp b/backends/cadence/vision/kernels/kernels.cpp