diff --git a/.gitmodules b/.gitmodules index eab6041af..bca919479 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ [submodule "third_party/spdlog"] path = third_party/spdlog url = https://github.com/gabime/spdlog.git +[submodule "third_party/nlohmann_json"] + path = third_party/nlohmann_json + url = https://github.com/nlohmann/json.git + branch = master diff --git a/include/infinicore.hpp b/include/infinicore.hpp index 95e4243d9..a7bd1a497 100644 --- a/include/infinicore.hpp +++ b/include/infinicore.hpp @@ -3,4 +3,5 @@ #include "infinicore/device_event.hpp" #include "infinicore/nn.hpp" #include "infinicore/ops.hpp" +#include "infinicore/quantization.hpp" #include "infinicore/tensor.hpp" diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp index e77a432c2..667a980fa 100644 --- a/include/infinicore/nn/linear.hpp +++ b/include/infinicore/nn/linear.hpp @@ -1,8 +1,10 @@ #pragma once #include "../ops.hpp" +#include "../quantization.hpp" #include "module.hpp" #include +#include namespace infinicore::nn { @@ -11,6 +13,9 @@ class BaseLinear : public Module { BaseLinear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device()); + BaseLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device()); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -27,12 +32,17 @@ class BaseLinear : public Module { // Accessors for parameters Tensor weight() const { return weight_; } Tensor bias() const { return bias_; } + Tensor weight_scale() const { return weight_scale_; } + Tensor weight_zeros() const { return weight_zeros_; } protected: // Parameters INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(bias); + INFINICORE_NN_PARAMETER(weight_scale); + INFINICORE_NN_PARAMETER(weight_zeros); + protected: // Helper method for common forward computation Tensor compute_linear(Tensor &input) const; @@ -41,6 +51,7 @@ class BaseLinear : public Module { size_t out_features_; bool has_bias_; DataType dtype_; + std::shared_ptr quantization_ = std::make_shared(nullptr); }; } // namespace infinicore::nn @@ -52,6 +63,9 @@ class Linear : public BaseLinear { Linear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device()); + Linear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device()); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -65,6 +79,10 @@ class ColumnParallelLinear : public BaseLinear { const DataType &dtype = DataType::F32, const Device &device = Device(), Size tp_rank = 0, Size tp_size = 1); + ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device(), + Size tp_rank = 0, Size tp_size = 1); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -82,6 +100,10 @@ class RowParallelLinear : public BaseLinear { const DataType &dtype = DataType::F32, const Device &device = Device(), Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); + RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType 
&dtype = DataType::F32, const Device &device = Device(), + Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; diff --git a/include/infinicore/ops/dequantize_awq.hpp b/include/infinicore/ops/dequantize_awq.hpp new file mode 100644 index 000000000..50e4328f3 --- /dev/null +++ b/include/infinicore/ops/dequantize_awq.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "../device.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { +INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &); + +void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros); +} // namespace infinicore::op diff --git a/include/infinicore/ops/linear_w4a16_awq.hpp b/include/infinicore/ops/linear_w4a16_awq.hpp new file mode 100644 index 000000000..ebae7a685 --- /dev/null +++ b/include/infinicore/ops/linear_w4a16_awq.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional bias); + +void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional bias); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/linear_w8a8i8.hpp b/include/infinicore/ops/linear_w8a8i8.hpp new file mode 100644 index 000000000..08cadc111 --- /dev/null +++ b/include/infinicore/ops/linear_w8a8i8.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional bias); + +void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional bias); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/per_channel_quant_i8.hpp b/include/infinicore/ops/per_channel_quant_i8.hpp new file mode 100644 index 000000000..0b0296248 --- /dev/null +++ b/include/infinicore/ops/per_channel_quant_i8.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(PerChannelQuantI8, const Tensor &, Tensor, Tensor); + +void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale); +} // namespace infinicore::op diff --git a/include/infinicore/ops/scaled_mm_i8.hpp b/include/infinicore/ops/scaled_mm_i8.hpp new file mode 100644 index 000000000..427ed9d65 --- /dev/null +++ b/include/infinicore/ops/scaled_mm_i8.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(I8Gemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, std::optional); + +void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias); +} // namespace infinicore::op diff --git a/include/infinicore/quantization.hpp b/include/infinicore/quantization.hpp new file mode 100644 index 000000000..7b01312ba --- /dev/null +++ b/include/infinicore/quantization.hpp @@ -0,0 +1,7 @@ +#pragma once + +#include "quantization/awq.hpp" +#include "quantization/base_quantization.hpp" +#include "quantization/compressed_tensors.hpp" 
+#include "quantization/none_quantizaiton.hpp" +#include "quantization/quantization_scheme.hpp" diff --git a/include/infinicore/quantization/awq.hpp b/include/infinicore/quantization/awq.hpp new file mode 100644 index 000000000..bbbbab1cb --- /dev/null +++ b/include/infinicore/quantization/awq.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class AWQ : public BaseQuantization { + // This is a temporary class that currently only returns AWQ_W4A16. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. +public: + explicit AWQ(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::AWQ_W4A16; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/base_quantization.hpp b/include/infinicore/quantization/base_quantization.hpp new file mode 100644 index 000000000..4cc9b325e --- /dev/null +++ b/include/infinicore/quantization/base_quantization.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "nlohmann/json.hpp" +#include "quantization_scheme.hpp" + +namespace infinicore::quantization { +class BaseQuantization { + // Base class for quantization schemes. Intended to be extended to support various quantization methods. +public: + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; + virtual ~BaseQuantization() = default; + + virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0; + +protected: + nlohmann::json quant_config_; +}; +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/compressed_tensors.hpp b/include/infinicore/quantization/compressed_tensors.hpp new file mode 100644 index 000000000..0e3e45512 --- /dev/null +++ b/include/infinicore/quantization/compressed_tensors.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class CompressedTensors : public BaseQuantization { + // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. +public: + explicit CompressedTensors(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/none_quantizaiton.hpp b/include/infinicore/quantization/none_quantizaiton.hpp new file mode 100644 index 000000000..be5e4b377 --- /dev/null +++ b/include/infinicore/quantization/none_quantizaiton.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class NoneQuantization : public BaseQuantization { + // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. 
+public: + explicit NoneQuantization(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::NONE; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/quantization_scheme.hpp b/include/infinicore/quantization/quantization_scheme.hpp new file mode 100644 index 000000000..b4a8bc29c --- /dev/null +++ b/include/infinicore/quantization/quantization_scheme.hpp @@ -0,0 +1,12 @@ +// quant.hpp +#pragma once + +namespace infinicore::quantization { + +enum class QuantScheme { + NONE, + COMPRESSED_TENSOR_W8A8I8, + AWQ_W4A16, +}; + +} // namespace infinicore::quantization diff --git a/include/infiniop.h b/include/infiniop.h index 0ea2e2bc0..f03832b43 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -13,6 +13,7 @@ #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" @@ -22,6 +23,7 @@ #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" +#include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h new file mode 100644 index 000000000..ce21f4556 --- /dev/null +++ b/include/infiniop/ops/quant/per_channel_quant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ +#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t; + +__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerChannelQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc); + +#endif diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index e1ae309f5..bae47e33c 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -2,6 +2,7 @@ from .embedding import embedding from .flash_attention import flash_attention from .linear import linear +from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample from .rms_norm import rms_norm from .rope import RopeAlgo, rope @@ -19,4 +20,5 @@ "rope", "silu", "swiglu", + "linear_w8a8i8", ] diff --git a/python/infinicore/nn/functional/linear_w8a8i8.py b/python/infinicore/nn/functional/linear_w8a8i8.py new file mode 100644 index 000000000..33cb59b0e --- /dev/null +++ b/python/infinicore/nn/functional/linear_w8a8i8.py 
@@ -0,0 +1,31 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def linear_w8a8i8( + input: Tensor, + weight_packed: Tensor, + weight_scale: Tensor, + bias=None, + out=None, +) -> Tensor: + r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale.""" + + if out is None: + return Tensor( + _infinicore.linear_w8a8i8( + input._underlying, + weight_packed._underlying, + weight_scale._underlying, + None if bias is None else bias._underlying, + ) + ) + + _infinicore.linear_w8a8i8_( + out._underlying, + input._underlying, + weight_packed._underlying, + weight_scale._underlying, + None if bias is None else bias._underlying, + ) + return out diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc index 0be993699..d8e5a1c76 100644 --- a/src/infinicore/nn/linear.cc +++ b/src/infinicore/nn/linear.cc @@ -3,6 +3,7 @@ #include "infinicore/ops.hpp" #include "infinicore/ops/distributed/allreduce.hpp" #include "infinicore/ops/linear.hpp" +#include "infinicore/ops/linear_w8a8i8.hpp" #include #include @@ -18,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias, device_ = device; } -Tensor BaseLinear::compute_linear(Tensor &input) const { +BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device) + : in_features_(in_features), + out_features_(out_features), + quantization_(quantization), + has_bias_(bias), + dtype_(dtype) { - // Ensure input is contiguous before creating views (required for matmul) - // This prevents hanging when input tensor has non-contiguous memory layout - Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + device_ = device; +} - // Use ops::linear_ directly to match Python backend's exact code path - // This ensures identical computation and numerical results - // Parameter inherits from Tensor, so we cast to Tensor explicitly - Tensor weight_tensor = static_cast(weight_); - std::optional bias_opt = has_bias_ ? std::make_optional(static_cast(bias_)) : std::nullopt; +Tensor BaseLinear::compute_linear(Tensor &input) const { + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); - auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt); - return output; -} + Tensor weight_packed_tensor = static_cast(weight_); + Tensor weight_scale_tensor = static_cast(weight_scale_); + // weight_packed should be transposed and non-contiguous. + std::optional bias_opt = has_bias_ ? std::make_optional(static_cast(bias_)) : std::nullopt; + + auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt); + return output; + } + default: { + // Ensure input is contiguous before creating views (required for matmul) + // This prevents hanging when input tensor has non-contiguous memory layout + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + + // Use ops::linear_ directly to match Python backend's exact code path + // This ensures identical computation and numerical results + // Parameter inherits from Tensor, so we cast to Tensor explicitly + Tensor weight_tensor = static_cast(weight_); + std::optional bias_opt = has_bias_ ? 
std::make_optional(static_cast(bias_)) : std::nullopt; + + auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt); + return output; + } + } +} // namespace infinicore::nn Tensor BaseLinear::forward(Tensor &input) const { return compute_linear(input); @@ -71,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, // in_features, out_features, bias, static_cast(dtype_)); } +Linear::Linear(size_t in_features, size_t out_features, + std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); + + // Register bias parameter if requested + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + + // SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}", + // in_features, out_features, bias, static_cast(dtype_)); + break; + } + } +} + Tensor Linear::forward(Tensor &input) const { return BaseLinear::forward(input); } @@ -105,6 +168,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur } } +ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device, + Size tp_rank, Size tp_size) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_), + tp_rank_(tp_rank), + tp_size_(tp_size) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, + 0, tp_rank_, tp_size_)); + + // Register bias parameter if requested + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, + 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + break; + } + } +} + Tensor ColumnParallelLinear::forward(Tensor &input) const { return BaseLinear::forward(input); } @@ -138,6 +240,46 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo } } +RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device, 
+ Size tp_rank, Size tp_size, infinicclComm_t communicator) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_), + tp_rank_(tp_rank), + tp_size_(tp_size), communicator_(communicator) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, + 1, tp_rank_, tp_size_)); + + // Register bias parameter if requested + if (bias && (0 == tp_rank_)) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + + // SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}", + // in_features, out_features, bias, static_cast(dtype_)); + break; + } + } +} + Tensor RowParallelLinear::forward(Tensor &input) const { auto output = BaseLinear::forward(input); diff --git a/src/infinicore/ops/dequantize_awq/dequantize_awq.cc b/src/infinicore/ops/dequantize_awq/dequantize_awq.cc new file mode 100644 index 000000000..dff92b6ec --- /dev/null +++ b/src/infinicore/ops/dequantize_awq/dequantize_awq.cc @@ -0,0 +1,20 @@ +#include "infinicore/ops/dequantize_awq.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ); + +DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros); +} + +void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros); +} + +void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc b/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc new file mode 100644 index 000000000..3e643ee40 --- /dev/null +++ b/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/dequantize_awq.hpp" +#include + +namespace infinicore::op::dequantize_awq_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DequantizeAWQ, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, x_packed, x_scale, x_zeros; +}; + +void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + size_t seed = hash_combine(x, x_packed, x_scale, x_zeros); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, DequantizeAWQ, + seed, + x->desc(), x_packed->desc(), 
x_scale->desc(), x_zeros->desc());
+
+    INFINIOP_WORKSPACE_TENSOR(workspace, DequantizeAWQ, descriptor);
+
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(x),
+        graph::GraphTensor(x_packed),
+        graph::GraphTensor(x_scale),
+        graph::GraphTensor(x_zeros)};
+}
+
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+
+    INFINICORE_CHECK_ERROR(infiniopDequantizeAWQ(
+        planned->descriptor->desc,
+        planned->workspace->data(),
+        planned->workspace->numel(),
+        planned->x->data(),
+        planned->x_packed->data(),
+        planned->x_scale->data(),
+        planned->x_zeros->data(),
+        context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}
+
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(DequantizeAWQ, &plan, &run, &cleanup);
+} // namespace infinicore::op::dequantize_awq_impl::infiniop
diff --git a/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc b/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc
new file mode 100644
index 000000000..2b0255c4b
--- /dev/null
+++ b/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc
@@ -0,0 +1,60 @@
+#include "infinicore/ops/linear_w4a16_awq.hpp"
+#include "infinicore/ops/dequantize_awq.hpp"
+#include "infinicore/ops/gemm.hpp"
+
+namespace infinicore::op {
+
+Tensor linear_w4a16_awq(Tensor input,
+                        Tensor weight_packed,
+                        Tensor weight_scale,
+                        Tensor weight_zeros,
+                        std::optional<Tensor> bias) {
+
+    // Input is of shape [M, K]; weight_packed is of shape [N, K], strides are [N, 1].
+    Size ndim = input->ndim();
+    Size out_features = weight_packed->shape()[0];
+
+    // Allocate the output tensor.
+    auto output_shape = input->shape();
+    output_shape[ndim - 1] = out_features;
+    auto out = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    // Compute in place.
+    linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
+    return out;
+}
+
+void linear_w4a16_awq_(Tensor out,
+                       Tensor input,
+                       Tensor weight_packed,
+                       Tensor weight_scale,
+                       Tensor weight_zeros,
+                       std::optional<Tensor> bias) {
+
+    auto weight_packed_shape = weight_packed->shape();
+    Size out_features = weight_packed_shape[0];
+    Size in_features = weight_packed_shape[1];
+
+    Size ndim = input->ndim();
+    assert(out->ndim() == ndim);
+
+    Size N = 1;
+    auto input_shape = input->shape();
+    for (size_t i = 0; i < ndim - 1; ++i) {
+        N *= input_shape[i];
+    }
+
+    auto weight = Tensor::empty(
+        {out_features, in_features},
+        out->dtype(),
+        weight_packed->device());
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
+    bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
+    gemm_(out->view({N, out_features}),
+          input->view({N, in_features}),
+          weight->permute({1, 0}), alpha, beta);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc b/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc
new file mode 100644
index 000000000..d69e0e7a2
--- /dev/null
+++ b/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc
@@ -0,0 +1,66 @@
+#include "infinicore/ops/linear_w8a8i8.hpp"
+#include "infinicore/ops/per_channel_quant_i8.hpp"
+#include "infinicore/ops/scaled_mm_i8.hpp"
+
+namespace infinicore::op {
+
+Tensor linear_w8a8i8(Tensor input,
+                     Tensor weight_packed,
+                     Tensor weight_scale,
+                     std::optional<Tensor> bias) {
+
+    // Input is of shape [M, K]; weight_packed is of shape [N, K], strides are [N, 1].
+    Size ndim = input->ndim();
+    Size
out_features = weight_packed->shape()[0]; + + // Assign memory to out variables + auto output_shape = input->shape(); + output_shape[ndim - 1] = out_features; + auto out = Tensor::empty(output_shape, input->dtype(), input->device()); + + // Inplace Calculate + linear_w8a8i8_(out, input, weight_packed, weight_scale, bias); + return out; +} + +void linear_w8a8i8_(Tensor out, + Tensor input, + Tensor weight_packed, + Tensor weight_scale, + std::optional bias) { + + auto weight_packed_shape = weight_packed->shape(); + Size out_features = weight_packed_shape[0]; + Size in_features = weight_packed_shape[1]; + + Size ndim = input->ndim(); + assert(out->ndim() == ndim); + + Size N = 1; + auto input_shape = input->shape(); + for (size_t i = 0; i < ndim - 1; ++i) { + N *= input_shape[i]; + } + + auto input_packed = Tensor::empty( + {N, input_shape[ndim - 1]}, + DataType::I8, + input->device()); + auto input_scale = Tensor::empty( + {N, 1}, + DataType::F32, + input->device()); + op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale); + if (bias.has_value()) { + bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1})); + } + op::scaled_mm_i8_( + out->view({N, out_features}), + input_packed, + input_scale, + weight_packed->permute({1, 0}), + weight_scale, + bias); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc new file mode 100644 index 000000000..40ddefbfe --- /dev/null +++ b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc @@ -0,0 +1,20 @@ +#include "infinicore/ops/per_channel_quant_i8.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8); + +PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale); +} + +void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale); +} + +void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) { + PerChannelQuantI8::execute(x, x_packed, x_scale); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc new file mode 100644 index 000000000..569c9fdae --- /dev/null +++ b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/per_channel_quant_i8.hpp" +#include + +namespace infinicore::op::per_channel_quant_i8_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, x_packed, x_scale; +}; + +void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) { + size_t seed = hash_combine(x, x_packed, x_scale); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, PerChannelQuantI8, + seed, + x_packed->desc(), x_scale->desc(), nullptr, x->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor); + + return new PlannedMeta{ + 
descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(x_packed), + graph::GraphTensor(x_scale)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x_packed->data(), + planned->x_scale->data(), + nullptr, + planned->x->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup); + +} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop diff --git a/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc new file mode 100644 index 000000000..6d6a5f8ff --- /dev/null +++ b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc @@ -0,0 +1,21 @@ +#include "infinicore/ops/scaled_mm_i8.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(I8Gemm); + +I8Gemm::I8Gemm(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a_p, a_s, b_p, b_s); + INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a_p, a_s, b_p, b_s, bias); +} +void I8Gemm::execute(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(I8Gemm, c, a_p, a_s, b_p, b_s, bias); +} + +void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + I8Gemm::execute(c, a_p, a_s, b_p, b_s, bias); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc new file mode 100644 index 000000000..952b570cc --- /dev/null +++ b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc @@ -0,0 +1,65 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/scaled_mm_i8.hpp" +#include + +namespace infinicore::op::scaled_mm_i8_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, I8Gemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, c, a_p, a_s, b_p, b_s; + std::optional bias; +}; + +void *plan(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + size_t seed = hash_combine(c, a_p, a_s, b_p, b_s); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, I8Gemm, + seed, + c->desc(), bias.has_value() ? bias.value()->desc() : nullptr, + a_p->desc(), a_s->desc(), b_p->desc(), b_s->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, I8Gemm, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(c), + graph::GraphTensor(a_p), + graph::GraphTensor(a_s), + graph::GraphTensor(b_p), + graph::GraphTensor(b_s), + // bias.has_value() ? bias.value()->desc() : nullptr}; + bias ? 
std::optional(graph::GraphTensor(*bias)) : std::nullopt}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopI8Gemm( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->c->data(), + // planned->bias->data(), + planned->bias.has_value() ? planned->bias.value()->data() : nullptr, + planned->a_p->data(), + planned->a_s->data(), + planned->b_p->data(), + planned->b_s->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(I8Gemm, &plan, &run, &cleanup); + +} // namespace infinicore::op::scaled_mm_i8_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index c7dcae6ca..fd3aaf3ff 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -10,6 +10,7 @@ #include "ops/flash_attention.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" +#include "ops/linear_w8a8i8.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/paged_attention.hpp" @@ -46,6 +47,7 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_linear_w8a8i8(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/linear_w8a8i8.hpp b/src/infinicore/pybind11/ops/linear_w8a8i8.hpp new file mode 100644 index 000000000..926d554b1 --- /dev/null +++ b/src/infinicore/pybind11/ops/linear_w8a8i8.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include "infinicore/ops/linear_w8a8i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +Tensor py_linear_w8a8i8(Tensor input, + Tensor weight_packed, + Tensor weight_scale, + pybind11::object bias) { + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + return op::linear_w8a8i8(input, weight_packed, weight_scale, bias_tensor); +} + +void py_linear_w8a8i8_(Tensor out, + Tensor input, + Tensor weight_packed, + Tensor weight_scale, + pybind11::object bias) { + + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + + op::linear_w8a8i8_(out, input, weight_packed, weight_scale, bias_tensor); +} + +inline void bind_linear_w8a8i8(py::module &m) { + m.def("linear_w8a8i8", + &ops::py_linear_w8a8i8, + py::arg("input"), + py::arg("weight_packed"), + py::arg("weight_scale"), + py::arg("bias") = py::none(), + R"doc(linear_w8a8i8.)doc"); + m.def("linear_w8a8i8_", + &ops::py_linear_w8a8i8_, + py::arg("out"), + py::arg("input"), + py::arg("weight_packed"), + py::arg("weight_scale"), + py::arg("bias") = py::none(), + R"doc(linear_w8a8i8_.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp b/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp new file mode 100644 index 000000000..da6f9f592 --- /dev/null +++ b/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "infinicore/ops/per_channel_quant_i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_per_channel_quant_i8(py::module &m) { + m.def("per_channel_quant_i8_", + &op::per_channel_quant_i8_, + py::arg("x"), + py::arg("x_packed"), + py::arg("x_scale"), + R"doc(Per-channel quantization of a tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/scaled_mm_i8.hpp 
b/src/infinicore/pybind11/ops/scaled_mm_i8.hpp new file mode 100644 index 000000000..c3d46d9df --- /dev/null +++ b/src/infinicore/pybind11/ops/scaled_mm_i8.hpp @@ -0,0 +1,30 @@ +#pragma once + +#include + +#include "infinicore/ops/scaled_mm_i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_scaled_mm_i8(py::module &m) { + m.def("scaled_mm_i8", + &op::scaled_mm_i8, + py::arg("a_p"), + py::arg("a_s"), + py::arg("b_p"), + py::arg("b_s"), + py::arg("bias"), + R"doc(Scaled matrix multiplication of two tensors.)doc"); + + m.def("scaled_mm_i8_", + &op::scaled_mm_i8_, + py::arg("a"), + py::arg("b"), + py::arg("a_scale"), + py::arg("b_scale"), + R"doc(In-place Scaled matrix multiplication of two tensors.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc index 0ae1946e3..b57b00a52 100644 --- a/src/infinicore/tensor/debug.cc +++ b/src/infinicore/tensor/debug.cc @@ -95,6 +95,20 @@ void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &st } } +// Function for printing I8 data +void print_data_i8(const int8_t *data, const Shape &shape, const Strides &strides, size_t dim) { + if (dim == shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + std::cout << static_cast(data[i * strides[dim]]) << " "; + } + std::cout << std::endl; + } else if (dim < shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + print_data_i8(data + i * strides[dim], shape, strides, dim + 1); + } + } +} + // Template function for writing data recursively to binary file (handles non-contiguous tensors) template void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) { @@ -181,8 +195,8 @@ void TensorImpl::debug(const std::string &filename) const { cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::I8: - print_data(reinterpret_cast(cpu_data), - cpu_tensor->shape(), cpu_tensor->strides(), 0); + print_data_i8(reinterpret_cast(cpu_data), + cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::BF16: print_data_bf16(reinterpret_cast(cpu_data), diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..3c014de9b --- /dev/null +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -0,0 +1,273 @@ +#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ + +#include +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? (int)r : -(int)r; +} + +template +__device__ void blockPerChannelQuantI8Kernel( + int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + // ---- 1. reduce max ---- + float local_max = op::common_cuda::reduce_op::max( + x + tid, K); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. 
reduce min ----
+    float thread_min = __FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_min = fminf(thread_min, (float)x[tid + ind]);
+    }
+#if CUDART_VERSION >= 12090
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, ::cuda::minimum());
+#else
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min());
+#endif
+    __shared__ float global_min_f;
+    if (threadIdx.x == 0) {
+        global_min_f = local_min;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+    float global_min = global_min_f;
+
+    float scale = (global_max - global_min) / 255.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+    float zero = -global_min * inv_scale - 128.0f;
+
+    x_scale[row] = (Tdata)scale;
+    x_zero[row] = (Tdata)zero;
+
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale + zero;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -128) {
+            q = -128;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <unsigned int BLOCK_SIZE, typename Tdata>
+__device__ void blockPerChannelQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    int tid = row * K;
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 1. reduce max of |x| ----
+    float thread_max = -__FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, fabs((float)x[tid + ind]));
+    }
+#if CUDART_VERSION >= 12090
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, ::cuda::maximum());
+#else
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max());
+#endif
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+
+    float scale = global_max / 127.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+
+    x_scale[row] = (Tdata)scale;
+
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -127) {
+            q = -127;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <typename T>
+struct MaxOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
+    }
+};
+template <typename T>
+struct MinOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return min(a, b);
+    }
+};
+template
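
Reference note (outside the diff): the W8A8I8 path added above quantizes activations per row with a symmetric int8 scheme (scale = max|x| / 127, values rounded and clamped to [-127, 127], as in blockPerChannelQuantI8SymKernel), multiplies the int8 activations against the pre-quantized int8 weights, and rescales the int32 accumulator by the activation and weight scales before adding the optional bias. The NumPy sketch below only mirrors those semantics for illustration; it is not the library implementation, the helper names are invented here, and np.rint rounds half to even whereas the kernel rounds half away from zero.

    import numpy as np

    def per_channel_quant_i8(x):
        # Symmetric per-row int8 quantization: one scale per row of x.
        scale = np.maximum(np.abs(x).max(axis=1, keepdims=True) / 127.0, 1e-8)
        q = np.clip(np.rint(x / scale), -127, 127).astype(np.int8)
        return q, scale.astype(np.float32)

    def linear_w8a8i8_ref(x, w_packed, w_scale, bias=None):
        # x: [M, K] float, w_packed: [N, K] int8, w_scale: [N, 1] float32.
        x_q, x_scale = per_channel_quant_i8(x)                    # [M, K] int8, [M, 1] f32
        acc = x_q.astype(np.int32) @ w_packed.astype(np.int32).T  # [M, N] int32 accumulator
        out = acc.astype(np.float32) * x_scale * w_scale.reshape(1, -1)
        return out if bias is None else out + bias

Through the new binding, the corresponding call is infinicore.nn.functional.linear_w8a8i8(input, weight_packed, weight_scale, bias=None), or linear_w8a8i8_ with a preallocated out tensor.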