diff --git a/.gitmodules b/.gitmodules index eab6041af..bca919479 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ [submodule "third_party/spdlog"] path = third_party/spdlog url = https://github.com/gabime/spdlog.git +[submodule "third_party/nlohmann_json"] + path = third_party/nlohmann_json + url = https://github.com/nlohmann/json.git + branch = master diff --git a/include/infinicore.hpp b/include/infinicore.hpp index 95e4243d9..a7bd1a497 100644 --- a/include/infinicore.hpp +++ b/include/infinicore.hpp @@ -3,4 +3,5 @@ #include "infinicore/device_event.hpp" #include "infinicore/nn.hpp" #include "infinicore/ops.hpp" +#include "infinicore/quantization.hpp" #include "infinicore/tensor.hpp" diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp index e77a432c2..667a980fa 100644 --- a/include/infinicore/nn/linear.hpp +++ b/include/infinicore/nn/linear.hpp @@ -1,8 +1,10 @@ #pragma once #include "../ops.hpp" +#include "../quantization.hpp" #include "module.hpp" #include +#include namespace infinicore::nn { @@ -11,6 +13,9 @@ class BaseLinear : public Module { BaseLinear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device()); + BaseLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device()); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -27,12 +32,17 @@ class BaseLinear : public Module { // Accessors for parameters Tensor weight() const { return weight_; } Tensor bias() const { return bias_; } + Tensor weight_scale() const { return weight_scale_; } + Tensor weight_zeros() const { return weight_zeros_; } protected: // Parameters INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(bias); + INFINICORE_NN_PARAMETER(weight_scale); + INFINICORE_NN_PARAMETER(weight_zeros); + protected: // Helper method for common forward computation Tensor compute_linear(Tensor &input) const; @@ -41,6 +51,7 @@ class BaseLinear : public Module { size_t out_features_; bool has_bias_; DataType dtype_; + std::shared_ptr quantization_ = std::make_shared(nullptr); }; } // namespace infinicore::nn @@ -52,6 +63,9 @@ class Linear : public BaseLinear { Linear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device()); + Linear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device()); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -65,6 +79,10 @@ class ColumnParallelLinear : public BaseLinear { const DataType &dtype = DataType::F32, const Device &device = Device(), Size tp_rank = 0, Size tp_size = 1); + ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType &dtype = DataType::F32, const Device &device = Device(), + Size tp_rank = 0, Size tp_size = 1); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; @@ -82,6 +100,10 @@ class RowParallelLinear : public BaseLinear { const DataType &dtype = DataType::F32, const Device &device = Device(), Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); + RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias = true, + const DataType 
&dtype = DataType::F32, const Device &device = Device(), + Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); + // Forward pass: output = input @ weight.T + bias Tensor forward(Tensor &input) const; diff --git a/include/infinicore/ops/dequantize_awq.hpp b/include/infinicore/ops/dequantize_awq.hpp new file mode 100644 index 000000000..50e4328f3 --- /dev/null +++ b/include/infinicore/ops/dequantize_awq.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "../device.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { +INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &); + +void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros); +} // namespace infinicore::op diff --git a/include/infinicore/ops/linear_w4a16_awq.hpp b/include/infinicore/ops/linear_w4a16_awq.hpp new file mode 100644 index 000000000..ebae7a685 --- /dev/null +++ b/include/infinicore/ops/linear_w4a16_awq.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional bias); + +void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional bias); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/linear_w8a8i8.hpp b/include/infinicore/ops/linear_w8a8i8.hpp new file mode 100644 index 000000000..08cadc111 --- /dev/null +++ b/include/infinicore/ops/linear_w8a8i8.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional bias); + +void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional bias); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/per_channel_quant_i8.hpp b/include/infinicore/ops/per_channel_quant_i8.hpp new file mode 100644 index 000000000..0b0296248 --- /dev/null +++ b/include/infinicore/ops/per_channel_quant_i8.hpp @@ -0,0 +1,12 @@ +#pragma once +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(PerChannelQuantI8, const Tensor &, Tensor, Tensor); + +void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale); +} // namespace infinicore::op diff --git a/include/infinicore/ops/scaled_mm_i8.hpp b/include/infinicore/ops/scaled_mm_i8.hpp new file mode 100644 index 000000000..427ed9d65 --- /dev/null +++ b/include/infinicore/ops/scaled_mm_i8.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(I8Gemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, std::optional); + +void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias); +} // namespace infinicore::op diff --git a/include/infinicore/quantization.hpp b/include/infinicore/quantization.hpp new file mode 100644 index 000000000..7b01312ba --- /dev/null +++ b/include/infinicore/quantization.hpp @@ -0,0 +1,7 @@ +#pragma once + +#include "quantization/awq.hpp" +#include "quantization/base_quantization.hpp" +#include "quantization/compressed_tensors.hpp" 
+#include "quantization/none_quantizaiton.hpp" +#include "quantization/quantization_scheme.hpp" diff --git a/include/infinicore/quantization/awq.hpp b/include/infinicore/quantization/awq.hpp new file mode 100644 index 000000000..bbbbab1cb --- /dev/null +++ b/include/infinicore/quantization/awq.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class AWQ : public BaseQuantization { + // This is a temporary class that currently only returns AWQ_W4A16. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. +public: + explicit AWQ(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::AWQ_W4A16; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/base_quantization.hpp b/include/infinicore/quantization/base_quantization.hpp new file mode 100644 index 000000000..4cc9b325e --- /dev/null +++ b/include/infinicore/quantization/base_quantization.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "nlohmann/json.hpp" +#include "quantization_scheme.hpp" + +namespace infinicore::quantization { +class BaseQuantization { + // Base class for quantization schemes. Intended to be extended to support various quantization methods. +public: + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; + virtual ~BaseQuantization() = default; + + virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0; + +protected: + nlohmann::json quant_config_; +}; +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/compressed_tensors.hpp b/include/infinicore/quantization/compressed_tensors.hpp new file mode 100644 index 000000000..0e3e45512 --- /dev/null +++ b/include/infinicore/quantization/compressed_tensors.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class CompressedTensors : public BaseQuantization { + // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. +public: + explicit CompressedTensors(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/none_quantizaiton.hpp b/include/infinicore/quantization/none_quantizaiton.hpp new file mode 100644 index 000000000..be5e4b377 --- /dev/null +++ b/include/infinicore/quantization/none_quantizaiton.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class NoneQuantization : public BaseQuantization { + // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. 
+public: + explicit NoneQuantization(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::NONE; + }; +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/quantization_scheme.hpp b/include/infinicore/quantization/quantization_scheme.hpp new file mode 100644 index 000000000..b4a8bc29c --- /dev/null +++ b/include/infinicore/quantization/quantization_scheme.hpp @@ -0,0 +1,12 @@ +// quant.hpp +#pragma once + +namespace infinicore::quantization { + +enum class QuantScheme { + NONE, + COMPRESSED_TENSOR_W8A8I8, + AWQ_W4A16, +}; + +} // namespace infinicore::quantization diff --git a/include/infiniop.h b/include/infiniop.h index 0ea2e2bc0..f03832b43 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -13,6 +13,7 @@ #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" @@ -22,6 +23,7 @@ #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" +#include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h new file mode 100644 index 000000000..ce21f4556 --- /dev/null +++ b/include/infiniop/ops/quant/per_channel_quant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ +#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t; + +__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle, + infiniopPerChannelQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc); + +#endif diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index e1ae309f5..bae47e33c 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -2,6 +2,7 @@ from .embedding import embedding from .flash_attention import flash_attention from .linear import linear +from .linear_w8a8i8 import linear_w8a8i8 from .random_sample import random_sample from .rms_norm import rms_norm from .rope import RopeAlgo, rope @@ -19,4 +20,5 @@ "rope", "silu", "swiglu", + "linear_w8a8i8", ] diff --git a/python/infinicore/nn/functional/linear_w8a8i8.py b/python/infinicore/nn/functional/linear_w8a8i8.py new file mode 100644 index 000000000..33cb59b0e --- /dev/null +++ b/python/infinicore/nn/functional/linear_w8a8i8.py 
@@ -0,0 +1,31 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def linear_w8a8i8( + input: Tensor, + weight_packed: Tensor, + weight_scale: Tensor, + bias=None, + out=None, +) -> Tensor: + r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale.""" + + if out is None: + return Tensor( + _infinicore.linear_w8a8i8( + input._underlying, + weight_packed._underlying, + weight_scale._underlying, + None if bias is None else bias._underlying, + ) + ) + + _infinicore.linear_w8a8i8_( + out._underlying, + input._underlying, + weight_packed._underlying, + weight_scale._underlying, + None if bias is None else bias._underlying, + ) + return out diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc index 0be993699..d8e5a1c76 100644 --- a/src/infinicore/nn/linear.cc +++ b/src/infinicore/nn/linear.cc @@ -3,6 +3,7 @@ #include "infinicore/ops.hpp" #include "infinicore/ops/distributed/allreduce.hpp" #include "infinicore/ops/linear.hpp" +#include "infinicore/ops/linear_w8a8i8.hpp" #include #include @@ -18,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias, device_ = device; } -Tensor BaseLinear::compute_linear(Tensor &input) const { +BaseLinear::BaseLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device) + : in_features_(in_features), + out_features_(out_features), + quantization_(quantization), + has_bias_(bias), + dtype_(dtype) { - // Ensure input is contiguous before creating views (required for matmul) - // This prevents hanging when input tensor has non-contiguous memory layout - Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + device_ = device; +} - // Use ops::linear_ directly to match Python backend's exact code path - // This ensures identical computation and numerical results - // Parameter inherits from Tensor, so we cast to Tensor explicitly - Tensor weight_tensor = static_cast(weight_); - std::optional bias_opt = has_bias_ ? std::make_optional(static_cast(bias_)) : std::nullopt; +Tensor BaseLinear::compute_linear(Tensor &input) const { + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); - auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt); - return output; -} + Tensor weight_packed_tensor = static_cast(weight_); + Tensor weight_scale_tensor = static_cast(weight_scale_); + // weight_packed should be transposed and non-contiguous. + std::optional bias_opt = has_bias_ ? std::make_optional(static_cast(bias_)) : std::nullopt; + + auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt); + return output; + } + default: { + // Ensure input is contiguous before creating views (required for matmul) + // This prevents hanging when input tensor has non-contiguous memory layout + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + + // Use ops::linear_ directly to match Python backend's exact code path + // This ensures identical computation and numerical results + // Parameter inherits from Tensor, so we cast to Tensor explicitly + Tensor weight_tensor = static_cast(weight_); + std::optional bias_opt = has_bias_ ? 
std::make_optional(static_cast(bias_)) : std::nullopt; + + auto output = infinicore::op::linear(input_contiguous->contiguous(), weight_tensor->contiguous(), bias_opt); + return output; + } + } +} // namespace infinicore::nn Tensor BaseLinear::forward(Tensor &input) const { return compute_linear(input); @@ -71,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, // in_features, out_features, bias, static_cast(dtype_)); } +Linear::Linear(size_t in_features, size_t out_features, + std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); + + // Register bias parameter if requested + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + + // SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}", + // in_features, out_features, bias, static_cast(dtype_)); + break; + } + } +} + Tensor Linear::forward(Tensor &input) const { return BaseLinear::forward(input); } @@ -105,6 +168,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur } } +ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device, + Size tp_rank, Size tp_size) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_), + tp_rank_(tp_rank), + tp_size_(tp_size) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 0, tp_rank_, tp_size_)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, tp_rank_, tp_size_)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, + 0, tp_rank_, tp_size_)); + + // Register bias parameter if requested + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, + 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + break; + } + } +} + Tensor ColumnParallelLinear::forward(Tensor &input) const { return BaseLinear::forward(input); } @@ -138,6 +240,46 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo } } +RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr quantization, bool bias, + const DataType &dtype, const Device &device, 
+ Size tp_rank, Size tp_size, infinicclComm_t communicator) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device_), + tp_rank_(tp_rank), + tp_size_(tp_size), communicator_(communicator) { + + device_ = device; + + switch (this->quantization_->get_quant_scheme()) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, infinicore::DataType::I8, device, 1, tp_rank_, tp_size_)); + INFINICORE_NN_PARAMETER_INIT(weight_scale, ({out_features, 1}, infinicore::DataType::F32, device, 0, 0, 1)); + + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); + } + break; + } + default: { + // Initialize parameters using macro + INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, + 1, tp_rank_, tp_size_)); + + // Register bias parameter if requested + if (bias && (0 == tp_rank_)) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); // Default constructed empty parameter + } + + // SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}", + // in_features, out_features, bias, static_cast(dtype_)); + break; + } + } +} + Tensor RowParallelLinear::forward(Tensor &input) const { auto output = BaseLinear::forward(input); diff --git a/src/infinicore/ops/dequantize_awq/dequantize_awq.cc b/src/infinicore/ops/dequantize_awq/dequantize_awq.cc new file mode 100644 index 000000000..dff92b6ec --- /dev/null +++ b/src/infinicore/ops/dequantize_awq/dequantize_awq.cc @@ -0,0 +1,20 @@ +#include "infinicore/ops/dequantize_awq.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ); + +DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros); +} + +void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros); +} + +void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc b/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc new file mode 100644 index 000000000..3e643ee40 --- /dev/null +++ b/src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/dequantize_awq.hpp" +#include + +namespace infinicore::op::dequantize_awq_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DequantizeAWQ, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, x_packed, x_scale, x_zeros; +}; + +void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) { + size_t seed = hash_combine(x, x_packed, x_scale, x_zeros); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, DequantizeAWQ, + seed, + x->desc(), x_packed->desc(), 
x_scale->desc(), x_zeros->desc());
+
+    INFINIOP_WORKSPACE_TENSOR(workspace, DequantizeAWQ, descriptor);
+
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(x),
+        graph::GraphTensor(x_packed),
+        graph::GraphTensor(x_scale),
+        graph::GraphTensor(x_zeros)};
+}
+
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+
+    INFINICORE_CHECK_ERROR(infiniopDequantizeAWQ(
+        planned->descriptor->desc,
+        planned->workspace->data(),
+        planned->workspace->numel(),
+        planned->x->data(),
+        planned->x_packed->data(),
+        planned->x_scale->data(),
+        planned->x_zeros->data(),
+        context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}
+
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(DequantizeAWQ, &plan, &run, &cleanup);
+} // namespace infinicore::op::dequantize_awq_impl::infiniop
diff --git a/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc b/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc
new file mode 100644
index 000000000..2b0255c4b
--- /dev/null
+++ b/src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc
@@ -0,0 +1,60 @@
+#include "infinicore/ops/linear_w4a16_awq.hpp"
+#include "infinicore/ops/dequantize_awq.hpp"
+#include "infinicore/ops/gemm.hpp"
+
+namespace infinicore::op {
+
+Tensor linear_w4a16_awq(Tensor input,
+                        Tensor weight_packed,
+                        Tensor weight_scale,
+                        Tensor weight_zeros,
+                        std::optional<Tensor> bias) {
+
+    // Input is of shape [M, K]; weight_packed is of shape [N, K], strides are [N, 1].
+    Size ndim = input->ndim();
+    Size out_features = weight_packed->shape()[0];
+
+    // Allocate the output tensor.
+    auto output_shape = input->shape();
+    output_shape[ndim - 1] = out_features;
+    auto out = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    // Compute in place.
+    linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
+    return out;
+}
+
+void linear_w4a16_awq_(Tensor out,
+                       Tensor input,
+                       Tensor weight_packed,
+                       Tensor weight_scale,
+                       Tensor weight_zeros,
+                       std::optional<Tensor> bias) {
+
+    auto weight_packed_shape = weight_packed->shape();
+    Size out_features = weight_packed_shape[0];
+    Size in_features = weight_packed_shape[1];
+
+    Size ndim = input->ndim();
+    assert(out->ndim() == ndim);
+
+    Size N = 1;
+    auto input_shape = input->shape();
+    for (size_t i = 0; i < ndim - 1; ++i) {
+        N *= input_shape[i];
+    }
+
+    auto weight = Tensor::empty(
+        {out_features, in_features},
+        out->dtype(),
+        weight_packed->device());
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
+    bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
+    gemm_(out->view({N, out_features}),
+          input->view({N, in_features}),
+          weight->permute({1, 0}), alpha, beta);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc b/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc
new file mode 100644
index 000000000..d69e0e7a2
--- /dev/null
+++ b/src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc
@@ -0,0 +1,66 @@
+#include "infinicore/ops/linear_w8a8i8.hpp"
+#include "infinicore/ops/per_channel_quant_i8.hpp"
+#include "infinicore/ops/scaled_mm_i8.hpp"
+
+namespace infinicore::op {
+
+Tensor linear_w8a8i8(Tensor input,
+                     Tensor weight_packed,
+                     Tensor weight_scale,
+                     std::optional<Tensor> bias) {
+
+    // Input is of shape [M, K]; weight_packed is of shape [N, K], strides are [N, 1].
+    Size ndim = input->ndim();
+    Size
out_features = weight_packed->shape()[0]; + + // Assign memory to out variables + auto output_shape = input->shape(); + output_shape[ndim - 1] = out_features; + auto out = Tensor::empty(output_shape, input->dtype(), input->device()); + + // Inplace Calculate + linear_w8a8i8_(out, input, weight_packed, weight_scale, bias); + return out; +} + +void linear_w8a8i8_(Tensor out, + Tensor input, + Tensor weight_packed, + Tensor weight_scale, + std::optional bias) { + + auto weight_packed_shape = weight_packed->shape(); + Size out_features = weight_packed_shape[0]; + Size in_features = weight_packed_shape[1]; + + Size ndim = input->ndim(); + assert(out->ndim() == ndim); + + Size N = 1; + auto input_shape = input->shape(); + for (size_t i = 0; i < ndim - 1; ++i) { + N *= input_shape[i]; + } + + auto input_packed = Tensor::empty( + {N, input_shape[ndim - 1]}, + DataType::I8, + input->device()); + auto input_scale = Tensor::empty( + {N, 1}, + DataType::F32, + input->device()); + op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale); + if (bias.has_value()) { + bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1})); + } + op::scaled_mm_i8_( + out->view({N, out_features}), + input_packed, + input_scale, + weight_packed->permute({1, 0}), + weight_scale, + bias); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc new file mode 100644 index 000000000..40ddefbfe --- /dev/null +++ b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc @@ -0,0 +1,20 @@ +#include "infinicore/ops/per_channel_quant_i8.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8); + +PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale); +} + +void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale); +} + +void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) { + PerChannelQuantI8::execute(x, x_packed, x_scale); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc new file mode 100644 index 000000000..569c9fdae --- /dev/null +++ b/src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/per_channel_quant_i8.hpp" +#include + +namespace infinicore::op::per_channel_quant_i8_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, x_packed, x_scale; +}; + +void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) { + size_t seed = hash_combine(x, x_packed, x_scale); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, PerChannelQuantI8, + seed, + x_packed->desc(), x_scale->desc(), nullptr, x->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor); + + return new PlannedMeta{ + 
descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(x_packed), + graph::GraphTensor(x_scale)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x_packed->data(), + planned->x_scale->data(), + nullptr, + planned->x->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup); + +} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop diff --git a/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc new file mode 100644 index 000000000..6d6a5f8ff --- /dev/null +++ b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8.cc @@ -0,0 +1,21 @@ +#include "infinicore/ops/scaled_mm_i8.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(I8Gemm); + +I8Gemm::I8Gemm(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a_p, a_s, b_p, b_s); + INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a_p, a_s, b_p, b_s, bias); +} +void I8Gemm::execute(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(I8Gemm, c, a_p, a_s, b_p, b_s, bias); +} + +void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + I8Gemm::execute(c, a_p, a_s, b_p, b_s, bias); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc new file mode 100644 index 000000000..952b570cc --- /dev/null +++ b/src/infinicore/ops/scaled_mm_i8/scaled_mm_i8_infiniop.cc @@ -0,0 +1,65 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/scaled_mm_i8.hpp" +#include + +namespace infinicore::op::scaled_mm_i8_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, I8Gemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, c, a_p, a_s, b_p, b_s; + std::optional bias; +}; + +void *plan(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional bias) { + size_t seed = hash_combine(c, a_p, a_s, b_p, b_s); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, I8Gemm, + seed, + c->desc(), bias.has_value() ? bias.value()->desc() : nullptr, + a_p->desc(), a_s->desc(), b_p->desc(), b_s->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, I8Gemm, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(c), + graph::GraphTensor(a_p), + graph::GraphTensor(a_s), + graph::GraphTensor(b_p), + graph::GraphTensor(b_s), + // bias.has_value() ? bias.value()->desc() : nullptr}; + bias ? 
std::optional(graph::GraphTensor(*bias)) : std::nullopt}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopI8Gemm( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->c->data(), + // planned->bias->data(), + planned->bias.has_value() ? planned->bias.value()->data() : nullptr, + planned->a_p->data(), + planned->a_s->data(), + planned->b_p->data(), + planned->b_s->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(I8Gemm, &plan, &run, &cleanup); + +} // namespace infinicore::op::scaled_mm_i8_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index c7dcae6ca..fd3aaf3ff 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -10,6 +10,7 @@ #include "ops/flash_attention.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" +#include "ops/linear_w8a8i8.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/paged_attention.hpp" @@ -46,6 +47,7 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_linear_w8a8i8(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/linear_w8a8i8.hpp b/src/infinicore/pybind11/ops/linear_w8a8i8.hpp new file mode 100644 index 000000000..926d554b1 --- /dev/null +++ b/src/infinicore/pybind11/ops/linear_w8a8i8.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include "infinicore/ops/linear_w8a8i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +Tensor py_linear_w8a8i8(Tensor input, + Tensor weight_packed, + Tensor weight_scale, + pybind11::object bias) { + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + return op::linear_w8a8i8(input, weight_packed, weight_scale, bias_tensor); +} + +void py_linear_w8a8i8_(Tensor out, + Tensor input, + Tensor weight_packed, + Tensor weight_scale, + pybind11::object bias) { + + std::optional bias_tensor = std::nullopt; + if (!bias.is_none()) { + bias_tensor = bias.cast(); + } + + op::linear_w8a8i8_(out, input, weight_packed, weight_scale, bias_tensor); +} + +inline void bind_linear_w8a8i8(py::module &m) { + m.def("linear_w8a8i8", + &ops::py_linear_w8a8i8, + py::arg("input"), + py::arg("weight_packed"), + py::arg("weight_scale"), + py::arg("bias") = py::none(), + R"doc(linear_w8a8i8.)doc"); + m.def("linear_w8a8i8_", + &ops::py_linear_w8a8i8_, + py::arg("out"), + py::arg("input"), + py::arg("weight_packed"), + py::arg("weight_scale"), + py::arg("bias") = py::none(), + R"doc(linear_w8a8i8_.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp b/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp new file mode 100644 index 000000000..da6f9f592 --- /dev/null +++ b/src/infinicore/pybind11/ops/per_channel_quant_i8.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "infinicore/ops/per_channel_quant_i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_per_channel_quant_i8(py::module &m) { + m.def("per_channel_quant_i8_", + &op::per_channel_quant_i8_, + py::arg("x"), + py::arg("x_packed"), + py::arg("x_scale"), + R"doc(Per-channel quantization of a tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/scaled_mm_i8.hpp 
b/src/infinicore/pybind11/ops/scaled_mm_i8.hpp new file mode 100644 index 000000000..c3d46d9df --- /dev/null +++ b/src/infinicore/pybind11/ops/scaled_mm_i8.hpp @@ -0,0 +1,30 @@ +#pragma once + +#include + +#include "infinicore/ops/scaled_mm_i8.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_scaled_mm_i8(py::module &m) { + m.def("scaled_mm_i8", + &op::scaled_mm_i8, + py::arg("a_p"), + py::arg("a_s"), + py::arg("b_p"), + py::arg("b_s"), + py::arg("bias"), + R"doc(Scaled matrix multiplication of two tensors.)doc"); + + m.def("scaled_mm_i8_", + &op::scaled_mm_i8_, + py::arg("a"), + py::arg("b"), + py::arg("a_scale"), + py::arg("b_scale"), + R"doc(In-place Scaled matrix multiplication of two tensors.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc index 0ae1946e3..b57b00a52 100644 --- a/src/infinicore/tensor/debug.cc +++ b/src/infinicore/tensor/debug.cc @@ -95,6 +95,20 @@ void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &st } } +// Function for printing I8 data +void print_data_i8(const int8_t *data, const Shape &shape, const Strides &strides, size_t dim) { + if (dim == shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + std::cout << static_cast(data[i * strides[dim]]) << " "; + } + std::cout << std::endl; + } else if (dim < shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + print_data_i8(data + i * strides[dim], shape, strides, dim + 1); + } + } +} + // Template function for writing data recursively to binary file (handles non-contiguous tensors) template void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) { @@ -181,8 +195,8 @@ void TensorImpl::debug(const std::string &filename) const { cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::I8: - print_data(reinterpret_cast(cpu_data), - cpu_tensor->shape(), cpu_tensor->strides(), 0); + print_data_i8(reinterpret_cast(cpu_data), + cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::BF16: print_data_bf16(reinterpret_cast(cpu_data), diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh new file mode 100644 index 000000000..3c014de9b --- /dev/null +++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh @@ -0,0 +1,273 @@ +#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__ +#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ + +#include +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? (int)r : -(int)r; +} + +template +__device__ void blockPerChannelQuantI8Kernel( + int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + // ---- 1. reduce max ---- + float local_max = op::common_cuda::reduce_op::max( + x + tid, K); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. 
reduce min ----
+    float thread_min = __FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_min = fminf(thread_min, (float)x[tid + ind]);
+    }
+#if CUDART_VERSION >= 12090
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, ::cuda::minimum());
+#else
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min());
+#endif
+    __shared__ float global_min_f;
+    if (threadIdx.x == 0) {
+        global_min_f = local_min;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+    float global_min = global_min_f;
+
+    float scale = (global_max - global_min) / 255.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+    float zero = -global_min * inv_scale - 128.0f;
+
+    x_scale[row] = (Tdata)scale;
+    x_zero[row] = (Tdata)zero;
+
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale + zero;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -128) {
+            q = -128;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <unsigned int BLOCK_SIZE, typename Tdata>
+__device__ void blockPerChannelQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    int tid = row * K;
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 1. reduce max of |x| ----
+    float thread_max = -__FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, fabs((float)x[tid + ind]));
+    }
+#if CUDART_VERSION >= 12090
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, ::cuda::maximum());
+#else
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max());
+#endif
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    float global_max = global_max_f;
+
+    float scale = global_max / 127.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+
+    x_scale[row] = (Tdata)scale;
+
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -127) {
+            q = -127;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <typename T>
+struct MaxOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
+    }
+};
+template <typename T>
+struct MinOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return min(a, b);
+    }
+};
+template
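
Reference note (outside the diff): the W8A8I8 path added above quantizes activations per row with a symmetric int8 scheme (scale = max|x| / 127, values rounded and clamped to [-127, 127], as in blockPerChannelQuantI8SymKernel), multiplies the int8 activations against the pre-quantized int8 weights, and rescales the int32 accumulator by the activation and weight scales before adding the optional bias. The NumPy sketch below only mirrors those semantics for illustration; it is not the library implementation, the helper names are invented here, and np.rint rounds half to even whereas the kernel rounds half away from zero.

    import numpy as np

    def per_channel_quant_i8(x):
        # Symmetric per-row int8 quantization: one scale per row of x.
        scale = np.maximum(np.abs(x).max(axis=1, keepdims=True) / 127.0, 1e-8)
        q = np.clip(np.rint(x / scale), -127, 127).astype(np.int8)
        return q, scale.astype(np.float32)

    def linear_w8a8i8_ref(x, w_packed, w_scale, bias=None):
        # x: [M, K] float, w_packed: [N, K] int8, w_scale: [N, 1] float32.
        x_q, x_scale = per_channel_quant_i8(x)                    # [M, K] int8, [M, 1] f32
        acc = x_q.astype(np.int32) @ w_packed.astype(np.int32).T  # [M, N] int32 accumulator
        out = acc.astype(np.float32) * x_scale * w_scale.reshape(1, -1)
        return out if bias is None else out + bias

Through the new binding, the corresponding call is infinicore.nn.functional.linear_w8a8i8(input, weight_packed, weight_scale, bias=None), or linear_w8a8i8_ with a preallocated out tensor.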