Support Quantization #996
Changes from all commits
45cfbeb
70b92ce
cbd0834
74c32c6
f50155f
47c7cfe
f5076c3
7eb78d1
581d4a4
c97c25f
2edb912
753fda0
2f0a0aa
0673da6
bf529d6
f09a805
db7b7b7
9e20d3c
e50af2f
75984cf
**`.gitmodules`**

```diff
@@ -1,3 +1,7 @@
 [submodule "third_party/spdlog"]
 	path = third_party/spdlog
 	url = https://github.com/gabime/spdlog.git
+[submodule "third_party/nlohmann_json"]
+	path = third_party/nlohmann_json
+	url = https://github.com/nlohmann/json.git
+	branch = master
```
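The nlohmann_json submodule is pulled in so the quantization classes later in this diff can parse the model's quantization config: `BaseQuantization` stores an `nlohmann::json` quant config and its subclasses are expected to read scheme details out of it.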
**New file — `dequantize_awq` op header** (`@@ -0,0 +1,10 @@`)

```cpp
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {
INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &);

void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros);
} // namespace infinicore::op
```
**Collaborator:** Should this support graph execution?
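For orientation on what `dequantize_awq_` computes: AWQ W4A16 stores weights as packed 4-bit integers plus per-group scales and packed 4-bit zero points, and dequantizes as `w = (q - z) * s`. Below is a minimal scalar sketch of that formula. The layouts, the group size, and the plain low-to-high nibble order inside each `int32` are assumptions for illustration, not necessarily what the kernel does (AWQ's real packing uses an interleaved nibble order).

```cpp
#include <cstdint>
#include <vector>

// Minimal scalar reference of the AWQ dequantization formula w = (q - z) * s.
// Layout assumptions (illustrative, not taken from this PR): weights packed
// row-major as [K, N/8] int32 words, scales [K/G, N], zeros packed [K/G, N/8],
// group size G along K, low-to-high nibble order inside each word.
std::vector<float> dequantize_awq_ref(const std::vector<int32_t> &w_packed,
                                      const std::vector<float> &w_scale,
                                      const std::vector<int32_t> &w_zeros,
                                      int K, int N, int G = 128) {
    std::vector<float> w(static_cast<size_t>(K) * N);
    for (int k = 0; k < K; ++k) {
        for (int n = 0; n < N; ++n) {
            const int g = k / G; // quantization group index along K
            const uint32_t w_word = static_cast<uint32_t>(w_packed[k * (N / 8) + n / 8]);
            const uint32_t z_word = static_cast<uint32_t>(w_zeros[g * (N / 8) + n / 8]);
            const int q = (w_word >> ((n % 8) * 4)) & 0xF; // 4-bit quantized weight
            const int z = (z_word >> ((n % 8) * 4)) & 0xF; // 4-bit zero point
            w[static_cast<size_t>(k) * N + n] = static_cast<float>(q - z) * w_scale[static_cast<size_t>(g) * N + n];
        }
    }
    return w;
}
```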
**New file — `linear_w4a16_awq` op header** (`@@ -0,0 +1,12 @@`)

```cpp
#pragma once

#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);

void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);

} // namespace infinicore::op
```
**Collaborator:** Should this support graph execution?
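As with the other new ops in this PR, the plain `linear_w4a16_awq` allocates and returns its output, while the trailing-underscore `linear_w4a16_awq_` writes into a caller-provided `out`. Semantically, both compute `input` times the dequantized AWQ weight (per the dequantization formula sketched above) plus the optional bias.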
**New file — `linear_w8a8i8` op header** (`@@ -0,0 +1,13 @@`)

```cpp
#pragma once

#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);

void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);

} // namespace infinicore::op
```
**Collaborator:** Should this support graph execution?
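Judging from the other ops in this diff, the intended W8A8 dataflow is: quantize the activations on the fly with `PerChannelQuantI8` (below), then run an int8 GEMM against the pre-quantized `weight_packed` and rescale the int32 accumulator with the activation and weight scales via `scaled_mm_i8_`, i.e. roughly `out ≈ (x_q · w_qᵀ) * (s_x ⊗ s_w) + bias`.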
**New file — `per_channel_quant_i8` op header** (`@@ -0,0 +1,12 @@`)

```cpp
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

INFINICORE_GRAPH_OP_CLASS(PerChannelQuantI8, const Tensor &, Tensor, Tensor);

void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale);
} // namespace infinicore::op
```
**Collaborator:** Should this support graph execution?
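A scalar sketch of what `per_channel_quant_i8_` plausibly computes. Symmetric quantization is an assumption based on this op emitting only `x_packed` and `x_scale`; note that the lower-level C API later in the diff also carries an `x_zero` output, so an asymmetric (zero-point) variant may exist as well.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference per-channel symmetric int8 quantization (an assumption):
// per row, scale = max|x| / 127 and q = clamp(round(x / scale), -127, 127).
void per_channel_quant_i8_ref(const std::vector<float> &x, // [rows, cols], row-major
                              std::vector<int8_t> &x_packed,
                              std::vector<float> &x_scale,
                              int rows, int cols) {
    x_packed.assign(x.size(), 0);
    x_scale.assign(rows, 1.0f);
    for (int r = 0; r < rows; ++r) {
        float amax = 0.0f;
        for (int c = 0; c < cols; ++c) {
            amax = std::max(amax, std::fabs(x[static_cast<size_t>(r) * cols + c]));
        }
        const float scale = amax > 0.0f ? amax / 127.0f : 1.0f;
        x_scale[r] = scale;
        for (int c = 0; c < cols; ++c) {
            const float q = std::nearbyint(x[static_cast<size_t>(r) * cols + c] / scale);
            x_packed[static_cast<size_t>(r) * cols + c] = static_cast<int8_t>(std::clamp(q, -127.0f, 127.0f));
        }
    }
}
```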
**New file — `scaled_mm_i8` (I8Gemm) op header** (`@@ -0,0 +1,13 @@`)

```cpp
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

INFINICORE_GRAPH_OP_CLASS(I8Gemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, std::optional<Tensor>);

void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias);
} // namespace infinicore::op
```
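A scalar reference for what `scaled_mm_i8_` plausibly computes: accumulate the int8 products in int32, then rescale with the two per-channel scale vectors. The layouts and scale shapes are assumptions for illustration.

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Scalar reference for a scaled int8 GEMM: c = (a_q · b_qᵀ) * (a_s ⊗ b_s) + bias.
// Shape assumptions (illustrative): a_p is [M, K] row-major, b_p is [N, K]
// row-major (weights stored transposed), a_s is [M], b_s is [N].
std::vector<float> scaled_mm_i8_ref(const std::vector<int8_t> &a_p, const std::vector<float> &a_s,
                                    const std::vector<int8_t> &b_p, const std::vector<float> &b_s,
                                    const std::optional<std::vector<float>> &bias,
                                    int M, int N, int K) {
    std::vector<float> c(static_cast<size_t>(M) * N);
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            int32_t acc = 0; // accumulate in int32 to avoid int8 overflow
            for (int k = 0; k < K; ++k) {
                acc += int32_t{a_p[static_cast<size_t>(m) * K + k]} * int32_t{b_p[static_cast<size_t>(n) * K + k]};
            }
            float v = static_cast<float>(acc) * a_s[m] * b_s[n];
            if (bias) {
                v += (*bias)[n];
            }
            c[static_cast<size_t>(m) * N + n] = v;
        }
    }
    return c;
}
```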
**New file — quantization umbrella header** (`@@ -0,0 +1,7 @@`)

```cpp
#pragma once

#include "quantization/awq.hpp"
#include "quantization/base_quantization.hpp"
#include "quantization/compressed_tensors.hpp"
#include "quantization/none_quantizaiton.hpp"
#include "quantization/quantization_scheme.hpp"
```
**New file — `quantization/awq.hpp`** (`@@ -0,0 +1,19 @@`)

```cpp
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {

class AWQ : public BaseQuantization {
    // This is a temporary class that currently only returns AWQ_W4A16.
    // Future enhancements should parse quant_config to extract detailed quantization
    // information and support multiple quantization schemes.
public:
    explicit AWQ(const nlohmann::json &quant_config)
        : BaseQuantization(quant_config) {};

    infinicore::quantization::QuantScheme
    get_quant_scheme() const override {
        return infinicore::quantization::QuantScheme::AWQ_W4A16;
    };
};

} // namespace infinicore::quantization
```
**New file — `quantization/base_quantization.hpp`** (`@@ -0,0 +1,17 @@`)

```cpp
#pragma once
#include "nlohmann/json.hpp"
#include "quantization_scheme.hpp"

namespace infinicore::quantization {
class BaseQuantization {
    // Base class for quantization schemes. Intended to be extended to support various quantization methods.
public:
    explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {};
    virtual ~BaseQuantization() = default;

    virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0;

protected:
    nlohmann::json quant_config_;
};
} // namespace infinicore::quantization
```
**New file — `quantization/compressed_tensors.hpp`** (`@@ -0,0 +1,20 @@`)

```cpp
#pragma once

#include "base_quantization.hpp"
namespace infinicore::quantization {

class CompressedTensors : public BaseQuantization {
    // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8.
    // Future enhancements should parse quant_config to extract detailed quantization
    // information and support multiple quantization schemes.
public:
    explicit CompressedTensors(const nlohmann::json &quant_config)
        : BaseQuantization(quant_config) {};

    infinicore::quantization::QuantScheme
    get_quant_scheme() const override {
        return infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8;
    };
};

} // namespace infinicore::quantization
```
**New file — `quantization/none_quantizaiton.hpp`** (`@@ -0,0 +1,20 @@`)

```cpp
#pragma once

#include "base_quantization.hpp"
namespace infinicore::quantization {

class NoneQuantization : public BaseQuantization {
    // This is a temporary class that currently only returns QuantScheme::NONE.
    // Future enhancements should parse quant_config to extract detailed quantization
    // information and support multiple quantization schemes.
public:
    explicit NoneQuantization(const nlohmann::json &quant_config)
        : BaseQuantization(quant_config) {};

    infinicore::quantization::QuantScheme
    get_quant_scheme() const override {
        return infinicore::quantization::QuantScheme::NONE;
    };
};

} // namespace infinicore::quantization
```
**New file — `quantization/quantization_scheme.hpp`** (`@@ -0,0 +1,12 @@`)

```cpp
// quantization_scheme.hpp
#pragma once

namespace infinicore::quantization {

enum class QuantScheme {
    NONE,
    COMPRESSED_TENSOR_W8A8I8,
    AWQ_W4A16,
};

} // namespace infinicore::quantization
```
**Collaborator:** Why does this enum get a file all to itself?

**Author:** More quantization algorithms will be added later; the plan is to put them all together under `quant`.
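A sketch of how a loader might pick the scheme class from a HuggingFace-style quantization config, whose `quant_method` field is typically `"awq"` or `"compressed-tensors"`. The factory itself is not part of this diff; it is an assumption about intended use of the classes above.

```cpp
#include <memory>
#include <string>

#include "nlohmann/json.hpp"
// Plus the quantization headers shown above.

// Hypothetical factory (not part of this PR): map quant_method to a scheme class.
std::unique_ptr<infinicore::quantization::BaseQuantization>
make_quantization(const nlohmann::json &quant_config) {
    using namespace infinicore::quantization;
    if (quant_config.is_null()) {
        return std::make_unique<NoneQuantization>(quant_config);
    }
    const std::string method = quant_config.value("quant_method", std::string{});
    if (method == "awq") {
        return std::make_unique<AWQ>(quant_config);
    }
    if (method == "compressed-tensors") {
        return std::make_unique<CompressedTensors>(quant_config);
    }
    return std::make_unique<NoneQuantization>(quant_config);
}
```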
**New file — infiniop C API for per-channel int8 quantization** (`@@ -0,0 +1,28 @@`)

```c
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
                                                                      infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
                                                                      infiniopTensorDescriptor_t x_packed_desc,
                                                                      infiniopTensorDescriptor_t x_scale_desc,
                                                                      infiniopTensorDescriptor_t x_zero_desc,
                                                                      infiniopTensorDescriptor_t x_desc);

__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
                                                      void *workspace,
                                                      size_t workspace_size,
                                                      void *x_packed,
                                                      void *x_scale,
                                                      void *x_zero,
                                                      const void *x,
                                                      void *stream);

__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);

#endif
```
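The usual descriptor lifecycle for this API, sketched below: create the descriptor from tensor descriptors, query the workspace size, run, destroy. The handle, tensor descriptors, device buffers, and stream are assumed to exist already; `CHECK` and `device_malloc` are hypothetical helpers, not part of this PR.

```cpp
// Sketch only: CHECK is assumed to test the returned infiniStatus_t against
// the success code and bail out on failure; device_malloc is a placeholder
// for whatever device allocator the caller uses.
infiniopPerChannelQuantI8Descriptor_t desc;
CHECK(infiniopCreatePerChannelQuantI8Descriptor(handle, &desc,
                                                x_packed_desc, x_scale_desc,
                                                x_zero_desc, x_desc));

size_t workspace_size = 0;
CHECK(infiniopGetPerChannelQuantI8WorkspaceSize(desc, &workspace_size));
void *workspace = device_malloc(workspace_size); // hypothetical allocator

CHECK(infiniopPerChannelQuantI8(desc, workspace, workspace_size,
                                x_packed, x_scale, x_zero, x, stream));

CHECK(infiniopDestroyPerChannelQuantI8Descriptor(desc));
```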
**New file — Python binding for `linear_w8a8i8`** (`@@ -0,0 +1,31 @@`)

```python
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""

    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )

    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
```