Merged
4 changes: 4 additions & 0 deletions .gitmodules
@@ -1,3 +1,7 @@
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "third_party/nlohmann_json"]
path = third_party/nlohmann_json
url = https://github.com/nlohmann/json.git
branch = master
1 change: 1 addition & 0 deletions include/infinicore.hpp
@@ -3,4 +3,5 @@
#include "infinicore/device_event.hpp"
#include "infinicore/nn.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/quantization.hpp"
#include "infinicore/tensor.hpp"
22 changes: 22 additions & 0 deletions include/infinicore/nn/linear.hpp
@@ -1,8 +1,10 @@
#pragma once

#include "../ops.hpp"
#include "../quantization.hpp"
#include "module.hpp"
#include <infiniccl.h>
#include <optional>

namespace infinicore::nn {

@@ -11,6 +13,9 @@ class BaseLinear : public Module {
BaseLinear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());

BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());

// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;

@@ -27,12 +32,17 @@ class BaseLinear : public Module {
// Accessors for parameters
Tensor weight() const { return weight_; }
Tensor bias() const { return bias_; }
Tensor weight_scale() const { return weight_scale_; }
Tensor weight_zeros() const { return weight_zeros_; }

protected:
// Parameters
INFINICORE_NN_PARAMETER(weight);
INFINICORE_NN_PARAMETER(bias);

INFINICORE_NN_PARAMETER(weight_scale);
INFINICORE_NN_PARAMETER(weight_zeros);

protected:
// Helper method for common forward computation
Tensor compute_linear(Tensor &input) const;
@@ -41,6 +51,7 @@ class BaseLinear : public Module {
size_t out_features_;
bool has_bias_;
DataType dtype_;
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
};

} // namespace infinicore::nn
@@ -52,6 +63,9 @@ class Linear : public BaseLinear {
Linear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());

Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());

// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;

@@ -65,6 +79,10 @@ class ColumnParallelLinear : public BaseLinear {
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);

ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);

// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;

@@ -82,6 +100,10 @@ class RowParallelLinear : public BaseLinear {
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);

RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);

// Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const;

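
Below is a minimal usage sketch, not part of the diff, showing how the new quantization-aware constructors might be called; the layer names and sizes are placeholders, and only the signatures shown above plus the umbrella header infinicore.hpp are assumed.

#include "infinicore.hpp"

#include <memory>
#include <nlohmann/json.hpp>

void build_layers(const nlohmann::json &quant_config) {
    using namespace infinicore;

    // AWQ W4A16: the quantization object records which scheme the layer
    // should use (via get_quant_scheme()).
    auto awq = std::make_shared<quantization::AWQ>(quant_config);
    nn::Linear qkv_proj(4096, 4096, awq, /*bias=*/true, DataType::F32, Device());

    // Without a quantization argument, BaseLinear defaults to NoneQuantization
    // and the layer behaves like the existing unquantized Linear.
    nn::Linear lm_head(4096, 32000, /*bias=*/false);
}
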
10 changes: 10 additions & 0 deletions include/infinicore/ops/dequantize_awq.hpp
@@ -0,0 +1,10 @@
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {
INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &);

void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros);
} // namespace infinicore::op
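
As a hedged illustration, the in-place op can be wrapped as follows, assuming the output tensor is preallocated with the unpacked weight shape and a floating-point dtype; the helper name is hypothetical.

#include "infinicore.hpp"

// Sketch: unpack AWQ int4 weights into a preallocated full-precision tensor.
// `w_full` is assumed to already have the unpacked shape and a floating-point
// dtype; the other three tensors come from an AWQ checkpoint.
void unpack_awq_weight(infinicore::Tensor w_full,
                       const infinicore::Tensor &w_packed,
                       const infinicore::Tensor &w_scale,
                       const infinicore::Tensor &w_zeros) {
    infinicore::op::dequantize_awq_(w_full, w_packed, w_scale, w_zeros);
}
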
12 changes: 12 additions & 0 deletions include/infinicore/ops/linear_w4a16_awq.hpp
Collaborator: Should this also support the graph interface?
@@ -0,0 +1,12 @@
#pragma once

#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);

void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros, std::optional<Tensor> bias);

} // namespace infinicore::op
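
For orientation, a sketch of calling the out-of-place op with the signature above; all tensors are assumed to be loaded elsewhere, std::nullopt stands in for a missing bias, and the helper name is illustrative.

#include "infinicore.hpp"

#include <optional>

// Sketch: output = input @ dequant(weight_packed, weight_scale, weight_zeros).T + bias,
// with floating-point activations and int4-packed AWQ weights.
infinicore::Tensor awq_linear(infinicore::Tensor input,
                              infinicore::Tensor weight_packed,
                              infinicore::Tensor weight_scale,
                              infinicore::Tensor weight_zeros,
                              std::optional<infinicore::Tensor> bias = std::nullopt) {
    // The in-place variant linear_w4a16_awq_ instead writes into a
    // caller-provided `out` tensor.
    return infinicore::op::linear_w4a16_awq(input, weight_packed, weight_scale,
                                            weight_zeros, bias);
}
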
13 changes: 13 additions & 0 deletions include/infinicore/ops/linear_w8a8i8.hpp
Collaborator: Should this also support the graph interface?
@@ -0,0 +1,13 @@
#pragma once

#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);

void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias);

} // namespace infinicore::op
12 changes: 12 additions & 0 deletions include/infinicore/ops/per_channel_quant_i8.hpp
Collaborator: Should this also support the graph interface?
@@ -0,0 +1,12 @@
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

INFINICORE_GRAPH_OP_CLASS(PerChannelQuantI8, const Tensor &, Tensor, Tensor);

void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale);
} // namespace infinicore::op
13 changes: 13 additions & 0 deletions include/infinicore/ops/scaled_mm_i8.hpp
Collaborator: Should this also support the graph interface?
@@ -0,0 +1,13 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

INFINICORE_GRAPH_OP_CLASS(I8Gemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, std::optional<Tensor>);

void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias);
} // namespace infinicore::op
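
These two ops compose naturally into a dynamic W8A8 path: quantize activations to int8 with per-channel scales, then run the scaled int8 GEMM against offline-quantized weights. The sketch below is an assumption about how such a pipeline could be wired, not necessarily how linear_w8a8i8 is implemented.

#include "infinicore.hpp"

#include <optional>

// Sketch of a W8A8 int8 matmul pipeline. `a` is the floating-point activation
// tensor; `a_packed`/`a_scale` are preallocated int8/scale buffers;
// `b_packed`/`b_scale` come from offline weight quantization; `c` is the
// preallocated output.
void w8a8_matmul(infinicore::Tensor c,
                 const infinicore::Tensor &a,
                 infinicore::Tensor a_packed,
                 infinicore::Tensor a_scale,
                 const infinicore::Tensor &b_packed,
                 const infinicore::Tensor &b_scale,
                 std::optional<infinicore::Tensor> bias) {
    // 1) Dynamic activation quantization: int8 values plus per-channel scales.
    infinicore::op::per_channel_quant_i8_(a, a_packed, a_scale);
    // 2) Int8 GEMM, rescaled by the activation and weight scales, plus bias.
    infinicore::op::scaled_mm_i8_(c, a_packed, a_scale, b_packed, b_scale, bias);
}
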
7 changes: 7 additions & 0 deletions include/infinicore/quantization.hpp
@@ -0,0 +1,7 @@
#pragma once

#include "quantization/awq.hpp"
#include "quantization/base_quantization.hpp"
#include "quantization/compressed_tensors.hpp"
#include "quantization/none_quantizaiton.hpp"
#include "quantization/quantization_scheme.hpp"
19 changes: 19 additions & 0 deletions include/infinicore/quantization/awq.hpp
@@ -0,0 +1,19 @@
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {

class AWQ : public BaseQuantization {
// This is a temporary class that currently only returns AWQ_W4A16.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit AWQ(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};

infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::AWQ_W4A16;
};
};

} // namespace infinicore::quantization
17 changes: 17 additions & 0 deletions include/infinicore/quantization/base_quantization.hpp
@@ -0,0 +1,17 @@
#pragma once
#include "nlohmann/json.hpp"
#include "quantization_scheme.hpp"

namespace infinicore::quantization {
class BaseQuantization {
// Base class for quantization schemes. Intended to be extended to support various quantization methods.
public:
explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {};
virtual ~BaseQuantization() = default;

virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0;

protected:
nlohmann::json quant_config_;
};
} // namespace infinicore::quantization
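
A hedged sketch of how a loader could map a model's quantization config onto these classes; the "quant_method" key and its values are borrowed from Hugging Face-style configs and are assumptions, since nothing in this PR parses the config yet.

#include "infinicore.hpp"

#include <memory>
#include <nlohmann/json.hpp>
#include <string>

// Hypothetical factory mapping a quantization config onto the scheme classes
// added in this PR. The "quant_method" key is an assumption.
std::shared_ptr<infinicore::quantization::BaseQuantization>
make_quantization(const nlohmann::json &quant_config) {
    using namespace infinicore::quantization;

    if (quant_config.is_null()) {
        return std::make_shared<NoneQuantization>(quant_config);
    }
    const std::string method = quant_config.value("quant_method", std::string{});
    if (method == "awq") {
        return std::make_shared<AWQ>(quant_config); // QuantScheme::AWQ_W4A16
    }
    if (method == "compressed-tensors") {
        return std::make_shared<CompressedTensors>(quant_config); // COMPRESSED_TENSOR_W8A8I8
    }
    return std::make_shared<NoneQuantization>(quant_config); // QuantScheme::NONE
}
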
20 changes: 20 additions & 0 deletions include/infinicore/quantization/compressed_tensors.hpp
@@ -0,0 +1,20 @@
#pragma once

#include "base_quantization.hpp"
namespace infinicore::quantization {

class CompressedTensors : public BaseQuantization {
// This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit CompressedTensors(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};

infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8;
};
};

} // namespace infinicore::quantization
20 changes: 20 additions & 0 deletions include/infinicore/quantization/none_quantizaiton.hpp
@@ -0,0 +1,20 @@
#pragma once

#include "base_quantization.hpp"
namespace infinicore::quantization {

class NoneQuantization : public BaseQuantization {
// This is a placeholder class for the unquantized case: it always returns
// QuantScheme::NONE and is used as the default quantization in BaseLinear.
public:
explicit NoneQuantization(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};

infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::NONE;
};
};

} // namespace infinicore::quantization
12 changes: 12 additions & 0 deletions include/infinicore/quantization/quantization_scheme.hpp
@@ -0,0 +1,12 @@
// quantization_scheme.hpp
#pragma once

namespace infinicore::quantization {

enum class QuantScheme {
NONE,
COMPRESSED_TENSOR_W8A8I8,
AWQ_W4A16,
};

} // namespace infinicore::quantization
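
The enum is presumably what quantized layers branch on when picking a kernel; below is a minimal dispatch sketch, assumed rather than taken from the PR, with op names matching the headers above.

#include "infinicore.hpp"

// Illustrative only: choose a kernel path from the configured scheme.
const char *kernel_for(const infinicore::quantization::BaseQuantization &q) {
    using infinicore::quantization::QuantScheme;
    switch (q.get_quant_scheme()) {
    case QuantScheme::AWQ_W4A16:
        return "linear_w4a16_awq";
    case QuantScheme::COMPRESSED_TENSOR_W8A8I8:
        return "linear_w8a8i8";
    case QuantScheme::NONE:
    default:
        return "linear"; // plain floating-point GEMM path
    }
}
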
2 changes: 2 additions & 0 deletions include/infiniop.h
@@ -13,6 +14,7 @@
#include "infiniop/ops/flash_attention.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/int8_gemm.h"
#include "infiniop/ops/kv_caching.h"
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/logsoftmax.h"
@@ -22,6 +23,7 @@
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
28 changes: 28 additions & 0 deletions include/infiniop/ops/quant/per_channel_quant_int8.h
Collaborator: Why does this one get its own subdirectory?

Collaborator (Author): More quantization algorithms will be added later; the plan is to put them all under quant/.
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);

#endif
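
At the infiniop level, the new API follows the usual descriptor lifecycle (create, query workspace, launch, destroy). The sketch below assumes the handle, tensor descriptors, and device buffers are prepared elsewhere, and that INFINI_STATUS_SUCCESS is the success code used by the rest of the API.

#include "infiniop.h"

#include <cstddef>

// Sketch: per-channel int8 quantization of x into x_packed/x_scale/x_zero.
infiniStatus_t run_per_channel_quant_i8(infiniopHandle_t handle,
                                        infiniopTensorDescriptor_t x_packed_desc,
                                        infiniopTensorDescriptor_t x_scale_desc,
                                        infiniopTensorDescriptor_t x_zero_desc,
                                        infiniopTensorDescriptor_t x_desc,
                                        void *x_packed, void *x_scale, void *x_zero,
                                        const void *x,
                                        void *workspace, size_t workspace_size,
                                        void *stream) {
    infiniopPerChannelQuantI8Descriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreatePerChannelQuantI8Descriptor(
        handle, &desc, x_packed_desc, x_scale_desc, x_zero_desc, x_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // Callers are expected to size `workspace` from this query before launch.
    size_t required = 0;
    status = infiniopGetPerChannelQuantI8WorkspaceSize(desc, &required);
    if (status == INFINI_STATUS_SUCCESS) {
        status = infiniopPerChannelQuantI8(desc, workspace, workspace_size,
                                           x_packed, x_scale, x_zero, x, stream);
    }

    infiniopDestroyPerChannelQuantI8Descriptor(desc);
    return status;
}
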
2 changes: 2 additions & 0 deletions python/infinicore/nn/functional/__init__.py
@@ -2,6 +2,7 @@
from .embedding import embedding
from .flash_attention import flash_attention
from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample
from .rms_norm import rms_norm
from .rope import RopeAlgo, rope
@@ -19,4 +20,5 @@
"rope",
"silu",
"swiglu",
"linear_w8a8i8",
]
31 changes: 31 additions & 0 deletions python/infinicore/nn/functional/linear_w8a8i8.py
@@ -0,0 +1,31 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
input: Tensor,
weight_packed: Tensor,
weight_scale: Tensor,
bias=None,
out=None,
) -> Tensor:
r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""

if out is None:
return Tensor(
_infinicore.linear_w8a8i8(
input._underlying,
weight_packed._underlying,
weight_scale._underlying,
None if bias is None else bias._underlying,
)
)

_infinicore.linear_w8a8i8_(
out._underlying,
input._underlying,
weight_packed._underlying,
weight_scale._underlying,
None if bias is None else bias._underlying,
)
return out