From 0c9cf425d2d2a49c4d9d3c63d8b22ca8dc76d11b Mon Sep 17 00:00:00 2001 From: alanzhai219 Date: Mon, 11 May 2026 16:32:23 +0800 Subject: [PATCH 1/2] x86: add descriptions 1. add readme 2. add test_abs --- README.md | 123 ++++++++++++++++++++ cmake/x86.cmake | 47 ++++---- test/test_abs.cpp | 287 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 431 insertions(+), 26 deletions(-) create mode 100644 README.md create mode 100644 test/test_abs.cpp diff --git a/README.md b/README.md new file mode 100644 index 0000000..680714e --- /dev/null +++ b/README.md @@ -0,0 +1,123 @@ +# ppl.kernel.cpu + +`ppl.kernel.cpu` 是 OpenPPL 的 CPU kernel 仓库,包含 x86、ARM、RISC-V 等后端的算子实现、头文件和测试程序。 + +常用目录: + +- `include/`: 对外头文件 +- `src/`: kernel 实现 +- `test/`: 测试与 benchmark +- `cmake/`: 各架构构建脚本 + +## 编译 + +x86 默认不会自动打开,推荐使用单独构建目录: + +```bash +cd //ppl.kernel.cpu +mkdir -p build +cd build +cmake -DPPLNN_USE_X86_64=ON -DPPLNN_BUILD_TESTS=ON -DPPLNNX86Kernel_HOLD_DEPS=ON .. +cmake --build . -j4 +``` + +只编译某个测试目标: + +```bash +cmake --build . --target test_abs -j4 +``` + +常用开关: + +- `PPLNN_USE_X86_64=ON`: 打开 x86 kernel +- `PPLNN_BUILD_TESTS=ON`: 生成测试目标 +- `PPLNNX86Kernel_HOLD_DEPS=ON`: 冻结当前依赖 +- `PPLNN_USE_OPENMP=ON`: 打开 OpenMP,多线程 benchmark 需要它 + +## 如何新加算子 + +以 x86 fp32 为例,推荐沿用这组结构: + +- `include/ppl/kernel/x86/fp32/my_op.h` +- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32.cpp` +- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32_sse.cpp` +- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32_avx.cpp` +- `test/test_my_op.cpp` + +约定: + +- `*_fp32.cpp`: 参考实现和 ISA 分发 +- `*_sse.cpp` / `*_avx.cpp` / `*_fma.cpp` / `*_avx512.cpp`: ISA 特化实现 +- 统一入口先判断 `isa`,再调用对应实现,否则退回 `ref` + +x86 源文件在 `cmake/x86.cmake` 中通过 `GLOB_RECURSE` 自动收集,通常不需要手动登记源码文件。 + +## 如何测试算子 + +当前仓库是“一个测试目标对应一个可执行文件”,不是 gtest 风格。 + +新增测试时: + +1. 在 `test/` 下增加 `test_my_op.cpp` +2. 在 `cmake/x86.cmake` 的 `PPLKERNELX86_TESTS` 里追加 `test_my_op` +3. 编译并运行 + +```bash +cd //ppl.kernel.cpu/build +cmake --build . --target test_my_op -j4 +./test_my_op +``` + +注意:`test/` 里的程序使用 `simple_flags`,参数是单横线形式,例如 `-isa=avx`。 + +`test_abs` 的 correctness 用法: + +```bash +./test_abs +./test_abs -isa=noarch +./test_abs -isa=sse +./test_abs -isa=avx +``` + +## 如何测试性能 + +现有 `test_gemm`、`test_conv2d`、`test_pd_conv2d` 本身就带 benchmark 参数,例如: + +```bash +./test_gemm -cfg=/path/to/gemm.cfg -isa=fma -warm_up=5 -min_iter=50 -min_second=2 +./test_conv2d -cfg=/path/to/conv.cfg -algo=n16cx_gemm_direct_fp32_fma -warm_up=5 -min_iter=20 -min_second=2 +``` + +`test_abs` 现在也支持 benchmark 模式: + +```bash +./test_abs -benchmark=true -validate=false -isa=avx -len=16777216 -warm_up=5 -min_iter=50 -min_second=2 -num_threads=1 +``` + +如果构建时打开了 OpenMP,还可以控制线程数和绑核: + +```bash +./test_abs -benchmark=true -validate=false -isa=avx -len=16777216 -warm_up=5 -min_iter=50 -min_second=2 -num_threads=8 -core_bind=true +``` + +`test_abs` benchmark 常用参数: + +- `-benchmark`: 打开 benchmark 模式 +- `-validate`: benchmark 前先做一次正确性校验 +- `-isa`: `noarch`、`sse`、`avx` 或 `auto` +- `-len`: 输入元素个数 +- `-warm_up`: 预热次数 +- `-min_iter`: 最少迭代次数 +- `-min_second`: 最短测试时长 +- `-num_threads`: OpenMP 线程数 +- `-core_bind`: 是否绑核 + +输出字段包括 `min_ms`、`avg_ms`、`max_gbps`、`avg_gbps` 和 `num_threads`。 + +对 `abs` 这类简单 unary 算子,更建议看 `GB/s` 而不是 `GFLOPS`。 + +如需补充底层性能指标,可以直接配合 `perf stat`: + +```bash +perf stat ./test_abs -isa=avx +``` diff --git a/cmake/x86.cmake b/cmake/x86.cmake index adee543..beb1ecb 100644 --- a/cmake/x86.cmake +++ b/cmake/x86.cmake @@ -113,32 +113,27 @@ endif() if(PPLNN_BUILD_TESTS) set(__PPLNN_TOOLS_DIR__ ${CMAKE_CURRENT_SOURCE_DIR}/test) - add_executable(test_conv2d test/test_conv2d.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc) - target_include_directories(test_conv2d - PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} - PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES}) - target_compile_options(test_conv2d PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS}) - target_compile_definitions(test_conv2d PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS}) - target_compile_features(test_conv2d PRIVATE cxx_std_11) - target_link_libraries(test_conv2d PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES}) - - add_executable(test_gemm test/test_gemm.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc) - target_include_directories(test_gemm - PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} ${PPLKERNELX86_INCLUDE_DIRECTORIES} - PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES}) - target_compile_options(test_gemm PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS}) - target_compile_definitions(test_gemm PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS}) - target_compile_features(test_gemm PRIVATE cxx_std_11) - target_link_libraries(test_gemm PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES}) - - add_executable(test_pd_conv2d test/test_pd_conv2d.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc) - target_include_directories(test_pd_conv2d - PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} ${PPLKERNELX86_INCLUDE_DIRECTORIES} - PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES}) - target_compile_options(test_pd_conv2d PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS}) - target_compile_definitions(test_pd_conv2d PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS}) - target_compile_features(test_pd_conv2d PRIVATE cxx_std_11) - target_link_libraries(test_pd_conv2d PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES}) + function(pplkernelx86_add_test target_name) + add_executable(${target_name} test/${target_name}.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc) + target_include_directories(${target_name} + PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} + PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES}) + target_compile_options(${target_name} PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS}) + target_compile_definitions(${target_name} PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS}) + target_compile_features(${target_name} PRIVATE cxx_std_11) + target_link_libraries(${target_name} PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES}) + endfunction() + + set(PPLKERNELX86_TESTS) + list(APPEND PPLKERNELX86_TESTS + test_abs + test_conv2d + test_gemm + test_pd_conv2d) + + foreach(test_name IN LISTS PPLKERNELX86_TESTS) + pplkernelx86_add_test(${test_name}) + endforeach() unset(__PPLNN_TOOLS_DIR__) endif() diff --git a/test/test_abs.cpp b/test/test_abs.cpp new file mode 100644 index 0000000..4a01c1d --- /dev/null +++ b/test/test_abs.cpp @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) && defined(PPL_USE_X86_OMP) +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#endif + +#include "ppl/kernel/x86/fp32/abs.h" +#include "ppl/common/tensor_shape.h" +#include "simple_flags.h" + +Define_bool_opt("--help", Flag_help, false, "show these help information"); +Define_string(isa, "auto", "(auto) noarch, sse, avx, auto"); +Define_bool(benchmark, false, "(false) run benchmark instead of correctness-only test"); +Define_bool(validate, true, "(true) validate output before benchmark loop"); +Define_int64(len, 16 * 1024 * 1024, "(16777216) benchmark tensor length"); +Define_int32(warm_up, 5, "(5) benchmark warm up iterations"); +Define_int32(min_iter, 50, "(50) minimum benchmark iterations"); +Define_float(min_second, 1.0f, "(1.0) minimum benchmark duration in seconds"); +Define_int32(seed, 20260511, "(20260511) random seed for benchmark input"); +Define_int32(num_threads, 1, "(1) number of threads for benchmark when OpenMP is enabled"); +Define_bool(core_bind, false, "(false) bind OpenMP worker threads to cores when OpenMP is enabled"); + +static bool equal_value(const float dst, const float ref) +{ + if (std::isnan(dst) || std::isnan(ref)) { + return std::isnan(dst) && std::isnan(ref); + } + if (std::isinf(dst) || std::isinf(ref)) { + return dst == ref; + } + if (dst == 0.0f && ref == 0.0f) { + return std::signbit(dst) == std::signbit(ref); + } + return dst == ref; +} + +static bool test_one_case( + const std::vector &src, + const ppl::common::isa_t isa, + const std::string &isa_name, + const std::string &case_name) +{ + ppl::common::TensorShape shape; + shape.Reshape({(int64_t)src.size()}); + + std::vector dst(src.size(), 0.0f); + std::vector ref(src.size(), 0.0f); + + auto rc_ref = ppl::kernel::x86::abs_fp32_ref(&shape, src.data(), ref.data()); + auto rc = ppl::kernel::x86::abs_fp32(isa, &shape, src.data(), dst.data()); + if (rc_ref != ppl::common::RC_SUCCESS || rc != ppl::common::RC_SUCCESS) { + std::cerr << "run failed, isa=" << isa_name << " case=" << case_name << "\n"; + return false; + } + + for (size_t i = 0; i < src.size(); ++i) { + if (!equal_value(dst[i], ref[i])) { + std::cerr << "mismatch, isa=" << isa_name + << ", case=" << case_name + << ", idx=" << i + << ", src=" << src[i] + << ", dst=" << dst[i] + << ", ref=" << ref[i] << "\n"; + return false; + } + } + return true; +} + +static bool test_corner_cases(const ppl::common::isa_t isa, const std::string &isa_name) +{ + const float inf = std::numeric_limits::infinity(); + const std::vector> cases = { + {-0.0f}, + {0.0f}, + {-1.0f}, + {1.0f}, + {-0.0f, 0.0f, -1.0f, 1.0f}, + {-inf, inf, -123.5f, 456.25f}, + {-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f, -7.0f, 8.0f, + -9.0f, 10.0f, -11.0f, 12.0f, -13.0f, 14.0f, -15.0f, 16.0f, -17.0f} + }; + + for (size_t i = 0; i < cases.size(); ++i) { + if (!test_one_case(cases[i], isa, isa_name, "corner_" + std::to_string(i))) { + return false; + } + } + return true; +} + +static bool test_random_cases(const ppl::common::isa_t isa, const std::string &isa_name) +{ + std::mt19937 rng(20260511); + std::uniform_real_distribution dist(-1000.0f, 1000.0f); + const std::vector lens = {1, 2, 3, 4, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65}; + + for (size_t case_idx = 0; case_idx < lens.size(); ++case_idx) { + const int64_t len = lens[case_idx]; + std::vector src(len); + for (int64_t i = 0; i < len; ++i) { + src[i] = dist(rng); + } + if (!test_one_case(src, isa, isa_name, "random_" + std::to_string(len))) { + return false; + } + } + return true; +} + +static bool benchmark_abs(const ppl::common::isa_t isa, const std::string &isa_name) +{ + if (Flag_len <= 0) { + std::cerr << "invalid len: " << Flag_len << "\n"; + return false; + } + if (Flag_num_threads <= 0) { + std::cerr << "invalid num_threads: " << Flag_num_threads << "\n"; + return false; + } + + int32_t num_threads = 1; +#if defined(__linux__) && defined(PPL_USE_X86_OMP) + omp_set_num_threads(Flag_num_threads); + num_threads = Flag_num_threads; + if (Flag_core_bind) { +#pragma omp parallel + { +#define handle_error_en(en, msg) do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0) + int i = omp_get_thread_num(); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(i, &cpuset); + if (int s = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) != 0) { + handle_error_en(s, "pthread_setaffinity_np"); + } +#undef handle_error_en + } + } +#else + if (Flag_num_threads != 1 || Flag_core_bind) { + std::cerr << "warning: thread control requires OpenMP build support; using single-thread benchmark\n"; + } +#endif + + ppl::common::TensorShape shape; + shape.Reshape({Flag_len}); + + std::mt19937 rng(Flag_seed); + std::uniform_real_distribution dist(-1000.0f, 1000.0f); + std::vector src(Flag_len); + std::vector dst(Flag_len, 0.0f); + std::vector ref; + for (int64_t i = 0; i < Flag_len; ++i) { + src[i] = dist(rng); + } + + if (Flag_validate) { + ref.resize(Flag_len, 0.0f); + auto rc_ref = ppl::kernel::x86::abs_fp32_ref(&shape, src.data(), ref.data()); + auto rc = ppl::kernel::x86::abs_fp32(isa, &shape, src.data(), dst.data()); + if (rc_ref != ppl::common::RC_SUCCESS || rc != ppl::common::RC_SUCCESS) { + std::cerr << "benchmark validation run failed, isa=" << isa_name << "\n"; + return false; + } + for (int64_t i = 0; i < Flag_len; ++i) { + if (!equal_value(dst[i], ref[i])) { + std::cerr << "benchmark validation mismatch, isa=" << isa_name + << ", idx=" << i + << ", src=" << src[i] + << ", dst=" << dst[i] + << ", ref=" << ref[i] << "\n"; + return false; + } + } + } + + for (int32_t i = 0; i < Flag_warm_up; ++i) { + auto rc = ppl::kernel::x86::abs_fp32(isa, &shape, src.data(), dst.data()); + if (rc != ppl::common::RC_SUCCESS) { + std::cerr << "warm up failed, isa=" << isa_name << "\n"; + return false; + } + } + + double total_us = 0.0; + double min_us = DBL_MAX; + int32_t iter = 0; + for (; iter < Flag_min_iter || total_us < Flag_min_second * 1e6; ++iter) { + auto begin = std::chrono::high_resolution_clock::now(); + auto rc = ppl::kernel::x86::abs_fp32(isa, &shape, src.data(), dst.data()); + auto end = std::chrono::high_resolution_clock::now(); + if (rc != ppl::common::RC_SUCCESS) { + std::cerr << "benchmark run failed, isa=" << isa_name << "\n"; + return false; + } + const double elapsed_us = std::chrono::duration_cast>(end - begin).count(); + total_us += elapsed_us; + min_us = std::min(min_us, elapsed_us); + } + + const double avg_us = total_us / iter; + const double bytes = static_cast(Flag_len) * sizeof(float) * 2.0; + const double max_gbps = bytes / min_us / 1e3; + const double avg_gbps = bytes / avg_us / 1e3; + + std::cout << "%isa,len,num_threads,min_ms,max_gbps,avg_ms,avg_gbps,iters\n"; + std::cout << isa_name << "," + << Flag_len << "," + << num_threads << "," + << min_us / 1e3 << "," + << max_gbps << "," + << avg_us / 1e3 << "," + << avg_gbps << "," + << iter << std::endl; + return true; +} + +int main(int argc, char **argv) +{ + simple_flags::parse_args(argc, argv); + if (Flag_help) { + simple_flags::print_args_info(); + return 0; + } + + std::map isa_map = { + {"noarch", ppl::common::ISA_UNKNOWN}, + {"sse", ppl::common::ISA_X86_SSE}, + {"avx", ppl::common::ISA_X86_AVX}, + }; + + ppl::common::isa_t isa = ppl::common::ISA_UNKNOWN; + if (Flag_isa == "auto") { + isa = ppl::common::GetCpuISA(); + } else { + auto it = isa_map.find(Flag_isa); + if (it == isa_map.end()) { + std::cerr << "unsupported isa: " << Flag_isa << "\n"; + return -1; + } + isa = it->second; + } + + if (Flag_benchmark) { + return benchmark_abs(isa, Flag_isa) ? 0 : -1; + } + + if (!test_corner_cases(isa, Flag_isa)) { + return -1; + } + if (!test_random_cases(isa, Flag_isa)) { + return -1; + } + + std::cout << "test_abs pass" << std::endl; + return 0; +} From 579f436aa2ec42d08ccef975389194105fa68c61 Mon Sep 17 00:00:00 2001 From: alanzhai219 Date: Tue, 12 May 2026 15:45:39 +0800 Subject: [PATCH 2/2] fix: x86: fix the abs avx2/sse error --- src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp | 6 +++--- src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp b/src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp index 92b62ea..a927c46 100644 --- a/src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp +++ b/src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp @@ -37,8 +37,8 @@ ppl::common::RetCode abs_fp32_avx( for (int64_t i = 0; i < unroll_body; i += unroll_n) { __m256 src0 = _mm256_loadu_ps(x + i + 0 * V_REG_ELTS); __m256 src1 = _mm256_loadu_ps(x + i + 1 * V_REG_ELTS); - __m256 dst0 = _mm256_andnot_ps(src0, vsignbit); - __m256 dst1 = _mm256_andnot_ps(src1, vsignbit); + __m256 dst0 = _mm256_andnot_ps(vsignbit, src0); + __m256 dst1 = _mm256_andnot_ps(vsignbit, src1); _mm256_storeu_ps(y + i + 0 * V_REG_ELTS, dst0); _mm256_storeu_ps(y + i + 1 * V_REG_ELTS, dst1); } @@ -49,4 +49,4 @@ ppl::common::RetCode abs_fp32_avx( return ppl::common::RC_SUCCESS; } -}}}; // namespace ppl::kernel::x86 \ No newline at end of file +}}}; // namespace ppl::kernel::x86 diff --git a/src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp b/src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp index 1132d72..32e9125 100644 --- a/src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp +++ b/src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp @@ -39,10 +39,10 @@ ppl::common::RetCode abs_fp32_sse( __m128 src1 = _mm_loadu_ps(x + i + 1 * V_REG_ELTS); __m128 src2 = _mm_loadu_ps(x + i + 2 * V_REG_ELTS); __m128 src3 = _mm_loadu_ps(x + i + 3 * V_REG_ELTS); - __m128 dst0 = _mm_andnot_ps(src0, vsignbit); - __m128 dst1 = _mm_andnot_ps(src1, vsignbit); - __m128 dst2 = _mm_andnot_ps(src2, vsignbit); - __m128 dst3 = _mm_andnot_ps(src3, vsignbit); + __m128 dst0 = _mm_andnot_ps(vsignbit, src0); + __m128 dst1 = _mm_andnot_ps(vsignbit, src1); + __m128 dst2 = _mm_andnot_ps(vsignbit, src2); + __m128 dst3 = _mm_andnot_ps(vsignbit, src3); _mm_storeu_ps(y + i + 0 * V_REG_ELTS, dst0); _mm_storeu_ps(y + i + 1 * V_REG_ELTS, dst1); _mm_storeu_ps(y + i + 2 * V_REG_ELTS, dst2); @@ -55,4 +55,4 @@ ppl::common::RetCode abs_fp32_sse( return ppl::common::RC_SUCCESS; } -}}}; // namespace ppl::kernel::x86 \ No newline at end of file +}}}; // namespace ppl::kernel::x86