Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# ppl.kernel.cpu

`ppl.kernel.cpu` 是 OpenPPL 的 CPU kernel 仓库,包含 x86、ARM、RISC-V 等后端的算子实现、头文件和测试程序。

常用目录:

- `include/`: 对外头文件
- `src/`: kernel 实现
- `test/`: 测试与 benchmark
- `cmake/`: 各架构构建脚本

## 编译

x86 默认不会自动打开,推荐使用单独构建目录:

```bash
cd //ppl.kernel.cpu
mkdir -p build
cd build
cmake -DPPLNN_USE_X86_64=ON -DPPLNN_BUILD_TESTS=ON -DPPLNNX86Kernel_HOLD_DEPS=ON ..
cmake --build . -j4
```

只编译某个测试目标:

```bash
cmake --build . --target test_abs -j4
```

常用开关:

- `PPLNN_USE_X86_64=ON`: 打开 x86 kernel
- `PPLNN_BUILD_TESTS=ON`: 生成测试目标
- `PPLNNX86Kernel_HOLD_DEPS=ON`: 冻结当前依赖
- `PPLNN_USE_OPENMP=ON`: 打开 OpenMP,多线程 benchmark 需要它

## 如何新加算子

以 x86 fp32 为例,推荐沿用这组结构:

- `include/ppl/kernel/x86/fp32/my_op.h`
- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32.cpp`
- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32_sse.cpp`
- `src/ppl/kernel/x86/fp32/my_op/my_op_fp32_avx.cpp`
- `test/test_my_op.cpp`

约定:

- `*_fp32.cpp`: 参考实现和 ISA 分发
- `*_sse.cpp` / `*_avx.cpp` / `*_fma.cpp` / `*_avx512.cpp`: ISA 特化实现
- 统一入口先判断 `isa`,再调用对应实现,否则退回 `ref`

x86 源文件在 `cmake/x86.cmake` 中通过 `GLOB_RECURSE` 自动收集,通常不需要手动登记源码文件。

## 如何测试算子

当前仓库是“一个测试目标对应一个可执行文件”,不是 gtest 风格。

新增测试时:

1. 在 `test/` 下增加 `test_my_op.cpp`
2. 在 `cmake/x86.cmake` 的 `PPLKERNELX86_TESTS` 里追加 `test_my_op`
3. 编译并运行

```bash
cd //ppl.kernel.cpu/build
cmake --build . --target test_my_op -j4
./test_my_op
```

注意:`test/` 里的程序使用 `simple_flags`,参数是单横线形式,例如 `-isa=avx`。

`test_abs` 的 correctness 用法:

```bash
./test_abs
./test_abs -isa=noarch
./test_abs -isa=sse
./test_abs -isa=avx
```

## 如何测试性能

现有 `test_gemm`、`test_conv2d`、`test_pd_conv2d` 本身就带 benchmark 参数,例如:

```bash
./test_gemm -cfg=/path/to/gemm.cfg -isa=fma -warm_up=5 -min_iter=50 -min_second=2
./test_conv2d -cfg=/path/to/conv.cfg -algo=n16cx_gemm_direct_fp32_fma -warm_up=5 -min_iter=20 -min_second=2
```

`test_abs` 现在也支持 benchmark 模式:

```bash
./test_abs -benchmark=true -validate=false -isa=avx -len=16777216 -warm_up=5 -min_iter=50 -min_second=2 -num_threads=1
```

如果构建时打开了 OpenMP,还可以控制线程数和绑核:

```bash
./test_abs -benchmark=true -validate=false -isa=avx -len=16777216 -warm_up=5 -min_iter=50 -min_second=2 -num_threads=8 -core_bind=true
```

`test_abs` benchmark 常用参数:

- `-benchmark`: 打开 benchmark 模式
- `-validate`: benchmark 前先做一次正确性校验
- `-isa`: `noarch`、`sse`、`avx` 或 `auto`
- `-len`: 输入元素个数
- `-warm_up`: 预热次数
- `-min_iter`: 最少迭代次数
- `-min_second`: 最短测试时长
- `-num_threads`: OpenMP 线程数
- `-core_bind`: 是否绑核

输出字段包括 `min_ms`、`avg_ms`、`max_gbps`、`avg_gbps` 和 `num_threads`。

对 `abs` 这类简单 unary 算子,更建议看 `GB/s` 而不是 `GFLOPS`。

如需补充底层性能指标,可以直接配合 `perf stat`:

```bash
perf stat ./test_abs -isa=avx
```
47 changes: 21 additions & 26 deletions cmake/x86.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -113,32 +113,27 @@ endif()
if(PPLNN_BUILD_TESTS)
set(__PPLNN_TOOLS_DIR__ ${CMAKE_CURRENT_SOURCE_DIR}/test)

add_executable(test_conv2d test/test_conv2d.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc)
target_include_directories(test_conv2d
PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES}
PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES})
target_compile_options(test_conv2d PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS})
target_compile_definitions(test_conv2d PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS})
target_compile_features(test_conv2d PRIVATE cxx_std_11)
target_link_libraries(test_conv2d PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES})

add_executable(test_gemm test/test_gemm.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc)
target_include_directories(test_gemm
PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} ${PPLKERNELX86_INCLUDE_DIRECTORIES}
PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES})
target_compile_options(test_gemm PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS})
target_compile_definitions(test_gemm PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS})
target_compile_features(test_gemm PRIVATE cxx_std_11)
target_link_libraries(test_gemm PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES})

add_executable(test_pd_conv2d test/test_pd_conv2d.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc)
target_include_directories(test_pd_conv2d
PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES} ${PPLKERNELX86_INCLUDE_DIRECTORIES}
PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES})
target_compile_options(test_pd_conv2d PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS})
target_compile_definitions(test_pd_conv2d PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS})
target_compile_features(test_pd_conv2d PRIVATE cxx_std_11)
target_link_libraries(test_pd_conv2d PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES})
function(pplkernelx86_add_test target_name)
add_executable(${target_name} test/${target_name}.cpp ${__PPLNN_TOOLS_DIR__}/simple_flags.cc)
target_include_directories(${target_name}
PUBLIC ${PPLKERNELX86_PUBLIC_INCLUDE_DIRECTORIES}
PRIVATE ${PPLKERNELX86_PRIVATE_INCLUDE_DIRECTORIES} ${__PPLNN_TOOLS_DIR__} ${PPLCOMMON_INCLUDES})
target_compile_options(${target_name} PRIVATE ${PPLKERNELX86_COMPILE_OPTIONS})
target_compile_definitions(${target_name} PRIVATE ${PPLKERNELX86_COMPILE_DEFINITIONS})
target_compile_features(${target_name} PRIVATE cxx_std_11)
target_link_libraries(${target_name} PRIVATE pplkernelx86_static ${PPLKERNELX86_LINK_LIBRARIES})
endfunction()

set(PPLKERNELX86_TESTS)
list(APPEND PPLKERNELX86_TESTS
test_abs
test_conv2d
test_gemm
test_pd_conv2d)

foreach(test_name IN LISTS PPLKERNELX86_TESTS)
pplkernelx86_add_test(${test_name})
endforeach()

unset(__PPLNN_TOOLS_DIR__)
endif()
6 changes: 3 additions & 3 deletions src/ppl/kernel/x86/fp32/abs/abs_fp32_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ ppl::common::RetCode abs_fp32_avx(
for (int64_t i = 0; i < unroll_body; i += unroll_n) {
__m256 src0 = _mm256_loadu_ps(x + i + 0 * V_REG_ELTS);
__m256 src1 = _mm256_loadu_ps(x + i + 1 * V_REG_ELTS);
__m256 dst0 = _mm256_andnot_ps(src0, vsignbit);
__m256 dst1 = _mm256_andnot_ps(src1, vsignbit);
__m256 dst0 = _mm256_andnot_ps(vsignbit, src0);
__m256 dst1 = _mm256_andnot_ps(vsignbit, src1);
_mm256_storeu_ps(y + i + 0 * V_REG_ELTS, dst0);
_mm256_storeu_ps(y + i + 1 * V_REG_ELTS, dst1);
}
Expand All @@ -49,4 +49,4 @@ ppl::common::RetCode abs_fp32_avx(
return ppl::common::RC_SUCCESS;
}

}}}; // namespace ppl::kernel::x86
}}}; // namespace ppl::kernel::x86
10 changes: 5 additions & 5 deletions src/ppl/kernel/x86/fp32/abs/abs_fp32_sse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ ppl::common::RetCode abs_fp32_sse(
__m128 src1 = _mm_loadu_ps(x + i + 1 * V_REG_ELTS);
__m128 src2 = _mm_loadu_ps(x + i + 2 * V_REG_ELTS);
__m128 src3 = _mm_loadu_ps(x + i + 3 * V_REG_ELTS);
__m128 dst0 = _mm_andnot_ps(src0, vsignbit);
__m128 dst1 = _mm_andnot_ps(src1, vsignbit);
__m128 dst2 = _mm_andnot_ps(src2, vsignbit);
__m128 dst3 = _mm_andnot_ps(src3, vsignbit);
__m128 dst0 = _mm_andnot_ps(vsignbit, src0);
__m128 dst1 = _mm_andnot_ps(vsignbit, src1);
__m128 dst2 = _mm_andnot_ps(vsignbit, src2);
__m128 dst3 = _mm_andnot_ps(vsignbit, src3);
_mm_storeu_ps(y + i + 0 * V_REG_ELTS, dst0);
_mm_storeu_ps(y + i + 1 * V_REG_ELTS, dst1);
_mm_storeu_ps(y + i + 2 * V_REG_ELTS, dst2);
Expand All @@ -55,4 +55,4 @@ ppl::common::RetCode abs_fp32_sse(
return ppl::common::RC_SUCCESS;
}

}}}; // namespace ppl::kernel::x86
}}}; // namespace ppl::kernel::x86
Loading