// gemm_gpu_mult_thread.cu
// From: interestingLSY/CUDA-From-Correctness-To-Performance-Code (master branch, GitHub)

#include "gemm_gpu_mult_thread.h"

#include <cuda_runtime_api.h>

// gemm_gpu_mult_thread_kernel - integer GEMM (C = A * B) on the GPU, computed
// by the threads of a single block.
//
// Layout: every matrix is indexed row-major (element (r, c) of a matrix with
// `cols` columns lives at index r * cols + c).
//
// Launch contract:
//   - 1D block; only threadIdx.x / blockDim.x are consulted.
//   - Intended for a single block: blockIdx is never read, so every extra
//     block would redundantly recompute and rewrite the same rows.
//   - Any block size >= 1 is valid. Thread t handles rows t, t + blockDim.x,
//     t + 2 * blockDim.x, ... so a block with fewer than n threads still
//     covers every row, and threads with t >= n do nothing.
//   - No shared memory is used.
//
// Index arithmetic is done in `int`, so n * k, k * m and n * m must each fit
// in an int.
__global__
void gemm_gpu_mult_thread_kernel(
	int* __restrict__ C,		// [n, m], on gpu (output)
	const int* __restrict__ A,	// [n, k], on gpu
	const int* __restrict__ B,	// [k, m], on gpu
	const int n,
	const int m,
	const int k
) {
	// Block-stride loop over the rows of C. The `i < n` bound keeps surplus
	// threads from touching memory past the last row; the stride lets a block
	// smaller than n still produce every row.
	for (int i = threadIdx.x; i < n; i += blockDim.x) {
		for (int j = 0; j < m; ++j) {
			// Dot product of row i of A with column j of B.
			int res = 0;
			for (int l = 0; l < k; ++l) {
				res += A[i * k + l] * B[l * m + j];
			}
			C[i * m + j] = res;
		}
	}
}

// gemm_gpu_mult_thread - host-side launcher for gemm_gpu_mult_thread_kernel.
//
// Computes C = A * B for row-major int matrices that already reside in device
// memory, using one thread per output row and one block per launch.
//
// A block cannot hold more than 1024 threads, so when n exceeds that limit
// the rows are processed in consecutive slabs of at most 1024 rows, each slab
// by its own single-block launch on the default stream. Slabs write disjoint
// rows of C, so they do not interfere with each other.
//
// Kernel launches are asynchronous and this function does not inspect their
// status: callers that need error reporting or completed results should call
// cudaGetLastError() and synchronize (or issue a blocking copy) afterwards.
void gemm_gpu_mult_thread(
	int* __restrict__ C,		// [n, m], on gpu (output)
	const int* __restrict__ A,	// [n, k], on gpu
	const int* __restrict__ B,	// [k, m], on gpu
	const int n,
	const int m,
	const int k
) {
	// An empty output needs no work; returning here also avoids a launch with
	// zero (or a negative number of) threads, which is an invalid configuration.
	if (n <= 0 || m <= 0) {
		return;
	}

	// Per-block thread limit (1024 per the CUDA compute-capability tables).
	const int max_threads_per_block = 1024;

	int rows_done = 0;
	while (rows_done < n) {
		const int rows_left = n - rows_done;
		const int rows = rows_left < max_threads_per_block ? rows_left : max_threads_per_block;

		// Shift C and A to the first row of this slab; B is shared by all
		// slabs. Offsets are computed in 64-bit to avoid int overflow.
		gemm_gpu_mult_thread_kernel<<<1, rows>>>(
			C + static_cast<long long>(rows_done) * m,
			A + static_cast<long long>(rows_done) * k,
			B,
			rows, m, k
		);

		rows_done += rows;
	}
}