-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathgemm_gpu_mult_thread.cu
More file actions
35 lines (32 loc) · 814 Bytes
/
gemm_gpu_mult_thread.cu
File metadata and controls
35 lines (32 loc) · 814 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include "gemm_gpu_mult_thread.h"
#include <cuda_runtime_api.h>
// gemm_gpu_mult_thread - integer GEMM (C = A * B, row-major) on the GPU.
// Each kernel launch uses a single thread block with one thread per row of C.
// Integer matrix-multiply kernel: C = A * B, all matrices stored row-major.
//
// Work distribution: each thread computes whole rows of C. Rows are handed
// out with a strided loop over the flat 1-D thread index, so the kernel is
// correct for any 1-D launch configuration:
//   * threads whose index is >= n do nothing (no out-of-bounds access);
//   * a launch with fewer than n threads still covers every row.
// When launched as <<<1, n>>>, thread i computes exactly row i.
//
// The kernel uses no shared memory and needs no synchronization.
//
// Parameters:
//   C - output matrix,  n x m ints, device memory
//   A - left operand,   n x k ints, device memory
//   B - right operand,  k x m ints, device memory
//   n - number of rows of A and C
//   m - number of columns of B and C
//   k - number of columns of A / rows of B (k <= 0 yields an all-zero C)
__global__
void gemm_gpu_mult_thread_kernel(
    int* __restrict__ C,        // [n, m], on gpu
    const int* __restrict__ A,  // [n, k], on gpu
    const int* __restrict__ B,  // [k, m], on gpu
    const int n,
    const int m,
    const int k
) {
    // 64-bit row index: keeps i * k + l and i * m + j from overflowing a
    // 32-bit int on large matrices, and lets the loop counter step past n
    // safely.
    const long long first_row =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    const long long row_stride =
        static_cast<long long>(gridDim.x) * blockDim.x;

    for (long long i = first_row; i < n; i += row_stride) {
        for (int j = 0; j < m; ++j) {
            // Dot product of row i of A with column j of B.
            int res = 0;
            for (int l = 0; l < k; ++l) {
                res += A[i * k + l] * B[static_cast<long long>(l) * m + j];
            }
            C[i * m + j] = res;
        }
    }
}
// Host-side launcher: computes C = A * B for row-major int matrices that
// already reside in device memory.
//
// Every launch uses one thread block with one thread per row of C. A block
// cannot hold more than 1024 threads, so when n is larger the rows are
// processed in consecutive chunks, each launched on the matching row slice
// of A and C. The launches are issued on the default stream and are not
// synchronized here; the caller must synchronize before reading C.
//
// If a launch fails, the remaining chunks are skipped and the error is left
// recorded in the CUDA runtime so the caller can retrieve it with
// cudaGetLastError().
//
// Parameters:
//   C - output matrix,  n x m ints, device memory
//   A - left operand,   n x k ints, device memory
//   B - right operand,  k x m ints, device memory
//   n, m, k - matrix dimensions; the call is a no-op when n <= 0 or m <= 0
void gemm_gpu_mult_thread(
    int* __restrict__ C,        // [n, m], on gpu
    const int* __restrict__ A,  // [n, k], on gpu
    const int* __restrict__ B,  // [k, m], on gpu
    const int n,
    const int m,
    const int k
) {
    // C has no elements, so there is nothing to compute. This also avoids a
    // zero-thread launch, which is an invalid configuration.
    if (n <= 0 || m <= 0) {
        return;
    }

    // Upper limit on threads per block for all currently supported devices.
    const int kMaxThreadsPerBlock = 1024;

    int row0 = 0;
    while (row0 < n) {
        const int remaining = n - row0;
        const int rows =
            remaining < kMaxThreadsPerBlock ? remaining : kMaxThreadsPerBlock;

        // Offsets are computed in size_t so row0 * m / row0 * k cannot
        // overflow a 32-bit int on large matrices.
        int* c_chunk = C + static_cast<size_t>(row0) * m;
        // When k <= 0 the kernel never reads A, so skip the offset entirely.
        const int* a_chunk = (k > 0) ? A + static_cast<size_t>(row0) * k : A;

        gemm_gpu_mult_thread_kernel<<<1, rows>>>(c_chunk, a_chunk, B, rows, m, k);

        // Peek rather than Get: the error stays recorded for the caller.
        if (cudaPeekAtLastError() != cudaSuccess) {
            return;
        }
        row0 += rows;
    }
}