GPU speed up, matrix multiplication
22 visualizzazioni (ultimi 30 giorni)
Mostra commenti meno recenti
I use a GPU (Tesla K80) to speed up matrix multiplication in MATLAB 2016a with CUDA 7.5. At first, the procedure runs fast, about 0.0001 s per loop; after a certain number of iterations, it runs slowly, about 0.04 s per loop. ############################ main.m clear; A = 100 * 100000; C = 100 * 100000; for i = 1:10000 tic; B = MatrixMul(A, C); toc; end ############## MatrixMul.cu

if true
#include <climits>

#include "mex.h"
#include "gpu/mxGPUArray.h"
/**
 * Device kernel: each thread computes one element of B = A * C^T.
 *
 * All matrices are addressed in column-major order:
 *   A(r, k) = A[rowsA * k + r]   (rowsA x colsA)
 *   C(c, k) = C[rowsC * k + c]   (rowsC x colsA columns are read)
 *   B(r, c) = B[rowsA * c + r]   (rowsA x rowsC)
 *
 * @param A      input matrix A (read-only)
 * @param C      input matrix C (read-only)
 * @param B      output matrix, rowsA * rowsC elements; every element is overwritten
 * @param N      unused; kept so existing launch sites keep working
 * @param rowsA  number of rows of A (and of B)
 * @param rowsC  number of rows of C (number of columns of B)
 * @param colsA  number of columns of A; length of each dot product
 * @param colsC  unused; the loop bound is colsA, so C must have at least colsA columns
 */
__global__ void TimesTwo(double const * const A,
                         double const * const C,
                         double * const B,
                         int const N,
                         int const rowsA,
                         int const rowsC,
                         int const colsA,
                         int const colsC)
{
    /* Flat index of the output element handled by this thread. */
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

    /* Threads past the end of B (last, partially filled block) do nothing. */
    if (i >= rowsA * rowsC) {
        return;
    }

    int const co_x = i % rowsA;  /* row of B, i.e. row of A */
    int const co_y = i / rowsA;  /* column of B, i.e. row of C */

    /* Accumulate locally and store once, instead of updating B[i] in memory
       on every iteration of the loop. */
    double sum = 0;
    for (int j = 0; j < colsA; j++) {
        sum += A[rowsA * j + co_x] * C[rowsC * j + co_y];
    }
    B[i] = sum;
}
/**
 * MEX gateway: B = MatrixMul(A, C).
 *
 * Takes two real double 2-D matrices with the same number of columns, launches
 * the TimesTwo kernel on them, and returns the rowsA x rowsC result as a
 * gpuArray in plhs[0].
 *
 * Raises "parallel:gpu:mexGPUExample:InvalidInput" when the number of inputs
 * is not 2, when an input is not a real double 2-D array, when the column
 * counts differ, or when a dimension / the output element count does not fit
 * in the int parameters of the kernel.
 *
 * @param nlhs  number of requested outputs (not inspected)
 * @param plhs  output array; plhs[0] receives the result
 * @param nrhs  number of inputs; must be 2
 * @param prhs  inputs: prhs[0] = A, prhs[1] = C
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
    char const * const errMsg = "Invalid input to MEX file.";
    int const threadsPerBlock = 256;

    /* Initialize the MathWorks GPU API. */
    mxInitGPU();

    if (nrhs != 2) {
        mexErrMsgIdAndTxt(errId, errMsg);
    }

    mxGPUArray const *A = mxGPUCreateFromMxArray(prhs[0]);
    mxGPUArray const *C = mxGPUCreateFromMxArray(prhs[1]);

    /* The kernel reads the data as real doubles and indexes it as 2-D. */
    bool valid = mxGPUGetClassID(A) == mxDOUBLE_CLASS
              && mxGPUGetClassID(C) == mxDOUBLE_CLASS
              && mxGPUGetComplexity(A) == mxREAL
              && mxGPUGetComplexity(C) == mxREAL
              && mxGPUGetNumberOfDimensions(A) == 2
              && mxGPUGetNumberOfDimensions(C) == 2;

    size_t nrowsA = 0;
    size_t ncolsA = 0;
    size_t nrowsC = 0;
    size_t ncolsC = 0;

    if (valid) {
        /* mxGPUGetDimensions returns arrays that the caller must release. */
        mwSize const *dimsA = mxGPUGetDimensions(A);
        mwSize const *dimsC = mxGPUGetDimensions(C);
        nrowsA = dimsA[0];
        ncolsA = dimsA[1];
        nrowsC = dimsC[0];
        ncolsC = dimsC[1];
        mxFree(const_cast<mwSize *>(dimsA));
        mxFree(const_cast<mwSize *>(dimsC));

        size_t const maxInt = static_cast<size_t>(INT_MAX);

        /* The kernel walks colsA columns of both inputs, so the column counts
           must agree; the kernel takes int sizes, so everything, including
           the output element count, has to fit in an int. */
        valid = ncolsA == ncolsC
             && nrowsA <= maxInt
             && nrowsC <= maxInt
             && ncolsA <= maxInt
             && (nrowsA == 0 || nrowsC <= maxInt / nrowsA);
    }

    if (!valid) {
        /* Release the GPU array handles before raising the MATLAB error. */
        mxGPUDestroyGPUArray(A);
        mxGPUDestroyGPUArray(C);
        mexErrMsgIdAndTxt(errId, errMsg);
    }

    double const *d_A = static_cast<double const *>(mxGPUGetDataReadOnly(A));
    double const *d_C = static_cast<double const *>(mxGPUGetDataReadOnly(C));

    /* Output is rowsA x rowsC; the kernel overwrites every element, so the
       buffer does not need to be initialized. */
    mwSize dims[2] = {nrowsA, nrowsC};
    mxGPUArray *B = mxGPUCreateGPUArray(2,
                                        dims,
                                        mxGPUGetClassID(A),
                                        mxGPUGetComplexity(A),
                                        MX_GPU_DO_NOT_INITIALIZE);
    double *d_B = static_cast<double *>(mxGPUGetData(B));

    int const N = static_cast<int>(nrowsA * nrowsC);

    /* A grid with zero blocks is not a valid launch configuration, so skip
       the launch entirely when the result is empty. */
    if (N > 0) {
        int const blocksPerGrid = (N - 1) / threadsPerBlock + 1;
        TimesTwo<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_B, N,
                                                     static_cast<int>(nrowsA),
                                                     static_cast<int>(nrowsC),
                                                     static_cast<int>(ncolsA),
                                                     static_cast<int>(ncolsC));
    }

    /* Wrap the result as a gpuArray for MATLAB. */
    plhs[0] = mxGPUCreateMxArrayOnGPU(B);

    /* Destroy the mxGPUArray handles created in this function. */
    mxGPUDestroyGPUArray(A);
    mxGPUDestroyGPUArray(B);
    mxGPUDestroyGPUArray(C);
}

<<

>>
end
0 Commenti
Risposte (1)
Joss Knight
il 28 Apr 2018
tic and toc are not giving the correct timings for your first set of iterations, because your kernels are launching asynchronously. You need to use gputimeit or add a call to wait(gpuDevice).
Also, your kernel is not efficient; you should be using cuBLAS to perform matrix multiplication.
0 Commenti
Vedere anche
Categorie
Scopri di più su GPU CUDA and MEX Programming in Help Center e File Exchange
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!