How to handle Complex input in MEX gateway function in CUDA?
7 visualizzazioni (ultimi 30 giorni)
Mostra commenti meno recenti
Moein Mozaffarzadeh
il 18 Giu 2021
Commentato: Edric Ellis
il 21 Giu 2021
Hi,
I'm trying to write a MEX gateway function to add two complex vectors. I use thrust to do the processing. However, when i want to compile the code in Matlab, i get this error :
Error using mex
C:/Users/moein.m/Documents/C++/ImageReconstruction_VisualStudioCode/Project7_TUI_CUDA3/TUI_CUDA/TUI_CUDA/test2_GPUArray_Complex.cu(55):
error: identifier "mxGetComplexDoubles" is undefined
1 error detected in the compilation of
"C:/Users/moein.m/Documents/C++/ImageReconstruction_VisualStudioCode/Project7_TUI_CUDA3/TUI_CUDA/TUI_CUDA/test2_GPUArray_Complex.cu".
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures
are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress
warning).
test2_GPUArray_Complex.cu
Error in mexcuda (line 168)
[varargout{1:nargout}] = mex(mexArguments{:});
Error in test2_GPUArray_matlabRunner (line 4)
mexcuda('-v', 'test2_GPUArray_Complex.cu' , 'NVCCFLAGS=-gencode=arch=compute_50,code=sm_50 -Xptxas
-dlcm=cg');
Here is my code:
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuda.h"
#include <iostream>
#include <mex.h>
#include "gpu/mxGPUArray.h"
//#include <cuComplex.h>
//#include <cublas_v2.h>
#include <thrust/complex.h>
#include "matrix.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
typedef thrust::complex<float> fcomp;
__global__ void add(fcomp * Device_DataRes, fcomp * Device_Data1, fcomp * Device_Data2, int N) {
int TID = threadIdx.y * blockDim.x + threadIdx.x;
int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;
int GID_RowBased = BlockOFFset + TID;
if (GID_RowBased < N) {
Device_DataRes[GID_RowBased] = Device_Data1[GID_RowBased] + Device_Data2[GID_RowBased];
}
}
void mexFunction(int nlhs, mxArray* plhs[],
int nrhs, const mxArray* prhs[]) {
int N = 1000;
int ArrayByteSize = sizeof(fcomp) * N;
fcomp* Data1;
fcomp* Device_Data1;
fcomp* Data2;
fcomp* Device_Data2;
fcomp* DataRes;
fcomp* Device_DataRes;
Data1 = static_cast<fcomp*>(mxGetComplexDoubles(prhs[0]));
//Data1 = (fcomp*)(mxGetComplexDoubles(prhs[0]));
gpuErrchk(cudaMalloc((void**)&Device_Data1, ArrayByteSize));
gpuErrchk(cudaMemcpy(Device_Data1, Data1, ArrayByteSize, cudaMemcpyHostToDevice));
Data2 = static_cast<fcomp*>(mxGetComplexDoubles(prhs[1]));
gpuErrchk(cudaMalloc((void**)&Device_Data2, ArrayByteSize));
gpuErrchk(cudaMemcpy(Device_Data2, Data2, ArrayByteSize, cudaMemcpyHostToDevice));
plhs[0] = mxCreateNumericMatrix(N, 1, mxSINGLE_CLASS, mxCOMPLEX);
DataRes = static_cast<fcomp*> (mxGetData(plhs[0]));
gpuErrchk(cudaMalloc((void**)&Device_DataRes, ArrayByteSize));
dim3 block(1024);
int GridX = (N / block.x + 1);
dim3 grid(GridX);//SystemSetup.NumberOfTransmitter
add << <grid, block >> > (Device_DataRes, Device_Data1, Device_Data2, N);
gpuErrchk(cudaMemcpy(DataRes, Device_DataRes, ArrayByteSize, cudaMemcpyDeviceToHost));
cudaFree(Device_Data1);
cudaFree(Device_Data2);
cudaFree(Device_DataRes);
}
Could you please let me know what is wrong here? It seems that "mxGetComplexDoubles" is making the problem.
Regards,
Moein.
0 Commenti
Risposta accettata
Edric Ellis
il 18 Giu 2021
Firstly, as per the doc page for interleaved complex data, you need to add the command-line flag -R2018a to use mxGetComplexDoubles. But actually I think given that you are casting to thrust::complex<float>, you actually should use mxGetComplexSingles.
You can't directly cast between the mxComplexSingle* returned by mxGetComplexSingles and thrust::complex<float>, but it should still work to perform the memcpy operations. In other words, you need to do this:
mxComplexSingle * Data1 = mxGetComplexSingles(prhs[0]);
thrust::complex<float> * Device_Data1;
gpuErrchk(cudaMalloc((void**)&Device_Data1, ArrayByteSize));
gpuErrchk(cudaMemcpy(Device_Data1, Data1, ArrayByteSize, cudaMemcpyHostToDevice));
3 Commenti
Edric Ellis
il 21 Giu 2021
It doesn't look like atomicAdd is what you want here - that's for scalar values I think. In this case, I'd simply return the gpuArray data back to MATLAB and use the builtin overload of + for gpuArray.
Più risposte (0)
Vedere anche
Categorie
Scopri di più su Startup and Shutdown in Help Center e File Exchange
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!