Why do I receive the error "CUDA_ERRO​R_ILLEGAL_​ADDRESS" despite of successfully compiling the cuda mex function.

12 views (last 30 days)
Hello! I'm trying to convert a C/MEX file to Cuda Mex file with MATLAB 2019a, CUDA Toolkit version 10.0 and Visual Studio 2015 Professional. The Graphics Processing Unit in my PC is GeForce RTX 2080 Ti. The Cuda MEX codes are given below:
#include "mex.h"
#include <string.h>
#include "gpu/mxGPUArray.h"
/* Input Arguments */
#define X_IN prhs[0] // Image
#define SZ_IN prhs[1] // Patch Size
#define S_IN prhs[2] // Step Size
/* Output Arguments */
#define B_OUT plhs[0] // Output (Columnized Patches)
void __global__ im2colstep2GPU(double const *x, double *b, mwSize sz[], mwSize stepsize[], mwSize n[]){
mwIndex i, j, k, l, m, blocknum;
mwSize NN = sz[0] * sz[1] *sz[2] ;
mwSize nn = blockDim.x * blockIdx.x + threadIdx.x;
// int total_threads = gridDim.x * blockDim.x;
/* Do the actual computation */
blocknum = 0;
for(; nn<NN; nn++){
/* iterate over all blocks */
for (k=0; k<=n[2]-sz[2]; k+=stepsize[2]) {
for (j=0; j<=n[1]-sz[1]; j+=stepsize[1]) {
for (i=0; i<=n[0]-sz[0]; i+=stepsize[0]) {
/* copy single block */
for (m=0; m<sz[2]; m++) {
for (l=0; l<sz[1]; l++) {
// b[blocknum*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0]] = x[(k + m)*n[0]*n[1] + (j+l)*n[0] + i];
memcpy(b + blocknum*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray*prhs[])
mxGPUArray const *x;
mxGPUArray *b;
double *s;
double const *device_x;
double *device_b;
mwSize sz[3], stepsize[3], n[3], ndims;
/* Check for proper number of arguments */
if (nrhs < 2 || nrhs > 3) {
mexErrMsgTxt("Invalid number of input arguments.");
} else if (nlhs > 1) {
mexErrMsgTxt("Too many output arguments.");
/* Check the the input dimensions */
// ndims = mxGetNumberOfDimensions(X_IN);
// mexPrintf("\n number of dimensions is %d", ndims);
// if (!mxIsDouble(X_IN) || mxIsComplex || ndims>3) {
// mexErrMsgTxt("X should be a 2-D or 3-D double matrix.");
// }
/* Get Input Arrays*/
x = mxGPUCreateFromMxArray(X_IN);
/* Check the the input dimensions */
ndims = mxGPUGetNumberOfDimensions(x);
if (mxGPUGetClassID(x) != mxDOUBLE_CLASS || mxGPUGetClassID(x) == mxCOMPLEX || ndims>3) {
mexErrMsgTxt("X should be a 2-D or 3-D double matrix.");
if (!mxIsDouble(SZ_IN) || mxIsComplex(SZ_IN) || mxGetNumberOfDimensions(SZ_IN)>2 || mxGetM(SZ_IN)*mxGetN(SZ_IN)!=ndims) {
mexErrMsgTxt("Invalid block size.");
if (nrhs == 3) {
if (!mxIsDouble(S_IN) || mxIsComplex(S_IN) || mxGetNumberOfDimensions(S_IN)>2 || mxGetM(S_IN)*mxGetN(S_IN)!=ndims) {
mexErrMsgTxt("Invalid step size.");
/* Get parameters */
s = mxGetPr(SZ_IN);
if (s[0]<1 || s[1]<1 || (ndims==3 && s[2]<1)) {
mexErrMsgTxt("Invalid block size.");
sz[0] = (mwSize)(s[0] + 0.01);
sz[1] = (mwSize)(s[1] + 0.01);
sz[2] = ndims==3 ? (mwSize)(s[2] + 0.01) : 1;
if (nrhs == 3) {
s = mxGetPr(S_IN);
if (s[0]<1 || s[1]<1 || (ndims==3 && s[2]<1)) {
mexErrMsgTxt("Invalid step size.");
stepsize[0] = (mwSize)(s[0] + 0.01);
stepsize[1] = (mwSize)(s[1] + 0.01);
stepsize[2] = ndims==3 ? (mwSize)(s[2] + 0.01) : 1;
else {
stepsize[0] = stepsize[1] = stepsize[2] = 1;
n[0] = (mxGPUGetDimensions(x))[0];
n[1] = (mxGPUGetDimensions(x))[1];
n[2] = ndims==3 ? (mxGPUGetDimensions(x))[2] : 1;
if (n[0]<sz[0] || n[1]<sz[1] || (ndims==3 && n[2]<sz[2])) {
mexErrMsgTxt("Block size too large.");
/* Create a matrix for the return argument */
// b = mxGetPr(B_OUT);
mwSize b_dims[2];
b_dims[0] = (sz[0]*sz[1]*sz[2]);
b_dims[1] = ((n[0]-sz[0])/stepsize[0]+1)*((n[1]-sz[1])/stepsize[1]+1)*((n[2]-sz[2])/stepsize[2]+1);
b = mxGPUCreateGPUArray(2, b_dims, mxGPUGetClassID(x), /* Create the output matrix */
device_x = (double const *)(mxGPUGetDataReadOnly(x));
device_b = (double *)(mxGPUGetData(b));
/* Invoke Kernel*/
int NN = mxGPUGetNumberOfElements(x);
int const threadsPerBlock = 256;
int blocksPerGrid = (NN + threadsPerBlock - 1)/threadsPerBlock;
im2colstep2GPU<<<blocksPerGrid, threadsPerBlock>>>
(device_x, device_b, sz, stepsize, n);
B_OUT = mxGPUCreateMxArrayOnGPU(b);
* the mxGPUArray pointers are host-side structures that refer to
* device data. These must be destroyed before leaving the MEX
* function.
I have succesfully compiled the CUDA Mex function. But, when I enter the code below, I have taken the following error:
>> im2colstep2GPU(gpuArray(A),[3,3])
Error using gpuArray/gather
An unexpected error occurred during CUDA execution. The CUDA error was:
Error in
(line 73)
maybeTruncatedValue = gather( x );
Error in parallel.internal.shared.buildDisplayHelper>iBuildDisplayHelper
(line 33)
dh = iFirstNNumericDisplayHelper( ...
Error in parallel.internal.shared.buildDisplayHelper (line 24)
dh = iBuildDisplayHelper( x, transferDenseFcn, transferSparseFcn,
xClassName, xName, N );
Error in dispInternal (line 13)
dh = parallel.internal.shared.buildDisplayHelper( ...
Error in gpuArray/display (line 21)
dh = dispInternal( obj, thisClassName, objName );
Actually, it seems like that the computation have been accomplished when looking at the workspace(at least, the size of the output matches with the expected output variable). However, when I click on the output variable in the workspace, I take the following figure:
Can anyone help me on the subject of what I do for fixing this error.

Answers (1)

Joss Knight
Joss Knight on 24 Mar 2022
I suspect your kernel is writing into out-of-bounds memory. The next time the device synchronizes the error will be reported, which in your case was when you tried to display the gpuArray. To detect it earlier for debugging, add cudaDeviceSynchronize after your kernel is launched and then inspect the result of cudaGetLastError.
Exactly where your kernel is writing out of bounds will require some debugging. I suggest start by compiling your MEX functions with the -G and -g options (you may also need to add NVCC_FLAGS=-lineinfo as well to narrow it down to a line of code), then using the CUDA toolkit utility cuda-memcheck to detect the illegal access. I tend to write a script that reproduces the error and then launch it from a terminal using matlab -batch, something like
cuda-memcheck matlab -batch "myScript()"
Obviously a more laborious way to do this involves either using the NSight debugger or putting printf statements in your kernel. Note that MEX overloads printf (to display to the MATLAB command window) so you need put #undef printf at the top of your file to stop that happening. Also, try to run your kernel with the smallest possible matrix to avoid millions of lines of output. Note: the display will output to the terminal from which you launched MATLAB. Also, on Windows you need to run MATLAB with the options -wait -log to capture the output stream.
Hope this helps you find the problem. Let us know how you did!

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!

Translated by