Config for OpenCL causes * NDRANGE_KERNEL executed abnormally *

sunzj · September 30, 2019, 2:21am

I wrote a conv2d schedule for opencl device, after tunning,i got two config. The first config worked well, but the second config caused "*** NDRANGE_KERNEL executed abnormally *** ".

I dumped the OpenCL source code:

The first one:

__kernel void fuse_conv2d_clip_39_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) { float compute[24]; __local float pad_temp_shared[224]; __local float input1_shared[48]; float pad_temp_shared_local[1]; float input1_shared_local[24]; for (int ff_init = 0; ff_init < 24; ++ff_init) { compute[ff_init] = 0.000000e+00f; } for (int rc_outer = 0; rc_outer < 12; ++rc_outer) { barrier(CLK_LOCAL_MEM_FENCE); for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) { pad_temp_shared[(((((int)get_local_id(1)) * 16) + ((((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8) * 8)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8))] = input0[((((((((int)get_group_id(1)) * 784) + (((int)get_group_id(0)) * 8)) + (rc_outer * 6272)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 14) * 3136)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 14) * 56)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8))]; } if ((((int)get_local_id(1)) * 2) < (24 - (((int)get_local_id(0)) / 2))) { if ((((int)get_group_id(2)) * 24) < ((144 - (((int)get_local_id(1)) * 2)) - (((int)get_local_id(0)) / 2))) { input1_shared[(((((int)get_local_id(1)) * 4) + ((((int)get_local_id(0)) / 2) * 2)) + (((int)get_local_id(0)) % 2))] = input1[(((((((int)get_group_id(2)) * 576) + (rc_outer * 2)) + (((int)get_local_id(1)) * 48)) + ((((int)get_local_id(0)) / 2) * 24)) + (((int)get_local_id(0)) % 2))]; } } barrier(CLK_LOCAL_MEM_FENCE); for (int rc_inner = 0; rc_inner < 2; ++rc_inner) { pad_temp_shared_local[0] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (rc_inner * 112))]; for (int ax0 = 0; ax0 < 24; ++ax0) { input1_shared_local[ax0] = input1_shared[(rc_inner + (ax0 * 2))]; } for (int ff = 0; ff < 24; ++ff) { compute[ff] = (compute[ff] + (pad_temp_shared_local[0] * input1_shared_local[ff])); } } } for (int ax1_inner_inner = 0; ax1_inner_inner < 24; ++ax1_inner_inner) { tensor[((((((((int)get_group_id(2)) * 75264) + (((int)get_group_id(1)) * 784)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(1)) * 56)) + ((int)get_local_id(0))) + (ax1_inner_inner * 3136))] = max(min((compute[ax1_inner_inner] + input2[((((int)get_group_id(2)) * 24) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f); } }

The second one:

__kernel void fuse_conv2d_clip_39_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) { float compute[24]; __local float pad_temp_shared[64]; __local float input1_shared[48]; float pad_temp_shared_local[1]; float input1_shared_local[24]; for (int ff_init = 0; ff_init < 24; ++ff_init) { compute[ff_init] = 0.000000e+00f; } for (int rc_outer = 0; rc_outer < 12; ++rc_outer) { barrier(CLK_LOCAL_MEM_FENCE); for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) { pad_temp_shared[((((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 4) * 32) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 4) * 8))] = input0[((((((((int)get_group_id(1)) * 224) + (((int)get_group_id(0)) * 8)) + (rc_outer * 6272)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 4) * 3136)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 4) * 56))]; } for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1 = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1 < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1) { if ((((int)get_local_id(1)) * 6) < (24 - ((int)get_local_id(0)))) { if (((((int)get_group_id(2)) * 24) + (((int)get_local_id(1)) * 6)) < (144 - ((int)get_local_id(0)))) { input1_shared[(((((int)get_local_id(1)) * 12) + (((int)get_local_id(0)) * 2)) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1)] = input1[(((((((int)get_group_id(2)) * 576) + (rc_outer * 2)) + (((int)get_local_id(1)) * 144)) + (((int)get_local_id(0)) * 24)) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1)]; } } } barrier(CLK_LOCAL_MEM_FENCE); for (int rc_inner = 0; rc_inner < 2; ++rc_inner) { pad_temp_shared_local[0] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (rc_inner * 32))]; for (int ax0 = 0; ax0 < 24; ++ax0) { input1_shared_local[ax0] = input1_shared[(rc_inner + (ax0 * 2))]; } for (int ff = 0; ff < 24; ++ff) { compute[ff] = (compute[ff] + (pad_temp_shared_local[0] * input1_shared_local[ff])); } } } for (int ax1_inner_inner = 0; ax1_inner_inner < 24; ++ax1_inner_inner) { tensor[((((((((int)get_group_id(2)) * 75264) + (((int)get_group_id(1)) * 224)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(1)) * 56)) + ((int)get_local_id(0))) + (ax1_inner_inner * 3136))] = max(min((compute[ax1_inner_inner] + input2[((((int)get_group_id(2)) * 24) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f); } }

Seems the error causes by the NDRANGE exceed the limit, but i am not sure, could someone explain it, thanks very much!

maksimon · November 19, 2021, 7:56am

Hi ~ Have you sloved this issue now?

Config for OpenCL causes *** NDRANGE_KERNEL executed abnormally ***

Config for OpenCL causes * NDRANGE_KERNEL executed abnormally *