CUDALaunch Error: CUDA_ERROR_INVALID_VALUE

I ran into this error when I tried conv2d with an extremely large number of channels.

# Minimal reproduction of the failing grouped convolution.
# NOTE(review): assumes `import tvm` and `from tvm import relay` were executed earlier — not shown here.
# Input tensor of shape [1, 256, 100, 100] (presumably NCHW, i.e. 256 channels of 100x100 — confirm layout).
x = relay.var("x", shape=[1, 256, 100, 100])
# Weight tensor of shape [65536, 1, 3, 3]; with groups=256 below this is 65536 / 256 = 256 filters per group.
y = relay.var("y", shape=[65536, 1, 3, 3])
# Grouped 3x3 convolution with stride 1 and padding 1, split into 256 groups.
conv = relay.nn.conv2d(x, y, strides=(1, 1), padding=(1, 1), kernel_size=(3, 3), groups=256)
# Wrap the expression in a Relay function taking x and y, then in an IRModule.
func = relay.Function((x, y), conv)
mod = tvm.IRModule.from_expr(func)

Then I got this:

TVMError: Check failed: ret == 0 (-1 vs. 0) : CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
 grid=(10000,65536,1),  block=(1,1,1)
// func_name=fused_nn_conv2d_kernel0
// CUDA Source
// -----------
// Generated kernel for the grouped 3x3 conv2d. Each block accumulates one
// output element over the 3x3 taps and stores it to `compute`.
// Index usage visible below:
//   blockIdx.x - flattened output position; split with /100 and %100 into row and column.
//   blockIdx.y - output channel (used directly in the `placeholder1` and `compute` indices);
//                blockIdx.y >> 8 selects the input channel, i.e. 256 output channels per input channel.
// NOTE(review): the reported launch is grid=(10000,65536,1), block=(1,1,1). CUDA limits
// gridDim.y to 65535, which would explain CUDA_ERROR_INVALID_VALUE at launch — confirm
// against the target device's limits.
extern "C" __global__ void fused_nn_conv2d_kernel0(float* __restrict__ placeholder, float* __restrict__ placeholder1, float* __restrict__ compute) {
  // Accumulator for the single output element handled here.
  float compute_local[1];
  // One-element shared staging slots for the current input sample and weight.
  __shared__ float pad_temp_shared[1];
  __shared__ float placeholder_shared[1];
  compute_local[(0)] = 0.000000e+00f;
  // Walk the 3x3 filter window: ry_outer is the tap row, rx_outer the tap column.
  for (int ry_outer = 0; ry_outer < 3; ++ry_outer) {
    for (int rx_outer = 0; rx_outer < 3; ++rx_outer) {
      // Padded input read: yields 0 when (row + ry_outer) or (col + rx_outer) falls outside [1, 101),
      // i.e. in the 1-pixel border; otherwise loads from `placeholder`. The "- 101" shifts the padded
      // coordinates back by one row (100) and one column (1).
      pad_temp_shared[(0)] = (((((1 <= ((((int)blockIdx.x) / 100) + ry_outer)) && (((((int)blockIdx.x) / 100) + ry_outer) < 101)) && (1 <= (rx_outer + (((int)blockIdx.x) % 100)))) && ((rx_outer + (((int)blockIdx.x) % 100)) < 101)) ? placeholder[(((((((((int)blockIdx.y) >> 8) * 10000) + (ry_outer * 100)) + ((int)blockIdx.x)) + rx_outer) - 101))] : 0.000000e+00f);
      // Weight for this output channel and tap: 9 weights per output channel, 3 per tap row.
      placeholder_shared[(0)] = placeholder1[((((((int)blockIdx.y) * 9) + (ry_outer * 3)) + rx_outer))];
      // NOTE(review): no __syncthreads() between the shared writes and this read; with the
      // block=(1,1,1) launch reported above there is only one thread per block.
      compute_local[(0)] = (compute_local[(0)] + (pad_temp_shared[(0)] * placeholder_shared[(0)]));
    }
  }
  // Store the result: 10000 (= 100 * 100) outputs per output channel.
  compute[(((((int)blockIdx.y) * 10000) + ((int)blockIdx.x)))] = compute_local[(0)];
}

I can’t avoid this grouped convolution, because it is part of another, larger program.

Does anyone know how to solve this problem? Help, please :face_with_thermometer:

I also ran into this problem:

TVMError: CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
 grid=(2048,1,1),  block=(2048,1,1)

Maybe we need to make the number of elements larger than the number of threads.