I am trying to use TVM's auto-scheduler in combination with the OpenCL target to tune convolutions on an Intel CPU. The problem I am facing is that every measured program fails with a compilation error:
No: 1 GFLOPS: 0.00 / 0.00 results: MeasureResult(error_type:RuntimeDeviceError, error_msg:Traceback (most recent call last):
File ".../python/tvm/auto_scheduler/measure.py", line 1112, in _timed_rpc_run
func.entry_func(*args)
File "...
...
1) * 1024)) + (xx_inner * 64)) + ((((int)get_group_id(0)) & 1) * 32)) + ((((int)get_local_id(0)) & 3) * 8)) + ff_inner) + 512))] = conv_local[((((((nn_inner * 256) + (yy_inner * 64)) + (xx_inner * 8)) + ff_inner) + 512))];
^
Compilation failed
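The device itself seems to be detected (the measurement runs do start); for reference, this is how I read the limits that go into the hardware parameters below (a minimal snippet):

import tvm

dev = tvm.opencl(0)
print(dev.exist)                        # True if the OpenCL device is usable
print(dev.device_name)                  # should report the Intel OpenCL CPU device
print(dev.max_threads_per_block)        # work-group size limit
print(dev.max_shared_memory_per_block)  # local memory per work-group
print(dev.warp_size)                    # sub-group size reported by the runtime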
I have defined the convolution like this:
import tvm
from tvm import te, auto_scheduler

@auto_scheduler.register_workload
def conv2d_layer(N, H, W, P, Q, CO, CI, KH, KW, stride, padding):
    data = te.placeholder((N, H, W, CI), name="data", dtype="float32")
    kernel = te.placeholder((CO, KH, KW, CI), name="kernel", dtype="float32")
    # Create reduction variables
    rc = te.reduce_axis((0, CI), name="rc")
    ry = te.reduce_axis((0, KH), name="ry")
    rx = te.reduce_axis((0, KW), name="rx")
    # Compute the convolution (NHWC layout; padding is not applied here,
    # P and Q are the precomputed output height and width)
    conv = te.compute(
        (N, P, Q, CO),
        lambda nn, yy, xx, ff: te.sum(
            data[nn, yy * stride[0] + ry, xx * stride[1] + rx, rc]
            * kernel[ff, ry, rx, rc],
            axis=[ry, rx, rc],
        ),
        name="conv",
    )
    return [data, kernel, conv]
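For reference, P and Q are the output height and width, following the usual convolution output-size arithmetic; here is a small helper of mine (out_dims is not part of the workload, and stride/padding are assumed to be (h, w) tuples):

def out_dims(H, W, KH, KW, stride, padding):
    # Standard convolution output-size formula: out = (in + 2 * pad - k) // stride + 1
    P = (H + 2 * padding[0] - KH) // stride[0] + 1
    Q = (W + 2 * padding[1] - KW) // stride[1] + 1
    return P, Q

# Example: H = W = 224, KH = KW = 3, stride = (1, 1), padding = (1, 1) -> P = Q = 224

Note that the workload above does not itself pad the input, so data must already have the padded shape for these dimensions to line up.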
This is how I start the tuning:
dev = tvm.opencl(0)
target = tvm.target.Target("opencl")

max_shared_memory_per_block = dev.max_shared_memory_per_block
max_local_memory_per_block = 2147483647  # INT32_MAX
max_threads_per_block = dev.max_threads_per_block
max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size
warp_size = dev.warp_size
hardware_params = auto_scheduler.HardwareParams(
    -1, 16, 64,
    max_shared_memory_per_block, max_local_memory_per_block,
    max_threads_per_block, max_vthread_extent, warp_size,
)

task = auto_scheduler.SearchTask(
    func=conv2d_layer,
    args=(N, H, W, P, Q, CO, CI, KH, KW, strides, padding),
    target=target,
    hardware_params=hardware_params,
)
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=trials,
    measure_callbacks=[auto_scheduler.RecordToFile(logfile)],
    verbose=2,
    runner=measure_ctx.runner,
)
task.tune(tune_option)
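To see the kernel source that fails to compile, I build the same workload with a plain hand-written schedule and dump the generated OpenCL C (a minimal sketch; the two thread bindings are arbitrary and only there because the OpenCL codegen requires them):

data, kernel, conv = conv2d_layer(N, H, W, P, Q, CO, CI, KH, KW, strides, padding)
s = te.create_schedule(conv.op)
nn, yy, xx, ff = s[conv].op.axis
# The OpenCL backend needs at least the outer loops bound to work-groups/items
s[conv].bind(nn, te.thread_axis("blockIdx.x"))
s[conv].bind(ff, te.thread_axis("threadIdx.x"))
mod = tvm.build(s, [data, kernel, conv], target)
print(mod.imported_modules[0].get_source())  # the OpenCL source handed to the driver

If even this simple kernel fails on the Intel OpenCL driver, the problem would not be specific to the auto-scheduler.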
With the CUDA backend, the same code works just fine.
Many thanks for your help!