I am trying to write a TE compute and schedule that generate C code for a custom accelerator. As a first step I am implementing a simple convolution with NCHW layout. There are several constraints:
- the input, kernel and conv output buffers (or sub-buffers) need to be cached; this is achieved with DMA copies using the `InjectCopyIntrin` functionality (see the sketch after this list)
- the conv computation needs to be done using an intrinsic
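For context, the `dma_copy` pragmas in the schedule below are lowered by registering TVM's `InjectCopyIntrin` pass with a custom callback, roughly like this (a minimal sketch; `my_inject_copy` is a stand-in for my real callback, and the argument list to `tvm.lower` uses the tensors defined further down):

```python
import tvm

def my_inject_copy(src, dst, pad_before, pad_after, pad_value):
    # Inspect the src/dst buffer scopes to pick a DMA direction and return a
    # tir.Stmt that calls the accelerator's load/store routine (VTA does the
    # same in its InjectDMAIntrin pass).
    raise NotImplementedError

# Rewrite every loop nest tagged with the "dma_copy" pragma via the callback.
dma_pass = tvm.tir.transform.InjectCopyIntrin("dma_copy", my_inject_copy)

with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, dma_pass)]}):
    mod = tvm.lower(s, [Input, Filter, out_buf])
```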
The compute is:

```python
# Reduction axes over the input channels and the kernel window.
rc = te.reduce_axis((0, channel), name="rc")
ry = te.reduce_axis((0, kernel_h), name="ry")
rx = te.reduce_axis((0, kernel_w), name="rx")

temp = my_pad(Input, pad_before, pad_after, name="pad_temp")
weight_buf = te.compute((num_filter, channel, kernel_h, kernel_w), lambda *i: Filter(*i), "weight_buf")
conv_result = te.compute(
    (batch, out_channel, out_height, out_width),
    lambda nn, ff, yy, xx: te.sum(
        temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype)
        * weight_buf[ff, rc, ry, rx].astype(out_dtype),
        axis=[rc, ry, rx],
    ),
    name="conv_result",
    tag="conv2d_nchw",
)
out_buf = te.compute((batch, out_channel, out_height, out_width), lambda *i: conv_result(*i), "out_buf")
```
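The schedule below looks stages up through an `op_by_name` dict, which is just a name-to-op lookup built from the ops above, along these lines:

```python
from tvm import te

s = te.create_schedule(out_buf.op)
# Map each op's name ("pad_temp", "weight_buf", ...) to the op itself so the
# schedule primitives can be applied by name.
op_by_name = {t.op.name: t.op for t in (temp, weight_buf, conv_result, out_buf)}
```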
and the schedule is:
```python
conv_axis_nn, conv_axis_ff, conv_axis_yy, conv_axis_xx = op_by_name["conv_result"].axis
rc, ry, rx = op_by_name["conv_result"].reduce_axis

# Cache the padded input in the input buffer, loaded per output row.
s[op_by_name["pad_temp"]].compute_at(s[op_by_name["conv_result"]], conv_axis_yy)
s[op_by_name["pad_temp"]].pragma(s[op_by_name["pad_temp"]].op.axis[2], "dma_copy")
s[op_by_name["pad_temp"]].set_scope("local.inp_buffer")

# Cache the weights in the weight buffer, loaded per output channel.
s[op_by_name["weight_buf"]].compute_at(s[op_by_name["conv_result"]], conv_axis_ff)
s[op_by_name["weight_buf"]].pragma(s[op_by_name["weight_buf"]].op.axis[0], "dma_copy")
s[op_by_name["weight_buf"]].set_scope("local.wgt_buffer")

# Accumulate into the accumulator buffer, one output channel at a time.
s[op_by_name["conv_result"]].compute_at(s[op_by_name["out_buf"]], s[op_by_name["out_buf"]].op.axis[1])
s[op_by_name["conv_result"]].set_scope("local.acc_buffer")

_, in_channel, in_height, in_width = op_by_name["conv_result"].input_tensors[0].shape
out_channels, _, k_height, k_width = op_by_name["conv_result"].input_tensors[1].shape
ciha_prod = intrin_dotproduct(out_channels, in_channel, in_width, k_height, k_width)
s[op_by_name["conv_result"]].tensorize(rc, ciha_prod)

# Copy the finished result back out.
s[op_by_name["out_buf"]].pragma(s[op_by_name["out_buf"]].op.axis[2], "dma_copy")
s[op_by_name["out_buf"]].parallel(s[op_by_name["out_buf"]].op.axis[1])
```
However, when I try to tensorize `conv_result` I get an error:

```
(info->head_address.defined()) is false: conv_result is not adddressable
```
Without tensorize, the generated C code looks correct on inspection (note that the DMA intrinsics and buffer scopes are taken from VTA):
```c
void* weight_buf = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)18432, 2, 32);
if (weight_buf == NULL) {
  return -1;
}
void* conv_result = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)196, 2, 32);
if (conv_result == NULL) {
  return -1;
}
void* pad_temp = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)55296, 2, 32);
if (pad_temp == NULL) {
  return -1;
}
(void)VTALoadBuffer2D(ctx_cache_, placeholder1, (i1 * 18), 1, 4608, 0, 0, 0, 0, 0, 0, weight_buf, 5);
for (int32_t yy = 0; yy < 7; ++yy) {
  for (int32_t i11 = 0; i11 < 512; ++i11) {
    int32_t _1 = 1 - yy;
    int32_t _2 = yy - 5;
    (void)VTALoadBuffer2D(ctx_cache_, placeholder, (((((i11 * 49) + (yy * 7)) + (((_1) > (0) ? (_1) : (0)) * 7)) - 7) >> 4), 1, 1, 1, 1, ((_1) > (0) ? (_1) : (0)), 1, ((_2) > (0) ? (_2) : (0)), ((i11 * 27) / 4), pad_temp, 5);
  }
  for (int32_t xx = 0; xx < 7; ++xx) {
    ((float*)conv_result)[((yy * 7) + xx)] = 0.000000e+00f;
    for (int32_t rc = 0; rc < 512; ++rc) {
      for (int32_t ry = 0; ry < 3; ++ry) {
        for (int32_t rx = 0; rx < 3; ++rx) {
          ((float*)conv_result)[((yy * 7) + xx)] = (((float*)conv_result)[((yy * 7) + xx)] + (((float*)pad_temp)[((((rc * 27) + (ry * 9)) + xx) + rx)] * ((float*)weight_buf)[(((rc * 9) + (ry * 3)) + rx)]));
        }
      }
    }
  }
}
(void)VTAStoreBuffer2D(ctx_cache_, 0, conv_result, 4, out_buf, ((i1 * 49) >> 4), 1, 49, 0);
```
Do you have any ideas what this error means?