As I understand it, AutoTVM implements operators based on operation and schedule definitions. For example, there are pack and nopack implementations in x86/dense.py. During autotuning, the configs defined by the implementation are fed into the actual measurement, but I could not find any correctness verification similar to the tvm.testing.assert_allclose() interface anywhere in the tuning process. So I extracted the pack and nopack implementations from x86/dense.py in full, checked the results with tvm.testing.assert_allclose(), and found:
- If an axis is split in the operation itself, the tile size must evenly divide the length of that axis; otherwise the calculation result cannot be guaranteed to be correct;
- For other axes that are not split in the operation in advance, the tile size may divide the axis length evenly, but it does not have to;
- Next, I compared the lowered code for these cases and found that, in the first case above, the code generated with a tile size that does not divide the axis evenly is wrong;
- So I think this should be a code generation bug.
For example, I tested the nopack implementation from x86/dense.py with M, N, K = 1, 1000, 512 and Tm, Tn, Tk = 1, 100, 10 (note that K = 512 is not divisible by Tk = 10):
produce compute {
parallel (y.outer.x.outer.fused, 0, 10) {
produce compute {
for (z.y.fused.init, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused.init*10)), 1, 10)] = x10(0f)
}
for (k, 0, 51) {
for (z.y.fused, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused*10)), 1, 10)] = (compute[ramp(((y.outer.x.outer.fused*1000) + (z.y.fused*10)), 1, 10)] + (data[ramp((k*10), 1, 10)]*weight[ramp((((y.outer.x.outer.fused*51200) + (z.y.fused*512)) + (k*10)), 1, 10)]))
}
}
}
for (x.inner, 0, 100) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = 0f
for (kk, 0, 10) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = (compute[((y.outer.x.outer.fused*100) + x.inner)] + compute[(((y.outer.x.outer.fused*1000) + (x.inner*10)) + kk)])
}
}
}
}
Traceback (most recent call last):
File "OpsGemm/gemm_v3_scheduling.py", line 388, in <module>
buildandevaluation(s, data, weight, out, a, bt, ct, ctx, ct_np)
File "OpsGemm/gemm_v3_scheduling.py", line 44, in buildandevaluation
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
File "/root/tvm/python/tvm/testing.py", line 29, in assert_allclose
np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True)
File "/anaconda3/lib/python3.7/site-packages/numpy/testing/_private/utils.py", line 1452, in assert_allclose
verbose=verbose, header=header, equal_nan=equal_nan)
File "/anaconda3/lib/python3.7/site-packages/numpy/testing/_private/utils.py", line 789, in assert_array_compare
raise AssertionError(msg)
AssertionError:
Not equal to tolerance rtol=1e-05, atol=1e-07
(mismatch 100.0%)
x: array([128.66817 , 119.130806, 129.70555 , 126.0419 , 126.232285,
129.39488 , 128.2362 , 124.842926, 128.9357 , 126.89033 ,
132.58101 , 128.5313 , 129.82468 , 129.89973 , 125.16623 ,...
y: array([128.91081 , 119.22253 , 130.08371 , 126.27693 , 126.305466,
129.55656 , 128.48164 , 124.97862 , 129.08452 , 127.26959 ,
132.87903 , 128.80084 , 129.95262 , 130.14275 , 125.569626,...
If M, N, K = 1, 1000, 512 and Tm, Tn, Tk = 1, 100, 16:
produce compute {
parallel (y.outer.x.outer.fused, 0, 10) {
produce compute {
for (z.y.fused.init, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused.init*16)), 1, 16)] = x16(0f)
}
for (k, 0, 32) {
for (z.y.fused, 0, 100) {
compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused*16)), 1, 16)] = (compute[ramp(((y.outer.x.outer.fused*1600) + (z.y.fused*16)), 1, 16)] + (data[ramp((k*16), 1, 16)]*weight[ramp((((y.outer.x.outer.fused*51200) + (z.y.fused*512)) + (k*16)), 1, 16)]))
}
}
}
for (x.inner, 0, 100) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = 0f
for (kk, 0, 16) {
compute[((y.outer.x.outer.fused*100) + x.inner)] = (compute[((y.outer.x.outer.fused*100) + x.inner)] + compute[(((y.outer.x.outer.fused*1600) + (x.inner*16)) + kk)])
}
}
}
}
time: 0.000062
If M, N, K = 1, 1000, 512 and Tm, Tn, Tk = 1, 23, 16 (note that N = 1000 is not divisible by Tn = 23):
produce compute {
parallel (y.outer.x.outer.fused, 0, 44) {
produce compute {
for (z.y.fused.init, 0, 23) {
compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused.init*16)), 1, 16)] = x16(0f)
}
for (k, 0, 32) {
for (z.y.fused, 0, 23) {
if (likely((((y.outer.x.outer.fused*23) + z.y.fused) < 1000))) {
compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused*16)), 1, 16)] = (compute[ramp(((y.outer.x.outer.fused*368) + (z.y.fused*16)), 1, 16)] + (data[ramp((k*16), 1, 16)]*weight[ramp((((y.outer.x.outer.fused*11776) + (z.y.fused*512)) + (k*16)), 1, 16)]))
}
}
}
}
for (x.inner, 0, 23) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
compute[((y.outer.x.outer.fused*23) + x.inner)] = 0f
}
for (kk, 0, 16) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
if (likely((((y.outer.x.outer.fused*23) + x.inner) < 1000))) {
compute[((y.outer.x.outer.fused*23) + x.inner)] = (compute[((y.outer.x.outer.fused*23) + x.inner)] + compute[(((y.outer.x.outer.fused*368) + (x.inner*16)) + kk)])
}
}
}
}
}
}
time: 0.000958
(P.S. To make the lowered code easier to read, I disabled all unroll operations.)
Is there something I missed? And how does AutoTVM ensure the correctness of the results during the autotuning process?