Undetected parallelization issue

It seems that TVM cannot detect the situation where two operation axes are bound to the same thread axis – in such a case TVM generates incorrect CUDA code without issuing any error message:

# express Y[i,j] = 2 * X[i,j]  (I, J are the tensor extents; defined in the complete example below)
X = te.placeholder((I,J), dtype='float32')
Y = te.compute(
    (I,J),
    lambda i,j: 2 * X[i,j]
)

# create simple CUDA schedule
# NOTE: both axes of Y are bound to the SAME thread axis — this is the bug trigger
s = te.create_schedule([Y.op])
s[Y].bind(Y.op.axis[0], te.thread_axis('threadIdx.x'))
s[Y].bind(Y.op.axis[1], te.thread_axis('threadIdx.x'))  # tvm.lower does not detect double bind to threadIdx.x

Complete example:

import tvm
from tvm import te
import numpy as np

# define input size
I = 16
J = 32

# express Y[i,j] = 2 * X[i,j]
X = te.placeholder((I,J), dtype='float32')
Y = te.compute(
    (I,J),
    lambda i,j: 2 * X[i,j]
)

# create simple CUDA schedule
s = te.create_schedule([Y.op])
s[Y].bind(Y.op.axis[0], te.thread_axis('threadIdx.x'))
s[Y].bind(Y.op.axis[1], te.thread_axis('threadIdx.x'))  # tvm.lower does not detect double bind to threadIdx.x

# lower to CUDA
module = tvm.lower(s, [X, Y])
rt_mod = tvm.build(module, target='cuda')

# execute and check result
dev = tvm.cuda(0)
X_np = np.random.uniform(size=(I,J)).astype(np.float32)
Y_gold = 2 * X_np  # vectorized reference, equivalent to the elementwise loop

X_tvm = tvm.nd.array(X_np, device=dev)
Y_tvm = tvm.nd.empty((I,J), device=dev)
rt_mod(X_tvm, Y_tvm)

# Use np.allclose (returns a bool) so a wrong result actually prints False.
# np.testing.assert_allclose raises on mismatch instead of returning a falsy
# value, so the previous `... is None` idiom could only ever print True or
# crash with a traceback — it never reported False.
# (.numpy() replaces the deprecated .asnumpy(); both exist in the TVM
# versions that provide tvm.cuda(0) used above.)
print('result correct: ' + str(np.allclose(Y_tvm.numpy(), Y_gold, rtol=1e-3)))