Undetected type issue

Just realized that TVM does not detect the following type issue (the shape of matrix B is incompatible with the size of the iteration space) and thus generates incorrect CUDA code without issuing an error message:

# express C[i,j] = A[i,j] * B[j,i]
A = te.placeholder((I, J), dtype='float32', name='A')
B = te.placeholder((I, J), dtype='float32', name='B')  # tvm.lower does not recognize the incorrect shape here (should be (J, I))
C = te.compute(
    (I, J),
    lambda i, j: A[i, j] * B[j, i]
)
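
The reason this silently goes wrong, as far as I can tell from the lowering: with the declared shape (I, J), the row-major stride of B's first axis is J, so the access B[j, i] is flattened to offset j * J + i, and since j runs up to J - 1 the generated kernel reads past the end of B's buffer. A quick sketch of the index arithmetic in plain Python (the stride assumption is mine, based on TVM's row-major storage flattening):

# flat offset for B[j, i] when B is declared with shape (I, J):
# the stride of the first axis is J, so offset = j * J + i
I, J = 16, 32
max_offset = (J - 1) * J + (I - 1)  # largest j and i in the (I, J) iteration space
print(max_offset)                   # 1007
print(max_offset >= I * J)          # True: past the end of the 512-element buffer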

Complete example:

import tvm
from tvm import te
import numpy as np

# define input size
I = 16
J = 32

# express C[i,j] = A[i,j] * B[j,i]
A = te.placeholder((I, J), dtype='float32', name='A')
B = te.placeholder((I, J), dtype='float32', name='B')  # tvm.lower does not detect the incorrect shape here (should be (J, I))
C = te.compute(
    (I, J),
    lambda i, j: A[i, j] * B[j, i]
)

# create simple CUDA schedule
s = te.create_schedule([C.op])
s[C].bind(C.op.axis[0], te.thread_axis('threadIdx.x'))

# lower to CUDA
module = tvm.lower(s, [A, B, C])
rt_mod = tvm.build(module, target='cuda')

# execute and check result
dev = tvm.cuda(0)
A_np = np.random.uniform(size=(I, J)).astype(np.float32)
B_np = np.random.uniform(size=(I, J)).astype(np.float32)
C_gold = np.zeros((I, J), dtype=np.float32)
for i in range(I):
    for j in range(J):
        # reference result: read B_np's buffer as if it had the intended
        # shape (J, I), i.e. at flat offset j * I + i
        C_gold[i, j] = A_np[i, j] * B_np.flatten()[j * I + i]
A_tvm = tvm.nd.array(A_np, device=dev)
B_tvm = tvm.nd.array(B_np, device=dev)
C_tvm = tvm.nd.empty((I,J), device=dev)
rt_mod(A_tvm, B_tvm, C_tvm)
print('result correct: ' + str(np.allclose(C_tvm.numpy(), C_gold, rtol=1e-3)))
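
Two things that helped me pin this down. First, the flattened index is visible in the lowered TIR; on my build the load from B shows up with an offset along the lines of j * 32 + i, which runs past the 16 * 32 buffer for j >= 16:

print(tvm.lower(s, [A, B, C], simple_mode=True))

Second, the out-of-bounds read can at least be trapped at run time with TVM's bounds-check instrumentation. This is only a sketch; I have not checked whether the inserted asserts are supported in CUDA device code:

# rebuild with bounds checks inserted into the TIR
with tvm.transform.PassContext(config={"tir.instrument_bound_checkers": True}):
    rt_mod_checked = tvm.build(tvm.lower(s, [A, B, C]), target='cuda')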