TVM has rather poor performance on non-divisible dimensions — for example, a 127x127x127 GEMM op.
import tvm
from tvm import te
import numpy as np

# Deliberately non-divisible problem size: 127 is prime, so no split factor
# can divide it evenly.
M = N = K = 127
dtype = "float32"
target = "llvm -mcpu=skylake-avx512"
device = tvm.device(target, 0)

# Plain GEMM: C[i, j] = sum_k A[i, k] * B[k, j]
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
k = te.reduce_axis((0, K), name="k")
C = te.compute((M, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

s = te.create_schedule(C.op)
m, n = s[C].op.axis
k_axis = s[C].op.reduce_axis[0]
s[C].reorder(m, k_axis, n)
no, ni = s[C].split(n, factor=64)  # 127 % 64 != 0 -> a non-divisible split
# s[C].vectorize(ni)  # has no effect here: the inner axis is non-divisible

print(tvm.lower(s, [A, B, C], simple_mode=True))

# Fixed: tvm.target.create() is deprecated; tvm.target.Target is the current API.
# Fixed: tvm.build must be indented inside the `with` block (the original was a
# SyntaxError as posted).
with tvm.target.Target(target):
    func = tvm.build(s, [A, B, C])
print(func.get_source("asm"))

evaluator = func.time_evaluator(func.entry_name, device, number=100)
a = tvm.nd.array(np.random.rand(M, K).astype(dtype), device)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), device)
c = tvm.nd.array(np.random.rand(M, N).astype(dtype), device)
# Run the benchmark once and reuse the mean; the original invoked the
# evaluator twice, doubling the measurement cost and reporting the time and
# GFLOPS from two different runs.
mean_s = evaluator(a, b, c).mean
print("time: %f ms, GFLOPS: %f" % (mean_s * 1000, 2 * M * N * K / mean_s / 1e9))
That’s mainly because the “vectorize” primitive doesn’t take effect on a non-divisible split — in this case, the inner axis “ni”.
Do we have any plan to support such vectorization on non-divisible split dimension?