Hello, I implemented a blur op to compare TVM and Halide. The speeds of TVM and Halide are very close on the x86 platform (0.6 ms vs 0.5 ms). But on the ARM target (the arm64 Android build at the end of the script), blur_tvm is much slower than blur_halide: 4.18 ms vs 1.42 ms. I am confused by the results. Is there something wrong? Here is the code.
import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T
import numpy as np
from tvm import auto_scheduler
from tvm import te
import os
from tvm.script import tir as T
# Extents of the 2-D input tensor A (rows, columns).
M = 2568
N = 1922
# Element type used for the input placeholder and for the constant divisor.
dtype = "uint16"
# Build target for the host measurement further down.
target = "llvm -mcpu=skylake-avx512"
# target = "llvm"
dev = tvm.device(target, 0)

# Algorithm: B sums A over a 3x3 window (k1 x k2), C divides that sum by 9.
k1 = te.reduce_axis((0, 3), "k1")
k2 = te.reduce_axis((0, 3), "k2")
A = te.placeholder((M, N), name="A", dtype=dtype)
# NOTE(review): a 3x3 window only needs the output to be (M-2, N-2); the row
# extent here is M-8 -- presumably chosen to match the Halide reference, confirm.
# NOTE(review): the sum presumably stays in uint16 like A, so nine large
# inputs could wrap around -- confirm whether a wider accumulator is wanted.
B = te.compute((M-8, N-2), lambda m, n: te.sum(A[m+k1, n+k2], axis=(k1, k2)), name="B")
C = te.compute((M-8, N-2), lambda m, n: te.div(B[m, n], tvm.tir.const(9, dtype=dtype)), name="C")

# Lower the TE graph to a TensorIR PrimFunc named "main" and wrap it in a module.
func = te.create_prim_func([A, C])
func = func.with_attr("global_symbol", "main")
ir_module = IRModule({"main": func})
print(ir_module.script())

# Schedule.
sch = tvm.tir.Schedule(ir_module)
block_b = sch.get_block("B")
bx, by, bk1, bk2 = sch.get_loops(block_b)
# NOTE(review): bxo/bxi/byo/byi are never used afterwards, and compute_at
# below likely rebuilds B's loop nest, so these two splits may have no effect
# on the final schedule -- check the script printed after scheduling.
bxo, bxi = sch.split(bx, (None, 32))
byo, byi = sch.split(by, (None, 32))
block_c = sch.get_block("C")
cx, cy = sch.get_loops(block_c)
# Tile C's loops: rows by 32, columns by 16.
cxo, cxi = sch.split(cx, (None, 32))
cyo, cyi = sch.split(cy, (None, 16))
# Move the computation of B under C's outer column loop.
sch.compute_at(block_b, cyo)
# Re-query B's loops after the move; six loops are expected here.
# NOTE(review): the first three names presumably alias C's cxo/cxi/cyo loops
# that now enclose B, and yi is B's remaining spatial loop -- confirm.
block_b = sch.get_block("B")
xo, xi, yo, yi, bk1, bk2 = sch.get_loops(block_b)
# Collapse the two reduction loops into one.
sch.fuse(bk1, bk2)
# Run the outermost loop across threads and vectorize B's inner spatial loop.
sch.parallel(xo)
sch.vectorize(yi)
print(sch.mod.script())

# Build for the host target and time the kernel.
func = tvm.build(sch.mod, target=target) # The module for CPU backends.
inp = tvm.nd.array(np.ones((M, N)).astype(dtype), dev)
out = tvm.nd.array(np.zeros((M-8, N-2), dtype=dtype), dev)
# One untimed call before the measurement.
func(inp, out)
# number=10 runs per measurement; the printed value is the mean reported by
# the evaluator.
evaluator = func.time_evaluator(func.entry_name, dev, number=10)
print("after transformation: %f" % evaluator(inp, out).mean)
# Cross-compile the scheduled module for 64-bit ARM Android and export it as a
# shared library linked with the Android NDK toolchain.
import os

from tvm.contrib import ndk

# Target options are separated by whitespace only; a stray comma after the
# triple would end up inside the -mtriple value.
# NOTE(review): `sch` was scheduled with tile/vector factors picked on the
# AVX-512 host; NEON registers are 128 bits wide, so the split factors may
# need retuning for this target before comparing against Halide.
ta = tvm.target.Target("llvm -mtriple=arm64-linux-android -mattr=+neon")

# ndk.create_shared locates the NDK compiler through TVM_NDK_CC, so the
# variable has to be set before export_library is called.
# NOTE(review): the path combines a Linux-style home directory with the
# darwin-x86_64 prebuilt toolchain directory -- confirm it matches the host.
os.environ["TVM_NDK_CC"] = "/home/zzz/Android/sdk/ndk/21.1.6352462/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang++"

lib = tvm.build(sch.mod, target=ta, name="blur")

# Pass the compiler callback and its flags by keyword: positionally, the flag
# list would land in a different export_library parameter (not `options`), and
# newer TVM releases accept these arguments as keywords only.
lib.export_library(
    "/home/zzz/tvm/tmp/blur_arm.so",
    fcompile=ndk.create_shared,
    options=["-shared", "-fPIC", "-lm", "-O2", "-std=c++17"],
)