Program with no schedule:
import time

import tvm
from tvm import relax
from tvm.script import ir as I
from tvm.script import relax as R
from tvm.script import tir as T


@I.ir_module
class Module:
    @T.prim_func
    def dense_loop(VAL: T.handle, VEC: T.handle, OUT: T.handle):
        # val holds a dense 5000 x 5000 float64 matrix stored flat, row-major
        val = T.match_buffer(VAL, (25000000,), "float64")
        vec = T.match_buffer(VEC, (5000,), "float64")
        out = T.match_buffer(OUT, (5000,), "float64")
        for j in T.serial(5000):
            for i in T.serial(5000):
                with T.block("db0"):
                    # vj is the reduction axis, vi the spatial axis of out
                    vj = T.axis.reduce(5000, j)
                    vi = T.axis.spatial(5000, i)
                    with T.init():
                        out[vi] = T.float64(0)
                    out[vi] = out[vi] + val[vj * 5000 + vi] * vec[vj]

    @R.function
    def main(
        vec: R.Tensor(("k",), dtype="float64"),
        val: R.Tensor(("v",), dtype="float64"),
    ) -> R.Tensor((5000,), dtype="float64"):
        cls = Module
        out1 = R.call_tir(cls.dense_loop, (val, vec), out_sinfo=R.Tensor((5000,), dtype="float64"))
        return out1

if __name__ == "__main__":
    ...
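    # Hypothetical input setup (elided as "..." in the original post): random
    # dense data with the shapes dense_loop expects.
    import numpy as np
    val_arg = tvm.nd.array(np.random.rand(25000000).astype("float64"))
    vec_arg = tvm.nd.array(np.random.rand(5000).astype("float64"))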
    target = tvm.target.Target("llvm -num-cores 1")
    mod = Module
    ex = relax.build(mod, target=target)
    vm = relax.VirtualMachine(ex, tvm.cpu())
    times = []
    for _ in range(1001):
        time1 = time.time_ns()
        out = vm["main"](vec_arg, val_arg)
        time2 = time.time_ns()
        times.append(time2 - time1)
    avg_time = sum(times) / len(times)
    print(f"{avg_time}")
The autoscheduled variant is created by replacing the line

    mod = Module

with

    mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=10000)(Module)
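
To see what the tuner actually produced, one can pretty-print the transformed module and compare its TIR against the unscheduled dense_loop (a sketch reusing the pipeline call above):

    tuned = relax.get_pipeline("static_shape_tuning", target=target, total_trials=10000)(Module)
    tuned.show()  # pretty-prints the tuned module, including the rewritten prim_func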
The program without any schedules averages 18830545 ns per call, while the autoscheduled program averages 28224987 ns. Why is the tuned version slower? The loop nest seems large enough to benefit from scheduling.
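
Note that the timing loop above also includes the very first call, which pays one-time setup costs. A minimal sketch using the VM's built-in time_evaluator (assuming the same vec_arg / val_arg as above) averages that out:

    timer = vm.time_evaluator("main", tvm.cpu(), number=10, repeat=10)
    print(timer(vec_arg, val_arg))  # prints timing statistics over the repeats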
I have a follow-up question: adding schedules manually, like

    sch = tvm.tir.Schedule(mod)
    block = sch.get_block("db0", func_name="dense_loop")
    j, i = sch.get_loops(block)
    sch.vectorize(i)
    mod = sch.mod

also results in a slow-down rather than a speedup. What could be happening here?
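
For reference, a common recipe is to split the 5000-wide inner loop before vectorizing, so the vector body has a small, hardware-friendly lane count. This is only a sketch under the assumption that mod is the module above; the split factor 8 is an illustrative choice:

    sch = tvm.tir.Schedule(mod)
    block = sch.get_block("db0", func_name="dense_loop")
    j, i = sch.get_loops(block)
    i_outer, i_inner = sch.split(i, factors=[None, 8])  # 5000 = 625 * 8
    sch.vectorize(i_inner)  # vectorize only the 8-wide inner loop
    mod = sch.mod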