Here is my code for tuning vector_add via the AutoScheduler:
@auto_scheduler.register_workload
def vector_add(N):
    """Auto-scheduler workload: element-wise sum of two length-N vectors.

    Returns the (input, input, output) tensors describing the computation.
    """
    a = te.placeholder((N,), name="data1")
    b = te.placeholder((N,), name="data2")
    result = te.compute(a.shape, lambda idx: a[idx] + b[idx], name="vadd")
    return a, b, result
# Tune the vector_add workload for CUDA, then benchmark the best schedule.
target = tvm.target.Target("cuda")
N = 256
task = auto_scheduler.SearchTask(
    func=vector_add, args=(N,), target=target
)
log_file = "vector_add.json"

# min_repeat_ms forces each measurement to run long enough for stable timing.
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=1000,  # more trials -> better chance of a good schedule
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)

# Run auto-tuning (search).
task.tune(tune_option)
# Apply the best schedule found in the log.
sch, args = task.apply_best(log_file)
# Kill the measurement process.
del measure_ctx

func = tvm.build(sch, args, target)

# Prepare random inputs and an output buffer on the GPU.
data1 = np.random.uniform(size=(N,)).astype(np.float32)
data2 = np.random.uniform(size=(N,)).astype(np.float32)
dev = tvm.cuda()
tdata1 = tvm.nd.array(data1, device=dev)
tdata2 = tvm.nd.array(data2, device=dev)
tout = tvm.nd.empty((N,), device=dev)
func(tdata1, tdata2, tout)

# Sanity-check the GPU result against numpy before timing.
np.testing.assert_allclose(tout.numpy(), data1 + data2, rtol=1e-5)

# BUG FIX: `evaluator` was used below but never defined, so this script
# raised NameError. Build it with time_evaluator before timing the kernel.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=300)
print(
    "Execution time of this operator: %.3f ms"
    % (np.median(evaluator(tdata1, tdata2, tout).results) * 1000)
)
print(task.print_best(log_file, print_mode="cuda"))
Finally, I get code from auto-scheduling, but it doesn't appear to apply any optimization (likely because N=256 is too small for interesting schedules to pay off):
extern "C" __global__ void __launch_bounds__(8) default_function_kernel0(float* __restrict__ vadd, float* __restrict__ data1, float* __restrict__ data2) {
vadd[((((int)blockIdx.x) * 8) + ((int)threadIdx.x))] = (data1[((((int)blockIdx.x) * 8) + ((int)threadIdx.x))] + data2[((((int)blockIdx.x) * 8) + ((int)threadIdx.x))]);
}