As the following code shows, we have three [1x1 Convolution] layers and one [3x3 Convolution] layer connected sequentially. Given batch size 32, input channels 64, output channels 128, and a 32x32 feature map, the auto-scheduler tuner spends ~500 seconds on my CPU to find the first schedule, and when the workload size grows even larger the time cost becomes unbearable. I'm wondering whether a timeout parameter could be added, or whether there is a bug in the sketch-generation stage.
import tvm
from tvm import auto_scheduler, te, topi
# 3x [1x1 Convolution], 1x [3x3 Convolution]
@auto_scheduler.register_workload
def buggy_func(n: int, ic: int, oc: int, k: int, h: int, w: int):
    """Build the compute DAG: three 1x1 convolutions followed by one kxk conv.

    Args:
        n: batch size.
        ic: channel count of the input and of the three pointwise convs.
        oc: output channel count of the final convolution.
        k: kernel size of the final convolution.
        h, w: spatial size of the input feature map.

    Returns:
        Tensor list ``[input, w1, w2, w3, w4, output]`` for auto-scheduling.
    """
    data = te.placeholder((n, ic, h, w))

    # Three pointwise (1x1) convolutions: stride 1, no padding, no dilation.
    weight_a = te.placeholder((ic, ic, 1, 1))
    feat = topi.nn.conv2d(data, weight_a, (1, 1), (0, 0), (1, 1))
    weight_b = te.placeholder((ic, ic, 1, 1))
    feat = topi.nn.conv2d(feat, weight_b, (1, 1), (0, 0), (1, 1))
    weight_c = te.placeholder((ic, ic, 1, 1))
    feat = topi.nn.conv2d(feat, weight_c, (1, 1), (0, 0), (1, 1))

    # Final kxk convolution producing oc channels (unpadded, stride 1).
    weight_d = te.placeholder((oc, ic, k, k))
    out = topi.nn.conv2d(feat, weight_d, (1, 1), (0, 0), (1, 1))

    return [data, weight_a, weight_b, weight_c, weight_d, out]
if __name__ == '__main__':
    # Tune the workload for a CUDA device, compiling host-side code with LLVM.
    target = tvm.target.Target(target='cuda', host='llvm')
    task = auto_scheduler.SearchTask(
        func=buggy_func,
        args=(32, 64, 128, 3, 32, 32),  # n, ic, oc, k, h, w
        target=target,
    )
    options = auto_scheduler.TuningOptions(
        num_measure_trials=256,
        num_measures_per_round=64,
        verbose=2,
    )
    task.tune(options)