Hi everyone.
I’m new to TVM and I’m trying to tune a simple ONNX model with auto_scheduler.TaskScheduler
on my CPU. The tuning process doesn’t finish even after several hours, and it doesn’t produce any log output either. The model has a very simple structure, so I assume tuning it shouldn’t require a lot of computation.
The code for the model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
import torch.onnx

net = Net()
net.load_state_dict(torch.load(PATH))  # PATH points to the saved state dict
net.train(False)  # switch to inference mode before exporting

batch_size = 4
x = torch.randn(batch_size, 3, 32, 32, requires_grad=True)
torch.onnx.export(
    net,                 # model being run
    x,                   # model input (or a tuple for multiple inputs)
    "testModel1.onnx",   # where to save the model (can be a file or file-like object)
    export_params=True,  # store the trained weights inside the model file
)
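For reference, the input name "input.1" used in the tuning script below comes from inspecting the exported graph. A minimal sanity check of the export (standard onnx APIs; the path is the file written above):

import onnx

# Load and validate the file written by torch.onnx.export above
onnx_model = onnx.load("testModel1.onnx")
onnx.checker.check_model(onnx_model)

# List the graph inputs to confirm the input name ("input.1" in my case)
for inp in onnx_model.graph.input:
    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)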
The code for the tuning:
import numpy as np
import onnx
import tvm
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor

if __name__ == '__main__':
    target = "llvm"
    network = "testModel1"
    layout = "NCHW"
    batch_size = 4
    input_name = "input.1"

    onnx_model = onnx.load("testModel1.onnx")  # the file exported above
    # shape_dict = {input_name: images.shape}
    shape_dict = {input_name: (batch_size, 3, 32, 32)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

    log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target)
    print(log_file)

    print("Extract tasks...")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    for idx, task in enumerate(tasks):
        print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    def run_tuning():
        print("Begin tuning...")
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1,  # change this to 20000 to achieve the best performance
            runner=auto_scheduler.LocalRunner(repeat=1, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option)

    run_tuning()
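    # Since the run above never prints anything: would a more talkative setup
    # like this be the right way to get progress output? (verbose and timeout
    # are documented TuningOptions / LocalBuilder / LocalRunner parameters;
    # the values below are guesses on my part.)
    verbose_tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=1,
        builder=auto_scheduler.LocalBuilder(timeout=15),  # give up on a hung candidate build after 15 s
        runner=auto_scheduler.LocalRunner(
            repeat=1, enable_cpu_cache_flush=True, timeout=10  # give up on a hung measurement after 10 s
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,  # print measurement results as they arrive
    )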
    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Create graph executor
    dev = tvm.device(target, 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    data_tvm = tvm.nd.array(np.random.uniform(size=(batch_size, 3, 32, 32)).astype("float32"))
    module.set_input(input_name, data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
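For completeness, this is how I plan to sanity-check the compiled module against PyTorch at the end of the script, assuming the net from the first snippet is available in the same session (get_output(0) is the standard graph_executor accessor; recent TVM exposes .numpy() on NDArray, older versions use .asnumpy()):

    # Compare the TVM result with PyTorch on the same random input
    module.run()
    tvm_out = module.get_output(0).numpy()
    with torch.no_grad():
        torch_ref = net(torch.from_numpy(data_tvm.numpy())).numpy()
    np.testing.assert_allclose(tvm_out, torch_ref, rtol=1e-4, atol=1e-4)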
My system:
- Processor: Intel(R) Core(TM) i5 CPU 750 @ 2.67 GHz
- Installed RAM: 16.0 GB
- System type: 64-bit operating system, x64-based processor