Hi everyone.
I’m new to TVM and I’m trying to tune a simple ONNX model with auto_scheduler.TaskScheduler
on my CPU. The tuning process doesn’t finish even after several hours, and it doesn’t produce any log output either. The model has a very simple structure, so I assume tuning it shouldn’t require a lot of computation.
The code for the model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
import torch.onnx

net = Net()
net.load_state_dict(torch.load(PATH))  # PATH points to the saved state dict
net.train(False)  # switch to inference mode before exporting

batch_size = 4
x = torch.randn(batch_size, 3, 32, 32, requires_grad=True)
torch.onnx.export(
    net,                 # model being run
    x,                   # model input (or a tuple for multiple inputs)
    "testModel1.onnx",   # where to save the model (can be a file or file-like object)
    export_params=True,  # store the trained weights inside the model file
)
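For reference, the input name "input.1" used in the tuning script below comes from inspecting the exported graph. A minimal sanity check of the export (standard onnx APIs; the path is the file written above):

import onnx

# Load and validate the file written by torch.onnx.export above
onnx_model = onnx.load("testModel1.onnx")
onnx.checker.check_model(onnx_model)

# List the graph inputs to confirm the input name ("input.1" in my case)
for inp in onnx_model.graph.input:
    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)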
The code for the tuning:
import numpy as np
import onnx
import tvm
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor

if __name__ == '__main__':
    target = "llvm"
    network = "testModel1"
    layout = "NCHW"
    batch_size = 4
    input_name = "input.1"

    onnx_model = onnx.load("testModel1.onnx")  # the file exported above
    # shape_dict = {input_name: images.shape}
    shape_dict = {input_name: (batch_size, 3, 32, 32)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

    log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target)
    print(log_file)

    print("Extract tasks...")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    for idx, task in enumerate(tasks):
        print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    def run_tuning():
        print("Begin tuning...")
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1,  # change this to 20000 to achieve the best performance
            runner=auto_scheduler.LocalRunner(repeat=1, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option)

    run_tuning()
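    # Since the run above never prints anything: would a more talkative setup
    # like this be the right way to get progress output? (verbose and timeout
    # are documented TuningOptions / LocalBuilder / LocalRunner parameters;
    # the values below are guesses on my part.)
    verbose_tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=1,
        builder=auto_scheduler.LocalBuilder(timeout=15),  # give up on a hung candidate build after 15 s
        runner=auto_scheduler.LocalRunner(
            repeat=1, enable_cpu_cache_flush=True, timeout=10  # give up on a hung measurement after 10 s
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,  # print measurement results as they arrive
    )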
    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Create graph executor
    dev = tvm.device(target, 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    data_tvm = tvm.nd.array(np.random.uniform(size=(batch_size, 3, 32, 32)).astype("float32"))
    module.set_input(input_name, data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
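For completeness, this is how I plan to sanity-check the compiled module against PyTorch at the end of the script, assuming the net from the first snippet is available in the same session (get_output(0) is the standard graph_executor accessor; recent TVM exposes .numpy() on NDArray, older versions use .asnumpy()):

    # Compare the TVM result with PyTorch on the same random input
    module.run()
    tvm_out = module.get_output(0).numpy()
    with torch.no_grad():
        torch_ref = net(torch.from_numpy(data_tvm.numpy())).numpy()
    np.testing.assert_allclose(tvm_out, torch_ref, rtol=1e-4, atol=1e-4)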
My system:
- Processor: Intel(R) Core(TM) i5 CPU 750 @ 2.67 GHz
- Installed RAM: 16.0 GB
- System type: 64-bit operating system, x64-based processor