I use the script to tune the performance of vgg-16 network on V100.
After the tuning process has finished, I evaluated the average inference time. When batch size is 128, the inference time is 123ms. But if I use cudnn as the inference backend, the inference time is 88ms. Is this reasonable?
and I profiled the network layer by layer, some layer cost 6ms-24ms. and the costed layer uses winograd algorithm. so tvm has implement winograd kernels for int8 inference?
the following is my tuning result
{"input": ["cuda -model=unknown", "dense_large_batch.cuda", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [1000, 4096], "float32"], null, "float32"], {}], "config": {"index": 771496, "cod
e_hash": null, "entity": [["tile_x", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 2, 4, 1]], ["tile_k", "sp", [-1, 8, 2]]]}, "result": [[0.000854268221590909], 0, 1.6328115463256836, 1584199
993.2121232], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "dense_small_batch.cuda", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [1000, 4096], "float32"], null, "float32"], {}], "config": {"index": 5, "code_has
h": null, "entity": [["tile_k", "sp", [-1, 32]]]}, "result": [[0.0023226218153846153], 0, 1.3888931274414062, 1584201048.038974], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "dense_large_batch.cuda", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [4096, 4096], "float32"], null, "float32"], {}], "config": {"index": 2023458, "co
de_hash": null, "entity": [["tile_x", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 4, 4, 1]], ["tile_k", "sp", [-1, 2, 8]]]}, "result": [[0.0010716840142857142], 0, 1.9444470405578613, 15842
02598.4111032], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "dense_small_batch.cuda", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [4096, 4096], "float32"], null, "float32"], {}], "config": {"index": 9, "code_has
h": null, "entity": [["tile_k", "sp", [-1, 512]]]}, "result": [[0.00977129945], 0, 1.507237195968628, 1584203706.1801147], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "dense_large_batch.cuda", [["TENSOR", [128, 25088], "float32"], ["TENSOR", [4096, 25088], "float32"], null, "float32"], {}], "config": {"index": 7647258, "
code_hash": null, "entity": [["tile_x", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 4, 4, 1]], ["tile_k", "sp", [-1, 1, 16]]]}, "result": [[0.0065043505625], 0, 2.2584891319274902, 15842047
03.5166712], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "dense_small_batch.cuda", [["TENSOR", [128, 25088], "float32"], ["TENSOR", [4096, 25088], "float32"], null, "float32"], {}], "config": {"index": 22, "code_
hash": null, "entity": [["tile_k", "sp", [-1, 896]]]}, "result": [[0.06021745125], 0, 4.591594219207764, 1584206158.1411397], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 512, 14, 14], "int8"], ["TENSOR", [512, 512, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "c
onfig": {"index": 3764134, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 2, 16, 2]], ["tile_rc", "sp", [-1, 16]], ["a
uto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 0]]}, "result": [[0.00474776134375], 0, 1.705704927444458, 1584209204.3883464], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 512, 14, 14], "int8"], ["TENSOR", [512, 512, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}
], "config": {"index": 13276589889, "code_hash": null, "entity": [["tile_n", "sp", [-1, 2, 2, 1]], ["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1,
14, 1]], ["fuse_yx", "ot", 1], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["reorder_inner", "re", [1, 0, 2]], ["AA_double_buffer", "ot", 1], ["WW_d
ouble_buffer", "ot", 0], ["auto_unroll_max_step", "ot", 1500]]}, "result": [[0.002698205803571429], 0, 1.6985657215118408, 1584212343.3395205], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 512, 28, 28], "int8"], ["TENSOR", [512, 512, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "c
onfig": {"index": 11647419, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 16]], [
"auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[0.0186427012], 0, 2.7832956314086914, 1584216768.1764288], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 512, 28, 28], "int8"], ["TENSOR", [512, 512, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}], "config": {"index": 62328976330, "code_hash": null, "entity": [["tile_n", "sp", [-1, 4, 2, 1]], ["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 4, 1]], ["fuse_yx", "ot", 1], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["reorder_inner", "re", [0, 1, 2]], ["AA_double_buffer", "ot", 1], ["WW_double_buffer", "ot", 1], ["auto_unroll_max_step", "ot", 512]]}, "result": [[0.0094804296], 0, 2.0820913314819336, 1584220699.3217854], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 256, 28, 28], "int8"], ["TENSOR", [512, 256, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "config": {"index": 10679419, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[0.0101890489], 0, 2.056152820587158, 1584223572.0578058], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 256, 28, 28], "int8"], ["TENSOR", [512, 256, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}], "config": {"index": 87419540302, "code_hash": null, "entity": [["tile_n", "sp", [-1, 2, 8, 1]], ["tile_f", "sp", [-1, 1, 4, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 4, 1]], ["fuse_yx", "ot", 0], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["reorder_inner", "re", [0, 2, 1]], ["AA_double_buffer", "ot", 1], ["WW_double_buffer", "ot", 1], ["auto_unroll_max_step", "ot", 1500]]}, "result": [[0.00475415871875], 0, 2.022735834121704, 1584230640.6202025], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 256, 56, 56], "int8"], ["TENSOR", [256, 256, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "c
onfig": {"index": 7646572, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["a
uto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}, "result": [[0.0130612322], 0, 3.975372552871704, 1584235044.9110425], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 256, 56, 56], "int8"], ["TENSOR", [256, 256, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}
], "config": {"index": 237551421360, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_f", "sp", [-1, 1, 16, 1]], ["tile_y", "sp", [-1, 2, 1, 4]], ["tile_x", "sp", [-1,
1, 8, 1]], ["fuse_yx", "ot", 0], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["reorder_inner", "re", [2, 1, 0]], ["AA_double_buffer", "ot", 0], ["WW_
double_buffer", "ot", 1], ["auto_unroll_max_step", "ot", 1500]]}, "result": [[0.00923989125], 0, 1.872157096862793, 1584241014.4781325], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 128, 56, 56], "int8"], ["TENSOR", [256, 128, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "c
onfig": {"index": 15632531, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 8, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["a
uto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[0.00784787015], 0, 3.945775032043457, 1584243125.3562002], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 128, 112, 112], "int8"], ["TENSOR", [128, 128, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "config": {"index": 18803361, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 8, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[0.0166608162], 0, 4.897033214569092, 1584250572.05043], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 128, 112, 112], "int8"], ["TENSOR", [128, 128, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}], "config": {"index": 390429495240, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 1]], ["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 4, 1, 2]], ["tile_x", "sp", [-1, 1, 16, 1]], ["fuse_yx", "ot", 0], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["reorder_inner", "re", [0, 2, 1]], ["AA_double_buffer", "ot", 0], ["WW_double_buffer", "ot", 1], ["auto_unroll_max_step", "ot", 1500]]}, "result": [[0.009707265900000001], 0, 2.054020404815674, 1584254153.2729812], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 64, 112, 112], "int8"], ["TENSOR", [128, 64, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "config": {"index": 2641779, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 8]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 64]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}, "result": [[0.0108518186], 0, 3.3802666664123535, 1584259767.1725311], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_NCHWc_int8.cuda", [["TENSOR", [128, 64, 112, 112], "int8"], ["TENSOR", [128, 64, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW", "int32"], {}], "config": {"index": 227896873596, "code_hash": null, "entity": [["tile_n", "sp", [-1, 1, 1, 2]], ["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 16, 1]], ["fuse_yx", "ot", 0], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["reorder_inner", "re", [0, 2, 1]], ["AA_double_buffer", "ot", 1], ["WW_double_buffer", "ot", 1], ["auto_unroll_max_step", "ot", 512]]}, "result": [[0.00540158721875], 0, 2.051076650619507, 1584263114.3485994], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 64, 224, 224], "int8"], ["TENSOR", [64, 64, 3, 3], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "int32"], {}], "config": {"index": 14594989, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 8, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}, "result": [[0.02666643035], 0, 4.514398574829102, 1584269255.3867397], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw_winograd.cuda", [["TENSOR", [128, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 3785449, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 16, 32, 1]], ["tile_rc", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}, "result": [[0.01378827145], 0, 3.3212788105010986, 1584277466.8299236], "version": 0.2, "tvm_version": "0.7.dev1"}
{"input": ["cuda -model=unknown", "conv2d_nchw.cuda", [["TENSOR", [128, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 3, 3], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"], {}], "config": {"index": 199135959, "code_hash": null, "entity": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 2, 2, 2]], ["tile_x", "sp", [-1, 1, 32, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}, "result": [[0.0024414313263157895], 0, 2.1771702766418457, 1584280846.783762], "version": 0.2, "tvm_version": "0.7.dev1"}
sorry, I have made a mistake. the performance of cudnn is not correct. Inference with cudnn cost 88ms on V100. tvm is a little slower than cudnn. but this is reasonable.