When testing DenseNet-121 using the built-in function in tvm, I saw unrealistic kernel shapes, as shown in the example below (the channel sizes are wrong!).
I found some problems with the built-in implementation of densenet
This is the AutoTVM task list produced when creating densenet-121. You can see that the input and output channel sizes are wrong.
import tvm
import tvm.relay.testing
from tvm import autotvm, relay
def get_network(name, batch_size, dtype="float32"):
    """Get the symbol definition and random weights of a network.

    Parameters
    ----------
    name : str
        Network name; must contain "densenet" and encode the layer count
        after a dash, e.g. "densenet-121".
    batch_size : int
        Batch size baked into the returned input shape.
    dtype : str, optional
        Data type passed to the workload builder (default "float32";
        previously this was read from a module-level global).

    Returns
    -------
    tuple
        (mod, params, input_shape, output_shape) — the relay module,
        its parameters, and the NCHW input / logits output shapes.

    Raises
    ------
    ValueError
        If `name` does not contain "densenet".
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)
    if "densenet" in name:
        # "densenet-121" -> 121
        n_layer = int(name.split('-')[1])
        # Use the fully-qualified path: only `tvm.relay.testing` is
        # guaranteed to be imported, the bare name `relay` was not.
        mod, params = tvm.relay.testing.densenet.get_workload(
            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
        )
    else:
        raise ValueError("Unsupported network: " + name)
    return mod, params, input_shape, output_shape
# Tune for CUDA; the global `dtype` must be assigned before get_network
# is called, since the workload builder uses it.
target = tvm.target.cuda()
dtype = "float32"
mod, params, input_shape, out_shape = get_network('densenet-121', batch_size=1)
# Extract every nn.conv2d in the module as an AutoTVM tuning task.
# NOTE(review): requires `autotvm` and `relay` to be in scope, e.g.
# `from tvm import autotvm, relay` at the top of the file.
tasks = autotvm.task.extract_from_program(
    mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
)
# Print in reverse extraction order (matches the task dump shown below).
for task in reversed(tasks):
    print(task)
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 7, 7), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 7, 7), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (32, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (32, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (512, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (512, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (32, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (32, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (256, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (256, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (128, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (128, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (32, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (32, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'))
Problem1
The arguments were passed in the wrong order.
The positions of growth_rate and bn_size should be swapped.
Solution1
# Pass bn_size before growth_rate so the positional arguments match
# _make_dense_block's signature (data, num_layers, bn_size, growth_rate, index).
for i, num_layers in enumerate(block_config):
    layer_out = _make_dense_block(layer_out, num_layers, bn_size, growth_rate, i)
    # Each dense layer adds growth_rate channels to the running feature count.
    num_features = num_features + num_layers * growth_rate
Problem2
The mxnet implementation describes bn_size as a "multiplicative factor for number of bottle neck layers", so it is not the batch size.
Therefore the get_workload
function must be changed.
Adding a separate variable (bn_size) may be a simple solution.
Solution2
# bn_size multiplies growth_rate to size the bottleneck 1x1 convs
# (mxnet convention) — it is unrelated to batch size.
bn_size = 4
net = _make_dense_net(
    num_init_features, growth_rate, block_config, data_shape, dtype, bn_size, classes
)
Problem 3
The concatenate operation has not been implemented. :(
Solution3
def _make_dense_block(data, num_layers, bn_size, growth_rate, index):
    """Build one dense block: a chain of `num_layers` dense layers whose
    individual outputs are concatenated along the channel axis."""
    collected = []
    prev_out = data
    for layer_idx in range(num_layers):
        # Each dense layer consumes the previous layer's output.
        prev_out = _make_dense_layer(prev_out, growth_rate, bn_size, "%s_%s" % (index, layer_idx))
        collected.append(prev_out)
    # Join every layer's feature maps on axis 1 (channels, NCHW layout).
    return relay.concatenate(collected, 1)
Environment
TVM: commit 2d1847c9d3ce70daed518d8b3d9dbf750ae34672 CUDA version: 10.2 System: Ubuntu 20.04 GCC 7.5 Build options: -DUSE_LLVM=ON -DUSE_CUDA=ON
This is the autotvm task output when applying the 3 solutions.
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (128, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (128, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 192, 56, 56), 'float32'), ('TENSOR', (128, 192, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 192, 56, 56), 'float32'), ('TENSOR', (128, 192, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (128, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (128, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 384, 28, 28), 'float32'), ('TENSOR', (256, 384, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 384, 28, 28), 'float32'), ('TENSOR', (256, 384, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (128, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (128, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 768, 14, 14), 'float32'), ('TENSOR', (512, 768, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 768, 14, 14), 'float32'), ('TENSOR', (512, 768, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (128, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (128, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))