Testing/densenet implementation has problems

When testing DenseNet-121 using the built-in function in TVM, I saw unrealistic kernels in the example below (the channel sizes are wrong!).

I found some problems with the built-in implementation of DenseNet.

Below is the AutoTVM task list produced when creating DenseNet-121. You can see that the input and output channels are wrong.

import tvm
import tvm.relay.testing
from tvm import autotvm, relay

def get_network(name, batch_size, dtype="float32"):
    """Return the Relay module, parameters, and I/O shapes of a named network.

    Parameters
    ----------
    name : str
        Network name, e.g. "densenet-121"; the numeric suffix selects the depth.
    batch_size : int
        Batch dimension of the input tensor.
    dtype : str, optional
        Data type of the workload (default "float32"); previously this was
        read from a global, which made the function order-dependent.

    Returns
    -------
    mod, params, input_shape, output_shape
        The Relay module, its random parameters, and the NCHW input shape /
        classifier output shape.

    Raises
    ------
    ValueError
        If `name` does not describe a supported network.
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)
    if "densenet" in name:
        # "densenet-121" -> 121; densenet_size selects the block configuration.
        n_layer = int(name.split("-")[1])
        # Fully qualified path: the snippet only does `import tvm.relay.testing`,
        # so a bare `relay` here would raise NameError.
        mod, params = tvm.relay.testing.densenet.get_workload(
            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
        )
    else:
        raise ValueError("Unsupported network: " + name)
    return mod, params, input_shape, output_shape
# NOTE(review): `autotvm` (below) and `relay` (inside extract_from_program's
# ops argument) are used but never imported by this snippet — presumably
# `from tvm import autotvm, relay` is needed; confirm against the full script.
target = tvm.target.cuda()

# Read as a global by get_network() above; must be assigned before the call.
dtype = "float32"
mod, params, input_shape, out_shape = get_network('densenet-121', batch_size=1)
# Extract every nn.conv2d tuning task from the compiled Relay program.
tasks = autotvm.task.extract_from_program(
    mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
)
# Iterate in reverse — presumably so tasks print in first-to-last layer
# order rather than extraction order; TODO confirm.
for task in reversed(tasks):
    print(task)
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 7, 7), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 7, 7), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (32, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (32, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (512, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (512, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 14, 14), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (32, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (32, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (256, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (256, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 28, 28), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (128, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (128, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 1, 56, 56), 'float32'), ('TENSOR', (32, 1, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (1, 32, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (32, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (32, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'))

Problem1

The arguments were passed in the wrong order.

The order of growth_rate and bn_size should be swapped.

Solution1

    for i, num_layers in enumerate(block_config):
        layer_out = _make_dense_block(layer_out, num_layers, bn_size,growth_rate,  i)
        num_features = num_features + num_layers * growth_rate

Problem2

The MXNet implementation describes bn_size as a “multiplicative factor for the number of bottleneck layers”, so it is not the batch size. The get_workload function must therefore be changed; adding an extra variable (bn_size) may be the simplest solution.

Solution2

 bn_size = 4
    net = _make_dense_net(
        num_init_features, growth_rate, block_config, data_shape, dtype, bn_size, classes
    )

Problem 3

The concatenate operation has not been implemented. :(

Solution3

def _make_dense_block(data, num_layers, bn_size, growth_rate, index):
    """Makes a block of dense layers of the specified size.

    Parameters
    ----------
    data : relay.Expr
        Input feature map of the block.
    num_layers : int
        Number of dense layers in the block.
    bn_size : int
        Multiplicative factor for the bottleneck (1x1 conv) channels.
    growth_rate : int
        Number of channels each dense layer adds.
    index : int
        Block index, used only to name the layers.

    Returns
    -------
    relay.Expr
        The block input concatenated (on the channel axis) with every
        layer's new features.
    """
    layer_out = data
    for i in range(num_layers):
        # Defining property of DenseNet: each layer consumes the concatenation
        # of the block input and ALL previous layers' outputs, so the input
        # channel count grows by `growth_rate` per layer. Feeding only the
        # previous layer's output forward (and concatenating once at the end,
        # without `data`) keeps the channel count frozen at `growth_rate`,
        # which does not match DenseNet-121.
        new_features = _make_dense_layer(layer_out, growth_rate, bn_size, "%s_%s" % (index, i))
        layer_out = relay.concatenate([layer_out, new_features], 1)
    return layer_out

Environment

TVM: commit 2d1847c9d3ce70daed518d8b3d9dbf750ae34672 CUDA version: 10.2 System: Ubuntu 20.04 GCC 7.5 Build options: -DUSE_LLVM=ON -DUSE_CUDA=ON

This is the AutoTVM task output after applying the three solutions.

Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 3, 224, 224), 'float32'), ('TENSOR', (64, 3, 7, 7), 'float32'), (2, 2), (3, 3, 3, 3), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (128, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 64, 56, 56), 'float32'), ('TENSOR', (128, 64, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 56, 56), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 56, 56), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 192, 56, 56), 'float32'), ('TENSOR', (128, 192, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 192, 56, 56), 'float32'), ('TENSOR', (128, 192, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (128, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (128, 128, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 28, 28), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 28, 28), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 384, 28, 28), 'float32'), ('TENSOR', (256, 384, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 384, 28, 28), 'float32'), ('TENSOR', (256, 384, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (128, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 256, 14, 14), 'float32'), ('TENSOR', (128, 256, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 14, 14), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 14, 14), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 768, 14, 14), 'float32'), ('TENSOR', (512, 768, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 768, 14, 14), 'float32'), ('TENSOR', (512, 768, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (128, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 512, 7, 7), 'float32'), ('TENSOR', (128, 512, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw_winograd.cuda, args=(('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw_winograd.cuda', ('TENSOR', (1, 128, 7, 7), 'float32'), ('TENSOR', (32, 128, 3, 3), 'float32'), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'))
Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 7, 7), 'float32'), ('TENSOR', (128, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))