Output tensor layout is NCHWc instead of NCHW when AlterOpLayout is enabled

When I tested a simple conv2d + sigmoid network with AlterOpLayout enabled, the output tensor layout turned out to be NCHWc instead of NCHW (the layout of the input tensor). Is this the expected behavior? If not, how can it be fixed?

This is my code:

from __future__ import absolute_import as _abs
import numpy as np
from nnvm import symbol as sym
from nnvm.top import registry as reg
from nnvm.testing import utils
import nnvm.compiler
import nnvm.graph as graph

import tvm
from tvm.contrib import graph_runtime

def test_alter_op_layout():
    input_name = "data"
    input_shape = (1, 3, 224, 224)
    data = sym.Variable(input_name, shape=input_shape)
    conv = sym.conv2d(data, name="conv", channels=3,
                     kernel_size=(3,3), padding=(1,1),
                     use_bias=True, layout="NCHW")
    sigmoid = sym.sigmoid(conv)
    batch_size = 1
    net, params = utils.create_workload(sigmoid, batch_size, (3, 224, 224))
    
    opt_level = 3
    target = 'llvm -mcpu=core-avx2'
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, target, shape={input_name: input_shape}, params=params)
        print(graph.symbol().debug_str())
    
    ctx = tvm.context(target, 0)
    dtype = 'float32'
    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    m.set_input(input_name, data_tvm)
    m.set_input(**params)
    # execute
    m.run()
    # get outputs
    # output_shape = input_shape # error when output_shape is 4D
    output_shape = (1, 1, 224, 224, 3)
    tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
    print(tvm_output.shape)
        
if __name__ == "__main__":
    test_alter_op_layout()

This is the error I see when I set output_shape = input_shape:

---------------------------------------------------------------------------
TVMError                                  Traceback (most recent call last)
<ipython-input-8-5a505249d5ef> in <module>()
     44 
     45 if __name__ == "__main__":
---> 46     test_alter_op_layout()
     47 

<ipython-input-8-5a505249d5ef> in test_alter_op_layout()
     40 #     output_shape = (1, 1, 224, 224, 3)
     41     output_shape = input_shape
---> 42     tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
     43     print(tvm_output.shape)
     44 

~/github/tvm/python/tvm/contrib/graph_runtime.py in get_output(self, index, out)
    176         """
    177         if out:
--> 178             self._get_output(index, out)
    179             return out
    180 

~/github/tvm/python/tvm/_ffi/_cython/function.pxi in tvm._ffi._cy3.core.FunctionBase.__call__()

~/github/tvm/python/tvm/_ffi/_cython/function.pxi in tvm._ffi._cy3.core.FuncCall()

~/github/tvm/python/tvm/_ffi/_cython/function.pxi in tvm._ffi._cy3.core.FuncCall3()

~/github/tvm/python/tvm/_ffi/_cython/base.pxi in tvm._ffi._cy3.core.CALL()

TVMError: [22:50:34] /Users/hlu/github/tvm/src/runtime/graph/graph_runtime.cc:151: Check failed: data->ndim == data_out->ndim (5 vs. 4) 

Stack trace returned 10 entries:
[bt] (0) 0   libtvm.dylib                        0x000000011a11c070 dmlc::StackTrace() + 288
[bt] (1) 1   libtvm.dylib                        0x000000011a11be0f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2   libtvm.dylib                        0x000000011a718e5f tvm::runtime::GraphRuntime::CopyOutputTo(int, DLTensor*) + 527
[bt] (3) 3   libtvm.dylib                        0x000000011a718b81 std::__1::__function::__func<tvm::runtime::GraphRuntime::GetFunction(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::shared_ptr<tvm::runtime::ModuleNode> const&)::$_5, std::__1::allocator<tvm::runtime::GraphRuntime::GetFunction(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::shared_ptr<tvm::runtime::ModuleNode> const&)::$_5>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 161
[bt] (4) 4   libtvm.dylib                        0x000000011a6e46a6 TVMFuncCall + 70
[bt] (5) 5   core.cpython-36m-darwin.so          0x000000011d92298d __pyx_f_3tvm_4_ffi_4_cy3_4core_FuncCall(void*, _object*, TVMValue*, int*) + 477
[bt] (6) 6   core.cpython-36m-darwin.so          0x000000011d928387 __pyx_pw_3tvm_4_ffi_4_cy3_4core_12FunctionBase_5__call__(_object*, _object*, _object*) + 55
[bt] (7) 7   Python                              0x000000010cae2e8c _PyObject_FastCallDict + 143
[bt] (8) 8   Python                              0x000000010cb7f0fa call_function + 441
[bt] (9) 9   Python                              0x000000010cb77ff7 _PyEval_EvalFrameDefault + 4811

Yes, this is expected. If you want NCHW output, you can add a layout transform symbol manually (example). Alternatively, if you append softmax or other ops that require NCHW input at the end, the layout transform will be inserted automatically and you will get NCHW output.
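
For reference, a minimal sketch of the second option, reusing the conv/sigmoid symbols and batch_size from the script above (softmax is used here only to force an NCHW consumer at the end of the graph, so it does change the numerical output):

    sigmoid = sym.sigmoid(conv)
    # softmax requires NCHW input, so the transform back to NCHW is inserted automatically
    out = sym.softmax(sigmoid)
    net, params = utils.create_workload(out, batch_size, (3, 224, 224))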

Thanks for the quick reply. I added two layout transforms after the sigmoid layer like this:

    data = sym.Variable(input_name, shape=input_shape)
    conv = sym.conv2d(data, name="conv", channels=3,
                     kernel_size=(3,3), padding=(1,1),
                     use_bias=True, layout="NCHW")
    sigmoid = sym.sigmoid(conv)
    layout_transform = sym.__layout_transform__(data=sigmoid, src_layout="NCHW", dst_layout="NCHW3c")
    out = sym.__layout_transform__(data=layout_transform, src_layout="NCHW3c", dst_layout="NCHW")

After compiling with TVM, the graph looked like this:

Symbol Outputs:
	output[0]=__layout_transform__2(0)
Variable:data
--------------------
Op:tvm_op, Name=data_NCHW3c
Inputs:
	arg[0]=data(0) version=0
Attrs:
	flatten_data=0
	func_name=fuse___layout_transform___2
	num_inputs=1
	num_outputs=1
Variable:conv_weight_OIHW3i3o
Variable:conv_bias_C3c
--------------------
Op:tvm_op, Name=sigmoid8
Inputs:
	arg[0]=data_NCHW3c(0)
	arg[1]=conv_weight_OIHW3i3o(0) version=0
	arg[2]=conv_bias_C3c(0) version=0
Attrs:
	flatten_data=0
	func_name=fuse__contrib_conv2d_NCHWc_sigmoid_1
	num_inputs=3
	num_outputs=1
--------------------
Op:tvm_op, Name=__layout_transform__2
Inputs:
	arg[0]=sigmoid8(0)
Attrs:
	flatten_data=0
	func_name=fuse___layout_transform_____layout_transform_____layout_transform__
	num_inputs=1
	num_outputs=1

In the last op, is the redundant transform eliminated in the fusion step? That is, does it do NCHWc -> NCHW -> NCHWc -> NCHW, or just a single NCHWc -> NCHW? How can I examine the generated code?

You only need one layout transform. This should work:

out = sym.__layout_transform__(data=sigmoid, src_layout="NCHW3c", dst_layout="NCHW")

You can see the pseudo-code-like dump of the lowered functions by setting logging.basicConfig(level=logging.DEBUG).
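
For example, a minimal sketch (assuming the net, target, input_name, and input_shape variables from the first script):

import logging
logging.basicConfig(level=logging.DEBUG)   # enable before calling nnvm.compiler.build

with nnvm.compiler.build_config(opt_level=3):
    graph, lib, params = nnvm.compiler.build(
        net, target, shape={input_name: input_shape}, params=params)
# the lowered functions are printed as "DEBUG:root:lower function ..." entries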

I tried adding just the single layout transform first, but it didn't work: shape inference in create_workload fails because at that point the graph is still NCHW (AlterOpLayout has not run yet), so src_layout="NCHW3c" does not agree with the 4-D shape. Here is the error:

---------------------------------------------------------------------------
NNVMError                                 Traceback (most recent call last)
<ipython-input-1-7625b5290f8c> in <module>()
     47 
     48 if __name__ == "__main__":
---> 49     test_alter_op_layout()
     50 

<ipython-input-1-7625b5290f8c> in test_alter_op_layout()
     22 
     23     batch_size = 1
---> 24     net, params = utils.create_workload(out, batch_size, (3, 224, 224))
     25 
     26     opt_level = 3

~/github/tvm/nnvm/python/nnvm/testing/utils.py in create_workload(net, batch_size, image_shape, dtype, initializer, seed)
     45     params = {}
     46     g = graph.create(net)
---> 47     input_shapes, _ = graph_util.infer_shape(g, data=data_shape)
     48     shape_dict = dict(zip(g.index.input_names, input_shapes))
     49     np.random.seed(seed)

~/github/tvm/nnvm/python/nnvm/compiler/graph_util.py in infer_shape(graph, **shape)
     29     """
     30     graph = graph_attr.set_shape_inputs(graph, shape)
---> 31     graph = graph.apply("InferShape")
     32     shape = graph.json_attr("shape")
     33     index = graph.index

~/github/tvm/nnvm/python/nnvm/graph.py in apply(self, passes)
    232         ghandle = GraphHandle()
    233         npass = nn_uint(len(passes))
--> 234         check_call(_LIB.NNGraphApplyPasses(self.handle, npass, cpass, ctypes.byref(ghandle)))
    235         return Graph(ghandle)
    236 

~/github/tvm/nnvm/python/nnvm/_base.py in check_call(ret)
     73     """
     74     if ret != 0:
---> 75         raise NNVMError(py_str(_LIB.NNGetLastError()))
     76 
     77 def c_str(string):

NNVMError: [15:24:21] /Users/hlu/github/tvm/nnvm/src/top/image/../nn/nn_common.h:68: Check failed: src_factor == src[src_minor_pos] (3 vs. 0) src shape [1,3,224,224] does not agree with layout NCHW3c

Stack trace returned 10 entries:
[bt] (0) 0   libnnvm_compiler.dylib              0x00000001157c1af0 dmlc::StackTrace() + 288
[bt] (1) 1   libnnvm_compiler.dylib              0x00000001157c188f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2   libnnvm_compiler.dylib              0x0000000115890c80 nnvm::top::ConvertLayout(nnvm::TShape, nnvm::Layout const&, nnvm::Layout const&) + 2400
[bt] (3) 3   libnnvm_compiler.dylib              0x00000001158d9556 nnvm::top::LayoutTransformInferShape(nnvm::NodeAttrs const&, std::__1::vector<nnvm::TShape, std::__1::allocator<nnvm::TShape> >*, std::__1::vector<nnvm::TShape, std::__1::allocator<nnvm::TShape> >*) + 854
[bt] (4) 4   libnnvm_compiler.dylib              0x000000011585c7b4 nnvm::Graph nnvm::pass::(anonymous namespace)::InferAttr<nnvm::TShape, nnvm::pass::(anonymous namespace)::$_0::operator()(nnvm::Graph) const::'lambda'(nnvm::TShape const&), std::nullptr_t>(nnvm::Graph&&, nnvm::TShape, char const*, char const*, char const*, char const*, char const*, nnvm::pass::(anonymous namespace)::$_0::operator()(nnvm::Graph) const::'lambda'(nnvm::TShape const&), std::nullptr_t)::'lambda'(unsigned int, bool)::operator()(unsigned int, bool) const + 3156
[bt] (5) 5   libnnvm_compiler.dylib              0x000000011585a8fa std::__1::__function::__func<nnvm::pass::(anonymous namespace)::$_0, std::__1::allocator<nnvm::pass::(anonymous namespace)::$_0>, nnvm::Graph (nnvm::Graph)>::operator()(nnvm::Graph&&) + 3866
[bt] (6) 6   libnnvm_compiler.dylib              0x000000011583347f nnvm::ApplyPasses(nnvm::Graph, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) + 1407
[bt] (7) 7   libnnvm_compiler.dylib              0x00000001157bd7c6 NNGraphApplyPasses + 566
[bt] (8) 8   _ctypes.cpython-36m-darwin.so       0x00000001065e349f ffi_call_unix64 + 79
[bt] (9) 9   ???                                 0x00007ffeeb5974c0 0x0 + 140732846929088

The last layer, “fuse___layout_transform_____layout_transform_____layout_transform__”, actually generated the code I wanted: a single NCHWc -> NCHW transform. Very cool. Thanks very much for your help!

DEBUG:root:lower function fuse___layout_transform_____layout_transform_____layout_transform__
DEBUG:root:produce layout_transform {
  parallel (ax0.ax1.fused, 0, 3) {
    for (ax2, 0, 224) {
      for (ax3, 0, 224) {
        layout_transform[((((ax0.ax1.fused*224) + ax2)*224) + ax3)] = input0[((ax0.ax1.fused + (ax2*672)) + (ax3*3))]
      }
    }
  }
}
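
For completeness, with the transform back to NCHW in place the output can now be fetched with the original 4-D shape (a sketch reusing the runtime setup from the first script):

    output_shape = input_shape   # (1, 3, 224, 224) now matches the graph output
    tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
    print(tvm_output.shape)      # -> (1, 3, 224, 224)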