[TE/TIR] Handling tensors with leading dimensions equal to 1?

Hello everyone,

I am facing a weird problem during the TE -> TIR transformation.

As an exercise, I wanted to create my own conv2d + bias_add implementation for an accelerator (similar to VTA). The IFM layout is NHWC, the kernel layout is [H, W, Oc, Ic], and the bias is [1, 1, 1, Oc].

The code I used follows. It shares a lot of code with the VTA conv2d tutorial. Look for the #HERE comments, which mark what I think are the important parts.

import tvm
from tvm import te
import numpy as np

from tvm.contrib import utils

from tvm import topi

# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 512
width = 256
in_channels = 32
out_channels = 64
kernel_h = 3
kernel_w = 3
stride_h = 1
stride_w = 1
pad_h = (kernel_h//2)*stride_h
pad_w = (kernel_w//2)*stride_w
# MaxPool2D
pool_h=2
pool_w=2
pool_type='max'
# Input feature map: (N,H,W,C)
data_shape = (batch_size,
              height,
              width,
              in_channels)

# Kernel: (H,W,Oc,Ic)
kernel_shape = (kernel_h,
                kernel_w,
                out_channels,
                in_channels)

# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1

# Output feature map: (N,H, W, C)
output_shape = (batch_size,
                fout_height,
                fout_width,
                out_channels)

# Convolution reduction axes
kh = te.reduce_axis((0, kernel_h), name='kh')
kw = te.reduce_axis((0, kernel_w), name='kw')
ic = te.reduce_axis((0, in_channels), name='ic')
#Compute rules

# Input placeholder tensors
data = te.placeholder(data_shape,
                       name="data",
                       dtype="int8")
kernel = te.placeholder(kernel_shape,
                         name="kernel",
                         dtype="int8")
bias = te.placeholder((1,1,1,out_channels), #HERE
                        name="bias",
                        dtype="int8")

# Copy buffers:
#   Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
                       [0, pad_h, pad_w, 0],
                       name="data_buf")
kernel_buf = te.compute(
    kernel_shape,
    lambda KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC:
        kernel[KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC],
    name="kernel_buf")

bias_buf = te.compute(
    (1, 1, 1, out_channels),
    lambda N, H, W, BiasBuff_C: bias[N, H, W, BiasBuff_C],
    name="bias_buf")  # HERE: also bias[1, 1, 1, BiasBuff_C] doesn't work

# 2D convolution
res_conv = te.compute(
    output_shape,
    lambda CONV2D_N, CONV2D_H, CONV2D_W, CONV2D_C: te.sum(
      data_buf[CONV2D_N, CONV2D_H*stride_h+kh, CONV2D_W*stride_w+kw, ic].astype('int32') *
      kernel_buf[kh, kw, CONV2D_C, ic].astype('int32'),
    axis=[kh, kw, ic]),
    name="res_conv")

# Bias add
res_bias = te.compute(
    output_shape,
    lambda BADD_N, BADD_H, BADD_W, BADD_C:
        res_conv[BADD_N, BADD_H, BADD_W, BADD_C] + bias_buf[1, 1, 1, BADD_C],
    name="res_bias")


# Get the schedule
s = te.create_schedule(res_bias.op)
print(tvm.lower(s, [data, kernel, bias, res_bias]))

The problem I see is that there is no copy operation from the bias placeholder into bias_buf, which would mean that the bias_buf values are undefined. I am not really sure what is happening, since it is the same te.compute construct as for data_buf and kernel_buf, whose copy loops do appear.
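
If it helps, here is what I believe would be a minimal repro of just the bias-copy part (a sketch under my assumptions: only a (1, 1, 1, C) placeholder, a copy stage, and a consumer that reads it with constant indices on the size-1 axes, mirroring the indexing in res_bias above; I have not verified that it shows exactly the same behaviour):

import tvm
from tvm import te

C = 64

# (1, 1, 1, C) placeholder, same layout as the bias above
src = te.placeholder((1, 1, 1, C), name="src", dtype="int8")

# Copy stage, the same kind of te.compute as bias_buf
copy = te.compute((1, 1, 1, C),
                  lambda n, h, w, c: src[n, h, w, c],
                  name="copy")

# Consumer that broadcasts the copied tensor over a larger output,
# indexing the size-1 axes with the same constant indices as res_bias
out = te.compute((1, 8, 8, C),
                 lambda n, h, w, c: copy[1, 1, 1, c].astype("int32"),
                 name="out")

s = te.create_schedule(out.op)
print(tvm.lower(s, [src, out]))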

The printed output of the full script is the following (again, look for #HERE):

primfn(data_1: handle, kernel_1: handle, bias_1: handle, res_bias_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {bias: Buffer(bias_2: Pointer(int8), int8, [1, 1, 1, 64], []),
             res_bias: Buffer(res_bias_2: Pointer(int32), int32, [1, 512, 256, 64], []),
             data: Buffer(data_2: Pointer(int8), int8, [1, 512, 256, 32], []),
             kernel: Buffer(kernel_2: Pointer(int8), int8, [3, 3, 64, 32], [])}
  buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, res_bias_1: res_bias} {
  attr [data_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(data_buf, int8, [4243584]);
  attr [kernel_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(kernel_buf, int8, [18432]);
  attr [res_conv: Pointer(int32)] "storage_scope" = "global";
  allocate(res_conv, int32, [8388608]);
  attr [bias_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(bias_buf, int8, [64]) {
    for (i1: int32, 0, 514) {
      for (i2: int32, 0, 258) {
        for (i3: int32, 0, 32) {
          data_buf[(((i1*8256) + (i2*32)) + i3)] = @tir.if_then_else(((((1 <= i1) && (i1 < 513)) && (1 <= i2)) && (i2 < 257)), (int8*)data_2[((((i1*8192) + (i2*32)) + i3) - 8224)], 0i8, dtype=int8)
        }
      }
    }
    for (KernBuff_H: int32, 0, 3) {
      for (KernBuff_W: int32, 0, 3) {
        for (KernBuff_OC: int32, 0, 64) {
          for (KernBuff_IC: int32, 0, 32) {
            kernel_buf[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)] = (int8*)kernel_2[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)]
          }
        }
      }
    }
    for (CONV2D_H: int32, 0, 512) {
      for (CONV2D_W: int32, 0, 256) {
        for (CONV2D_C: int32, 0, 64) {
          res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = 0
          for (ACC_kh: int32, 0, 3) {
            for (ACC_kw: int32, 0, 3) {
              for (ACC_ic: int32, 0, 32) {
                res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = ((int32*)res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] + (cast(int32, (int8*)data_buf[(((((CONV2D_H*8256) + (ACC_kh*8256)) + (CONV2D_W*32)) + (ACC_kw*32)) + ACC_ic)])*cast(int32, (int8*)kernel_buf[((((ACC_kh*6144) + (ACC_kw*2048)) + (CONV2D_C*32)) + ACC_ic)])))
              }
            }
          }
        }
      }
    }
    #HERE
    # No copy statement from bias to bias_buf
    for (BADD_H: int32, 0, 512) {
      for (BADD_W: int32, 0, 256) {
        for (BADD_C: int32, 0, 64) {
          res_bias_2[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] = ((int32*)res_conv[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] + cast(int32, (int8*)bias_buf[BADD_C]))
        }
      }
    }
  }
}

To add to my confusion, I tried a small change: defining the bias layout as [Oc,].

import tvm
from tvm import te
import numpy as np

from tvm.contrib import utils

from tvm import topi
# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 512
width = 256
in_channels = 32
out_channels = 64
kernel_h = 3
kernel_w = 3
stride_h = 1
stride_w = 1
pad_h = (kernel_h//2)*stride_h
pad_w = (kernel_w//2)*stride_w
# MaxPool2D
pool_h=2
pool_w=2
pool_type='max'

# Input feature map: (N, H, W, C)
data_shape = (batch_size,
              height,
              width,
              in_channels)

# Kernel: (kh, kw, oc, ic)
kernel_shape = (kernel_h,
                kernel_w,
                out_channels,
                in_channels)

# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1

# Output feature map: (N,H, W, C)
output_shape = (batch_size,
                fout_height,
                fout_width,
                out_channels)

# Convolution reduction axes
kh = te.reduce_axis((0, kernel_h), name='ACC_kh')
kw = te.reduce_axis((0, kernel_w), name='ACC_kw')
ic = te.reduce_axis((0, in_channels), name='ACC_ic')
#Compute rules

# Input placeholder tensors
data = te.placeholder(data_shape,
                       name="data",
                       dtype="int8")
kernel = te.placeholder(kernel_shape,
                         name="kernel",
                         dtype="int8")
bias = te.placeholder((out_channels,), #HERE
                        name="bias",
                        dtype="int8")

# Copy buffers:
#   Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
                       [0, pad_h, pad_w, 0],
                       name="data_buf")
kernel_buf = te.compute(
    kernel_shape,
    lambda KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC:
        kernel[KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC],
    name="kernel_buf")

bias_buf = te.compute((out_channels,),
                      lambda BiasBuff_C: bias[BiasBuff_C],
                      name="bias_buf")  # HERE



# 2D convolution
res_conv = te.compute(
    output_shape,
    lambda CONV2D_N, CONV2D_H, CONV2D_W, CONV2D_C: te.sum(
      data_buf[CONV2D_N, CONV2D_H*stride_h+kh, CONV2D_W*stride_w+kw, ic].astype('int32') *
      kernel_buf[kh, kw, CONV2D_C, ic].astype('int32'),
    axis=[kh, kw, ic]),
    name="res_conv")

# Bias add
res_bias = te.compute(
    output_shape,
    lambda BADD_N, BADD_H, BADD_W, BADD_C:
        res_conv[BADD_N, BADD_H, BADD_W, BADD_C] + bias_buf[BADD_C],
    name="res_bias")

# Get the schedule
s = te.create_schedule(res_bias.op)
print(tvm.lower(s, [data, kernel, bias, res_bias]))

In this case, everything seems to work as I expected.

primfn(data_1: handle, kernel_1: handle, bias_1: handle, res_bias_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {bias: Buffer(bias_2: Pointer(int8), int8, [64], []),
             res_bias: Buffer(res_bias_2: Pointer(int32), int32, [1, 512, 256, 64], []),
             data: Buffer(data_2: Pointer(int8), int8, [1, 512, 256, 32], []),
             kernel: Buffer(kernel_2: Pointer(int8), int8, [3, 3, 64, 32], [])}
  buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, res_bias_1: res_bias} {
  attr [data_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(data_buf, int8, [4243584]);
  attr [kernel_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(kernel_buf, int8, [18432]);
  attr [res_conv: Pointer(int32)] "storage_scope" = "global";
  allocate(res_conv, int32, [8388608]);
  attr [bias_buf: Pointer(int8)] "storage_scope" = "global";
  allocate(bias_buf, int8, [64]) {
    for (i1: int32, 0, 514) {
      for (i2: int32, 0, 258) {
        for (i3: int32, 0, 32) {
          data_buf[(((i1*8256) + (i2*32)) + i3)] = @tir.if_then_else(((((1 <= i1) && (i1 < 513)) && (1 <= i2)) && (i2 < 257)), (int8*)data_2[((((i1*8192) + (i2*32)) + i3) - 8224)], 0i8, dtype=int8)
        }
      }
    }
    for (KernBuff_H: int32, 0, 3) {
      for (KernBuff_W: int32, 0, 3) {
        for (KernBuff_OC: int32, 0, 64) {
          for (KernBuff_IC: int32, 0, 32) {
            kernel_buf[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)] = (int8*)kernel_2[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)]
          }
        }
      }
    }
    for (CONV2D_H: int32, 0, 512) {
      for (CONV2D_W: int32, 0, 256) {
        for (CONV2D_C: int32, 0, 64) {
          res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = 0
          for (ACC_kh: int32, 0, 3) {
            for (ACC_kw: int32, 0, 3) {
              for (ACC_ic: int32, 0, 32) {
                res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = ((int32*)res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] + (cast(int32, (int8*)data_buf[(((((CONV2D_H*8256) + (ACC_kh*8256)) + (CONV2D_W*32)) + (ACC_kw*32)) + ACC_ic)])*cast(int32, (int8*)kernel_buf[((((ACC_kh*6144) + (ACC_kw*2048)) + (CONV2D_C*32)) + ACC_ic)])))
              }
            }
          }
        }
      }
    }
    # HERE
    # Bias buffer copy
    for (BiasBuff_C: int32, 0, 64) {
      bias_buf[BiasBuff_C] = (int8*)bias_2[BiasBuff_C]
    }
    for (BADD_H: int32, 0, 512) {
      for (BADD_W: int32, 0, 256) {
        for (BADD_C: int32, 0, 64) {
          res_bias_2[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] = ((int32*)res_conv[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] + cast(int32, (int8*)bias_buf[BADD_C]))
        }
      }
    }
  }
}

So my question is: why is there a difference between defining the bias placeholder as (1, 1, 1, Oc) and as (Oc,)? Or did I do something wrong?
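
For reference, one more variant I could try on top of the first version (with bias_buf of shape (1, 1, 1, out_channels)) is to index the size-1 axes with constant 0 instead of 1; this is just a sketch, and I am not sure whether it changes the lowering at all:

# Same bias add as above, but with constant 0 indices on the size-1 axes
res_bias_alt = te.compute(
    output_shape,
    lambda n, h, w, c: res_conv[n, h, w, c] + bias_buf[0, 0, 0, c].astype("int32"),
    name="res_bias_alt")

s_alt = te.create_schedule(res_bias_alt.op)
print(tvm.lower(s_alt, [data, kernel, bias, res_bias_alt]))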

Thanks