Hello everyone,
I am facing a weird problem during TE->TIR transformation.
I wanted to create my own implementation of conv2d+bias_add for an accelerator (similar to VTA) as an exercise. The layout of the IFM is NHWC, the kernels are [HWOcIc] and the bias is [1,1,1,Oc].
The code I used follows. It has a lot of the same code as the conv2d tutorial of VTA. Look for the comment #HERE
for what I think is the important parts.
import tvm
from tvm import te
import numpy as np
from tvm.contrib import utils
from tvm import topi
# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 512
width = 256
in_channels = 32
out_channels = 64
kernel_h = 3
kernel_w = 3
stride_h = 1
stride_w = 1
pad_h = (kernel_h//2)*stride_h
pad_w = (kernel_w//2)*stride_w
# MaxPool2D
pool_h=2
pool_w=2
pool_type='max'
# Input feature map: (N,H,W,C)
data_shape = (batch_size ,
height,
width,
in_channels)
# Kernel: (H,W,Oc,Ic)
kernel_shape = (kernel_h,
kernel_w,
out_channels,
in_channels)
# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
# Output feature map: (N,H, W, C)
output_shape = (batch_size,
fout_height,
fout_width,
out_channels)
# Convolution reduction axes
kh = te.reduce_axis((0, kernel_h), name='kh')
kw = te.reduce_axis((0, kernel_w), name='kw')
ic = te.reduce_axis((0, in_channels), name='ic')
#Compute rules
# Input placeholder tensors
data = te.placeholder(data_shape,
name="data",
dtype="int8")
kernel = te.placeholder(kernel_shape,
name="kernel",
dtype="int8")
bias = te.placeholder((1,1,1,out_channels), #HERE
name="bias",
dtype="int8")
# Copy buffers:
# Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
[0, pad_h, pad_w, 0],
name="data_buf")
kernel_buf = te.compute(kernel_shape, lambda KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC: kernel[KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC], "kernel_buf")
bias_buf = te.compute((1,1,1,out_channels), lambda N,H,W,BiasBuff_C: bias[N,H,W,BiasBuff_C], "bias_buf")#HERE also bias[1,1,1,BiasBuff_C] doesnt work
# 2D convolution
res_conv = te.compute(
output_shape,
lambda CONV2D_N, CONV2D_H, CONV2D_W, CONV2D_C: te.sum(
data_buf[CONV2D_N, CONV2D_H*stride_h+kh, CONV2D_W*stride_w+kw, ic].astype('int32') *
kernel_buf[kh, kw, CONV2D_C, ic].astype('int32'),
axis=[kh, kw, ic]),
name="res_conv")
# Bias add
res_bias = te.compute ( output_shape, lambda BADD_N, BADD_H, BADD_W, BADD_C: res_conv[BADD_N, BADD_H, BADD_W, BADD_C]+ bias_buf[1,1,1,BADD_C], name='res_bias')
# Get the schedule
s = te.create_schedule(res_bias.op)
print(tvm.lower(s,[data,kernel,bias,res_bias]))
The problem I see is that there is no copy operation for the bias_buf from bias (placeholder).
So that would mean that the array values are undefined.
I am not really sure what is happening since it’s the same te.compute
constructs as for the data_buff and kernel_buf, which do appear.
The output printed is following (again look for #HERE
)
primfn(data_1: handle, kernel_1: handle, bias_1: handle, res_bias_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {bias: Buffer(bias_2: Pointer(int8), int8, [1, 1, 1, 64], []),
res_bias: Buffer(res_bias_2: Pointer(int32), int32, [1, 512, 256, 64], []),
data: Buffer(data_2: Pointer(int8), int8, [1, 512, 256, 32], []),
kernel: Buffer(kernel_2: Pointer(int8), int8, [3, 3, 64, 32], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, res_bias_1: res_bias} {
attr [data_buf: Pointer(int8)] "storage_scope" = "global";
allocate(data_buf, int8, [4243584]);
attr [kernel_buf: Pointer(int8)] "storage_scope" = "global";
allocate(kernel_buf, int8, [18432]);
attr [res_conv: Pointer(int32)] "storage_scope" = "global";
allocate(res_conv, int32, [8388608]);
attr [bias_buf: Pointer(int8)] "storage_scope" = "global";
allocate(bias_buf, int8, [64]) {
for (i1: int32, 0, 514) {
for (i2: int32, 0, 258) {
for (i3: int32, 0, 32) {
data_buf[(((i1*8256) + (i2*32)) + i3)] = @tir.if_then_else(((((1 <= i1) && (i1 < 513)) && (1 <= i2)) && (i2 < 257)), (int8*)data_2[((((i1*8192) + (i2*32)) + i3) - 8224)], 0i8, dtype=int8)
}
}
}
for (KernBuff_H: int32, 0, 3) {
for (KernBuff_W: int32, 0, 3) {
for (KernBuff_OC: int32, 0, 64) {
for (KernBuff_IC: int32, 0, 32) {
kernel_buf[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)] = (int8*)kernel_2[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)]
}
}
}
}
for (CONV2D_H: int32, 0, 512) {
for (CONV2D_W: int32, 0, 256) {
for (CONV2D_C: int32, 0, 64) {
res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = 0
for (ACC_kh: int32, 0, 3) {
for (ACC_kw: int32, 0, 3) {
for (ACC_ic: int32, 0, 32) {
res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = ((int32*)res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] + (cast(int32, (int8*)data_buf[(((((CONV2D_H*8256) + (ACC_kh*8256)) + (CONV2D_W*32)) + (ACC_kw*32)) + ACC_ic)])*cast(int32, (int8*)kernel_buf[((((ACC_kh*6144) + (ACC_kw*2048)) + (CONV2D_C*32)) + ACC_ic)])))
}
}
}
}
}
}
#HERE
# No copy statement from bias to bias_buf
for (BADD_H: int32, 0, 512) {
for (BADD_W: int32, 0, 256) {
for (BADD_C: int32, 0, 64) {
res_bias_2[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] = ((int32*)res_conv[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] + cast(int32, (int8*)bias_buf[BADD_C]))
}
}
}
}
}
To add to my confusion, I tried a small change to the bias layout to be [OC,].
import tvm
from tvm import te
import numpy as np
from tvm.contrib import utils
from tvm import topi
# 2D convolution layer dimensions taken from ResNet-18 architecture
# (9th convolutional layer)
batch_size = 1
height = 512
width = 256
in_channels = 32
out_channels = 64
kernel_h = 3
kernel_w = 3
stride_h = 1
stride_w = 1
pad_h = (kernel_h//2)*stride_h
pad_w = (kernel_w//2)*stride_w
# MaxPool2D
pool_h=2
pool_w=2
pool_type='max'
# Input feature map: (N, IC, H, W, n, ic)
data_shape = (batch_size ,
height,
width,
in_channels)
# Kernel: (kh, kw, oc, ic)
kernel_shape = (kernel_h,
kernel_w,
out_channels,
in_channels)
# Derive output feature map dimensions
fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
# Output feature map: (N,H, W, C)
output_shape = (batch_size,
fout_height,
fout_width,
out_channels)
# Convolution reduction axes
kh = te.reduce_axis((0, kernel_h), name='ACC_kh')
kw = te.reduce_axis((0, kernel_w), name='ACC_kw')
ic = te.reduce_axis((0, in_channels), name='ACC_ic')
#Compute rules
# Input placeholder tensors
data = te.placeholder(data_shape,
name="data",
dtype="int8")
kernel = te.placeholder(kernel_shape,
name="kernel",
dtype="int8")
bias = te.placeholder((out_channels,), #HERE
name="bias",
dtype="int8")
# Copy buffers:
# Apply spatial padding to input feature map
data_buf = topi.nn.pad(data,
[0, pad_h, pad_w, 0],
name="data_buf")
kernel_buf = te.compute(kernel_shape, lambda KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC: kernel[KernBuff_H, KernBuff_W, KernBuff_OC, KernBuff_IC], "kernel_buf")
bias_buf = te.compute((out_channels,), lambda BiasBuff_C: bias[BiasBuff_C], "bias_buf")#HERE
# 2D convolution
res_conv = te.compute(
output_shape,
lambda CONV2D_N, CONV2D_H, CONV2D_W, CONV2D_C: te.sum(
data_buf[CONV2D_N, CONV2D_H*stride_h+kh, CONV2D_W*stride_w+kw, ic].astype('int32') *
kernel_buf[kh, kw, CONV2D_C, ic].astype('int32'),
axis=[kh, kw, ic]),
name="res_conv")
# Bias add
res_bias = te.compute ( output_shape, lambda BADD_N, BADD_H, BADD_W, BADD_C: res_conv[BADD_N, BADD_H, BADD_W, BADD_C]+ bias_buf[BADD_C], name='res_bias')
In this case, everything seems to work as I expected.
primfn(data_1: handle, kernel_1: handle, bias_1: handle, res_bias_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {bias: Buffer(bias_2: Pointer(int8), int8, [64], []),
res_bias: Buffer(res_bias_2: Pointer(int32), int32, [1, 512, 256, 64], []),
data: Buffer(data_2: Pointer(int8), int8, [1, 512, 256, 32], []),
kernel: Buffer(kernel_2: Pointer(int8), int8, [3, 3, 64, 32], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, res_bias_1: res_bias} {
attr [data_buf: Pointer(int8)] "storage_scope" = "global";
allocate(data_buf, int8, [4243584]);
attr [kernel_buf: Pointer(int8)] "storage_scope" = "global";
allocate(kernel_buf, int8, [18432]);
attr [res_conv: Pointer(int32)] "storage_scope" = "global";
allocate(res_conv, int32, [8388608]);
attr [bias_buf: Pointer(int8)] "storage_scope" = "global";
allocate(bias_buf, int8, [64]) {
for (i1: int32, 0, 514) {
for (i2: int32, 0, 258) {
for (i3: int32, 0, 32) {
data_buf[(((i1*8256) + (i2*32)) + i3)] = @tir.if_then_else(((((1 <= i1) && (i1 < 513)) && (1 <= i2)) && (i2 < 257)), (int8*)data_2[((((i1*8192) + (i2*32)) + i3) - 8224)], 0i8, dtype=int8)
}
}
}
for (KernBuff_H: int32, 0, 3) {
for (KernBuff_W: int32, 0, 3) {
for (KernBuff_OC: int32, 0, 64) {
for (KernBuff_IC: int32, 0, 32) {
kernel_buf[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)] = (int8*)kernel_2[((((KernBuff_H*6144) + (KernBuff_W*2048)) + (KernBuff_OC*32)) + KernBuff_IC)]
}
}
}
}
for (CONV2D_H: int32, 0, 512) {
for (CONV2D_W: int32, 0, 256) {
for (CONV2D_C: int32, 0, 64) {
res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = 0
for (ACC_kh: int32, 0, 3) {
for (ACC_kw: int32, 0, 3) {
for (ACC_ic: int32, 0, 32) {
res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] = ((int32*)res_conv[(((CONV2D_H*16384) + (CONV2D_W*64)) + CONV2D_C)] + (cast(int32, (int8*)data_buf[(((((CONV2D_H*8256) + (ACC_kh*8256)) + (CONV2D_W*32)) + (ACC_kw*32)) + ACC_ic)])*cast(int32, (int8*)kernel_buf[((((ACC_kh*6144) + (ACC_kw*2048)) + (CONV2D_C*32)) + ACC_ic)])))
}
}
}
}
}
}
# HERE
# Bias buffer copy
for (BiasBuff_C: int32, 0, 64) {
bias_buf[BiasBuff_C] = (int8*)bias_2[BiasBuff_C]
}
for (BADD_H: int32, 0, 512) {
for (BADD_W: int32, 0, 256) {
for (BADD_C: int32, 0, 64) {
res_bias_2[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] = ((int32*)res_conv[(((BADD_H*16384) + (BADD_W*64)) + BADD_C)] + cast(int32, (int8*)bias_buf[BADD_C]))
}
}
}
}
}
So my question is, why is there a difference if I defined the bias placeholder as (1,1,1,OC) as opposed to (OC,)? or maybe I did something wrong?
Thanks