How to reduce store instructions

import math
import numpy as np
import tvm
from tvm import te
from tvm import autotvm
from tvm import topi
from vta.top.utils import is_packed_layout
from vta.environment import get_env

@autotvm.register_topi_compute("upsampling_packed.vta")
def upsampling(
    cfg,
    data,
    scale_h,
    scale_w,
    layout="NCHW",
    method="nearest_neighbor",
    align_corners=False,
    output_shape=None,
):
    env = get_env()
    batch, channels, in_height, in_width, in_batch, in_channels = data.shape
    data_shape = data.shape
    out_height = int(in_height.value * scale_h)
    out_width = int(in_width.value * scale_w)
    # data_buf = te.compute(
    #     data_shape,
    #     # lambda b_o, c_o, i, j, b_i, c_i: data[b_o, c_o, i, j, b_i, c_i].astype(env.acc_dtype),
    #     lambda *i: data(*i).astype(env.acc_dtype),
    #     name="data_buf",
    #     tag="data_buf",
    # )  # inp_scope
    # output = te.compute(
    #     (batch, channels, out_height, out_width, in_batch, in_channels),
    #     lambda b_o, c_o, i, j, b_i, c_i: data[b_o, c_o, te.round(te.floordiv(i, scale_h)).astype(env.acc_dtype), te.round(te.floordiv(j, scale_w)).astype(env.acc_dtype), b_i, c_i],
    #     # .astype(env.out_dtype),  # Returning the data directly as int32 made all the later data come out as 0; changing the return dtype to out_dtype fixes it.
    #     name="upsampling_packed",
    #     tag="upsampling_packed",
    # )

    output = topi.nn.upsampling(
        # data_buf,
        data,
        scale_h=scale_h,
        scale_w=scale_w,
        layout=layout,
        method=method,
        align_corners=align_corners,
        # output_shape=(batch, channels, out_height, out_width),
    )
    output = te.compute(
        (batch, channels, out_height, out_width, in_batch, in_channels),
        lambda *i: output(*i) + 0,  # Returning the data directly as int32 made all the later data come out as 0; changing the return dtype to out_dtype fixes it.
        name="upsampling_add",
        tag="upsampling_add",
    )  # acc_scope
    output = te.compute(
        (batch, channels, out_height, out_width, in_batch, in_channels),
        # lambda b_o, c_o, i, j, b_i, c_i: output[b_o, c_o, i, j, b_i, c_i].astype(env.out_dtype),
        lambda *i: output(*i).astype(env.out_dtype),
        name="upsampling_res",
        tag="upsampling_res",
    )
    return output
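
For reference, the 6-D tensors here are in VTA's packed NCHWnc layout (NCHW tiled by the hardware batch and channel block sizes). A quick NumPy sketch of that packing, assuming the default VTA configuration (env.BATCH = 1, env.BLOCK_OUT = 16) and an illustrative 1x32x24x24 input:

    # Packed NCHWnc layout sketch (illustration only; the BATCH/BLOCK values
    # assume the default VTA config, env.BATCH = 1 and env.BLOCK_OUT = 16).
    import numpy as np

    BATCH, BLOCK = 1, 16
    n, c, h, w = 1, 32, 24, 24                      # illustrative NCHW shape
    x = np.arange(n * c * h * w, dtype="int32").reshape(n, c, h, w)
    packed = x.reshape(n // BATCH, BATCH, c // BLOCK, BLOCK, h, w).transpose(0, 2, 4, 5, 1, 3)
    print(packed.shape)                             # (1, 2, 24, 24, 1, 16) -> (b_o, c_o, h, w, b_i, c_i)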

@autotvm.register_topi_schedule("upsampling_packed.vta")
def schedule_upsampling_packed(cfg, outs, layout=None):
    # assert layout == "NCHW", "Only NCHW layout is supported for upsampling"
    assert len(outs) == 1

    env = get_env()
    output = outs[0]  # output is the computation tensor to be optimized.
    const_ops = []
    ewise_inputs = []
    ewise_ops = []
    upsample_res = []
    assert "int" in output.op.input_tensors[0].dtype

    # _traverse(output.op)

    # data_buff_op = output.op.input_tensors[0].op.input_tensors[0].op.input_tensors[0]  # data_buff_add
    resize_op = output.op.input_tensors[0].op.input_tensors[0]
    upsample_packed = output.op.input_tensors[0]
    # print(f'data_buff_op:{data_buff_op}')
    print(f'resize_op:{resize_op}')
    print(f'upsample_packed:{upsample_packed}')

    batch, channels, in_height, in_width, batch_inner, channels_inner = output.shape

    # Create schedule
    s = te.create_schedule(output.op)

    # Reorder axes
    bo, co, h, w, bi, ci = s[resize_op].op.axis
    cfg.define_split("tile_bo", bo, num_outputs=2)
    cfg.define_split("tile_co", co, num_outputs=2)
    cfg.define_split("tile_h", h, num_outputs=2)
    cfg.define_split("tile_w", w, num_outputs=2)
    cfg.define_split("tile_bi", bi, num_outputs=2)
    cfg.define_split("tile_ci", ci, num_outputs=2)
    cfg.define_knob("oc_nthread", [1, 2])
    cfg.define_knob("h_nthread", [1, 2])

    x_bo, x_co, x_h, x_w, x_bi, x_ci = s[output].op.axis
    x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co)
    x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_h)
    x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_w)
    s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
    store_pt = x_i0
    # x_co0
    # x_i0

    # s[data_buff_op].set_scope(env.acc_scope)
    s[resize_op].set_scope(env.acc_scope)
    s[upsample_packed].set_scope(env.acc_scope)

    x_co0_, x_co1_ = cfg["tile_co"].apply(s, resize_op, co)
    x_i0_, x_i1_ = cfg["tile_h"].apply(s, resize_op, h)
    x_j0_, x_j1_ = cfg["tile_w"].apply(s, resize_op, w)
    s[resize_op].reorder(bo, x_i0_, x_co0_, x_j0_, x_co1_, x_i1_, x_j1_, bi, ci)

    # s[data_buff_op].compute_at(s[upsample_packed], x_i0_)
    s[resize_op].compute_at(s[output], store_pt)
    s[upsample_packed].compute_at(s[output], store_pt)

    # s[data_buff_op].pragma(s[data_buff_op].op.axis[0], env.dma_copy)
    s[resize_op].pragma(bi, env.dma_copy)
    s[upsample_packed].pragma(s[upsample_packed].op.axis[0], env.alu)

    # store_out = env.dma_copy
    # Before copying the result back to DRAM, the computed result has to be copied into
    # output memory (there are inp_memory, acc_memory, wgt_memory and output_memory scopes).
    s[output].pragma(x_bi, env.dma_copy)

    return s

  0: tvm::tir::CopyIntrinInjector::VisitStmt_(tvm::tir::AttrStmtNode const*)
        at /home/lay/tvm/src/tir/transforms/inject_copy_intrin.cc:51

    resize[i3_outer * 32 + i5] = T.Cast("int32", T.Cast("float32", data[T.max(T.min(T.Cast("int32", T.round(T.float32(0.48936170339584351) * T.Cast("float32", i2_outer))), 23), 0) * 768 + T.max(T.min(T.Cast("int32", T.round(T.float32(0.48936170339584351) * T.Cast("float32", i3_outer))), 23), 0) * 32 + i5]))
    i2_outer = T.int32()
    data = T.Buffer((18432,), "int32")
    i3_outer = T.int32()
    resize = T.Buffer((1536,), "int32", scope="local.acc_buffer", align=32)

  File "/home/lay/tvm/src/tir/transforms/inject_copy_intrin.cc", line 51
InternalError: Check failed: (MatchCopyPattern(op->body, &ret, &error_info)) is false: Cannot match copy pattern. The error is the 'BufferLoadNode' of body is a nullptr. The body is
for i5 in range(32):
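
If I read this right, the dma_copy pragma is lowered by inject_copy_intrin, which only matches a loop whose body is a direct element-for-element copy (at most a single dtype cast over a plain load). A stage like the sketch below should satisfy that pattern, while the resize stage above cannot, because its body carries the rounded/clamped index arithmetic and a double cast. This is my understanding of the check, not something I have confirmed:

    # My reading of MatchCopyPattern (illustrative, not authoritative): a stage that
    # is a direct copy of its input, like this one, matches the copy pattern and can
    # be lowered by the env.dma_copy pragma.
    from tvm import te
    from vta.environment import get_env

    env = get_env()
    data = te.placeholder((1, 2, 24, 24, 1, 16), dtype=env.acc_dtype, name="data")  # illustrative shape
    data_buf = te.compute(data.shape, lambda *i: data(*i), name="data_buf")          # plain element copy

    # whereas the failing stage above stores
    #   resize[...] = T.Cast("int32", T.Cast("float32", data[rounded/clamped index]))
    # i.e. computed indices plus a float round-trip, which is not a copy pattern.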

I'm using my own te.compute instead, but that creates a lot of store instructions:

    output = te.compute(
        (batch, channels, out_height, out_width, in_batch, in_channels),
        lambda b_o, c_o, i, j, b_i, c_i: data[b_o, c_o, te.round(te.floordiv(i, scale_h)).astype(env.acc_dtype), te.round(te.floordiv(j, scale_w)).astype(env.acc_dtype), b_i, c_i],
        # .astype(env.out_dtype),  # Returning the data directly as int32 made all the later data come out as 0; changing the return dtype to out_dtype fixes it.
        name="upsampling_packed",
        tag="upsampling_packed",
    )
