@tkonolige Thanks a lot for your help.
Regarding the tvm.lower(s, args), you can find below the generated code .
Before tuning, I got:
#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []),
W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
attr [packed_data: Pointer(int8)] "storage_scope" = "global";
allocate(packed_data, int8, [7225344]);
attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
allocate(packed_kernel, int8, [442368]) {
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) {
packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
}
}
}
attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
}
}
}
}
attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 128;
attr [compute: Pointer(int32)] "storage_scope" = "local";
allocate(compute, int32, [1]);
attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(pad_data.shared, int8x4, [1]);
attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(packed_kernel.shared, int8x4, [1]);
attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 18;
attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 3136;
attr [IterVar(threadIdx.z: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
compute[0] = 0
for (ic_chunk.outer: int32, 0, 32) {
for (rz.outer: int32, 0, 3) {
for (ry.outer: int32, 0, 3) {
for (rx.outer: int32, 0, 3) {
attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
pad_data.shared[ramp(0, 1, 4)] = @tir.if_then_else(((((((1 <= (blockIdx.y + rz.outer)) && ((blockIdx.y + rz.outer) < 19)) && (1 <= (floordiv(blockIdx.x_2, 56) + ry.outer))) && ((floordiv(blockIdx.x_2, 56) + ry.outer) < 57)) && (1 <= (rx.outer + floormod(blockIdx.x_2, 56)))) && ((rx.outer + floormod(blockIdx.x_2, 56)) < 57)), (int8x4*)packed_data[ramp((((((((ic_chunk.outer*225792) + (blockIdx.y*12544)) + (rz.outer*12544)) + (ry.outer*224)) + (blockIdx.x_2*4)) + (rx.outer*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4)
attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
packed_kernel.shared[ramp(0, 1, 4)] = (int8x4*)packed_kernel[ramp(((((((floordiv(blockIdx.z, 4)*13824) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(blockIdx.z, 4)*4)), 1, 4)]
compute[0] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(0, 1, 4)], (int8x4*)packed_kernel.shared[ramp(0, 1, 4)], (int32*)compute[0], dtype=int32)
}
}
}
}
output_unpack_2[(((blockIdx.z*56448) + (blockIdx.y*3136)) + blockIdx.x_2)] = (int32*)compute[0]
}
}
}
#[metadata]
{
"root": 1,
"nodes": [
{
"type_key": ""
},
{
"type_key": "Map",
"keys": [
"IntImm"
],
"data": [2]
},
{
"type_key": "Array",
"data": [3]
},
{
"type_key": "IntImm",
"attrs": {
"dtype": "bool",
"value": "1"
}
}
],
"b64ndarrays": [],
"attrs": {"tvm_version": "0.8.dev0"}
}
After tuning, I got:
#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []),
W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
attr [packed_data: Pointer(int8)] "storage_scope" = "global";
allocate(packed_data, int8, [7225344]);
attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
allocate(packed_kernel, int8, [442368]) {
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) {
packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
}
}
}
attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
}
}
}
}
attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 8;
attr [compute: Pointer(int32)] "storage_scope" = "local";
allocate(compute, int32, [(((floordiv(((threadIdx.z: int32*2) + 1), 4)*32) + 32) - (floordiv(threadIdx.z, 2)*32))]);
attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(pad_data.shared, int8x4, [56]);
attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(packed_kernel.shared, int8x4, [28]);
attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 9;
attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112;
attr [IterVar(threadIdx.z, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
for (oc_chunk.init: int32, 0, ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2))) {
for (zz.init: int32, 0, 2) "unroll" {
for (yy.init: int32, 0, 2) "unroll" {
for (oc_block.init: int32, 0, 4) "unroll" {
compute[((((oc_chunk.init*16) + (zz.init*8)) + (yy.init*4)) + oc_block.init)] = 0
compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk.init*16)) + (zz.init*8)) + (yy.init*4)) + oc_block.init) + 16) - (floordiv(threadIdx.z, 2)*16))] = 0
}
}
}
}
for (rz.outer: int32, 0, 3) {
for (ry.outer: int32, 0, 3) {
for (ic_chunk.outer: int32, 0, 32) {
for (rx.outer: int32, 0, 3) {
attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
pad_data.shared[ramp(((threadIdx.z_1*28) + (threadIdx.y_1*4)), 1, 4)] = @tir.if_then_else(((((((1 <= (((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer)) && ((((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer) < 19)) && (1 <= (((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer))) && ((((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer) < 57)) && (1 <= (((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)))) && ((((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)) < 57)), (int8x4*)packed_data[ramp((((((((((((ic_chunk.outer*225792) + (blockIdx.y*25088)) + (floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)*12544)) + (rz.outer*12544)) + (floordiv(blockIdx.x_2, 28)*3136)) + (floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)*224)) + (ry.outer*224)) + (floormod(blockIdx.x_2, 28)*8)) + (rx.outer*4)) + (floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4)
attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
if (((threadIdx.z_2*7) + threadIdx.y_2) < 28) {
if (threadIdx.z_2 < 4) {
if (((blockIdx.z*4) + floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)) < 32) {
packed_kernel.shared[ramp(((threadIdx.z_2*28) + (threadIdx.y_2*4)), 1, 4)] = (int8x4*)packed_kernel[ramp((((((((blockIdx.z*55296) + (floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)*13824)) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(((threadIdx.z_2*7) + threadIdx.y_2), 4)*4)), 1, 4)]
}
}
}
for (oc_chunk: int32, 0, min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) {
for (zz: int32, 0, 2) "unroll" {
for (yy: int32, 0, 2) "unroll" {
for (oc_block: int32, 0, 4) "unroll" {
compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz*112) + (threadIdx.y*16)) + (yy*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)], dtype=int32)
compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz*112) + (threadIdx.y*16)) + (yy*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32)
}
}
}
}
for (oc_chunk_1: int32, 0, (max(((((blockIdx.z*4) + floordiv(((threadIdx.z*2) + 1), 4)) - floordiv(threadIdx.z, 2)) - 29), -1) + 1)) {
for (zz_1: int32, 0, 2) "unroll" {
for (yy_1: int32, 0, 2) "unroll" {
for (oc_block_1: int32, 0, 4) "unroll" {
if (((((blockIdx.z*4) + floordiv(threadIdx.z, 2)) + min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) + oc_chunk_1) < 32) {
compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)], dtype=int32)
compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32)
}
}
}
}
}
}
}
}
}
for (c.inner.inner.inner: int32, 0, 2) "unroll" {
for (z.inner.inner.inner: int32, 0, 2) "unroll" {
for (h.inner.inner.inner: int32, 0, 2) "unroll" {
output_unpack_2[(((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2))] = (int32*)compute[(((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) - (floordiv(threadIdx.z, 2)*16))]
output_unpack_2[((((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2)) + 1)] = (int32*)compute[(((((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (floordiv(((threadIdx.z*2) + 1), 4)*16)) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) + 16) - (floordiv(threadIdx.z, 2)*32))]
}
}
}
}
}
}
#[metadata]
{
"root": 1,
"nodes": [
{
"type_key": ""
},
{
"type_key": "Map",
"keys": [
"IntImm"
],
"data": [2]
},
{
"type_key": "Array",
"data": [3]
},
{
"type_key": "IntImm",
"attrs": {
"dtype": "bool",
"value": "1"
}
}
],
"b64ndarrays": [],
"attrs": {"tvm_version": "0.8.dev0"}
}