Quantization and 3D convolution

OValery16 · November 13, 2020, 12:53pm

@tkonolige Thanks a lot for your help.

Regarding the tvm.lower(s, args), you can find below the generated code .

Before tuning, I got:

#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []),
             W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
             A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
  buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
  attr [packed_data: Pointer(int8)] "storage_scope" = "global";
  allocate(packed_data, int8, [7225344]);
  attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
  allocate(packed_kernel, int8, [442368]) {
    attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
    attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
    for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
      if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
        if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) {
          packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
        }
      }
    }
    attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
    attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
    for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) {
      if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
        if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
          if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
            packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
          }
        }
      }
    }
    attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 128;
    attr [compute: Pointer(int32)] "storage_scope" = "local";
    allocate(compute, int32, [1]);
    attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
    allocate(pad_data.shared, int8x4, [1]);
    attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
    allocate(packed_kernel.shared, int8x4, [1]);
    attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 18;
    attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 3136;
    attr [IterVar(threadIdx.z: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
    attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
    attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
      compute[0] = 0
      for (ic_chunk.outer: int32, 0, 32) {
        for (rz.outer: int32, 0, 3) {
          for (ry.outer: int32, 0, 3) {
            for (rx.outer: int32, 0, 3) {
              attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
              attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
              attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
              pad_data.shared[ramp(0, 1, 4)] = @tir.if_then_else(((((((1 <= (blockIdx.y + rz.outer)) && ((blockIdx.y + rz.outer) < 19)) && (1 <= (floordiv(blockIdx.x_2, 56) + ry.outer))) && ((floordiv(blockIdx.x_2, 56) + ry.outer) < 57)) && (1 <= (rx.outer + floormod(blockIdx.x_2, 56)))) && ((rx.outer + floormod(blockIdx.x_2, 56)) < 57)), (int8x4*)packed_data[ramp((((((((ic_chunk.outer*225792) + (blockIdx.y*12544)) + (rz.outer*12544)) + (ry.outer*224)) + (blockIdx.x_2*4)) + (rx.outer*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4)
              attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
              attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
              attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
              packed_kernel.shared[ramp(0, 1, 4)] = (int8x4*)packed_kernel[ramp(((((((floordiv(blockIdx.z, 4)*13824) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(blockIdx.z, 4)*4)), 1, 4)]
              compute[0] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(0, 1, 4)], (int8x4*)packed_kernel.shared[ramp(0, 1, 4)], (int32*)compute[0], dtype=int32)
            }
          }
        }
      }
      output_unpack_2[(((blockIdx.z*56448) + (blockIdx.y*3136)) + blockIdx.x_2)] = (int32*)compute[0]
    }
  }
}

#[metadata]
{
  "root": 1, 
  "nodes": [
    {
      "type_key": ""
    }, 
    {
      "type_key": "Map", 
      "keys": [
        "IntImm"
      ], 
      "data": [2]
    }, 
    {
      "type_key": "Array", 
      "data": [3]
    }, 
    {
      "type_key": "IntImm", 
      "attrs": {
        "dtype": "bool", 
        "value": "1"
      }
    }
  ], 
  "b64ndarrays": [], 
  "attrs": {"tvm_version": "0.8.dev0"}
}

After tuning, I got:

#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []),
             W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
             A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
  buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
  attr [packed_data: Pointer(int8)] "storage_scope" = "global";
  allocate(packed_data, int8, [7225344]);
  attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
  allocate(packed_kernel, int8, [442368]) {
    attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
    attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
    for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
      if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
        if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) {
          packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
        }
      }
    }
    attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;
    attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
    for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) {
      if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
        if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
          if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
            packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
          }
        }
      }
    }
    attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 8;
    attr [compute: Pointer(int32)] "storage_scope" = "local";
    allocate(compute, int32, [(((floordiv(((threadIdx.z: int32*2) + 1), 4)*32) + 32) - (floordiv(threadIdx.z, 2)*32))]);
    attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
    allocate(pad_data.shared, int8x4, [56]);
    attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
    allocate(packed_kernel.shared, int8x4, [28]);
    attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 9;
    attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112;
    attr [IterVar(threadIdx.z, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
    attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
    attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
      for (oc_chunk.init: int32, 0, ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2))) {
        for (zz.init: int32, 0, 2) "unroll" {
          for (yy.init: int32, 0, 2) "unroll" {
            for (oc_block.init: int32, 0, 4) "unroll" {
              compute[((((oc_chunk.init*16) + (zz.init*8)) + (yy.init*4)) + oc_block.init)] = 0
              compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk.init*16)) + (zz.init*8)) + (yy.init*4)) + oc_block.init) + 16) - (floordiv(threadIdx.z, 2)*16))] = 0
            }
          }
        }
      }
      for (rz.outer: int32, 0, 3) {
        for (ry.outer: int32, 0, 3) {
          for (ic_chunk.outer: int32, 0, 32) {
            for (rx.outer: int32, 0, 3) {
              attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
              attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
              attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
              pad_data.shared[ramp(((threadIdx.z_1*28) + (threadIdx.y_1*4)), 1, 4)] = @tir.if_then_else(((((((1 <= (((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer)) && ((((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer) < 19)) && (1 <= (((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer))) && ((((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer) < 57)) && (1 <= (((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)))) && ((((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)) < 57)), (int8x4*)packed_data[ramp((((((((((((ic_chunk.outer*225792) + (blockIdx.y*25088)) + (floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)*12544)) + (rz.outer*12544)) + (floordiv(blockIdx.x_2, 28)*3136)) + (floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)*224)) + (ry.outer*224)) + (floormod(blockIdx.x_2, 28)*8)) + (rx.outer*4)) + (floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4)
              attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8;
              attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7;
              attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
              if (((threadIdx.z_2*7) + threadIdx.y_2) < 28) {
                if (threadIdx.z_2 < 4) {
                  if (((blockIdx.z*4) + floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)) < 32) {
                    packed_kernel.shared[ramp(((threadIdx.z_2*28) + (threadIdx.y_2*4)), 1, 4)] = (int8x4*)packed_kernel[ramp((((((((blockIdx.z*55296) + (floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)*13824)) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(((threadIdx.z_2*7) + threadIdx.y_2), 4)*4)), 1, 4)]
                  }
                }
              }
              for (oc_chunk: int32, 0, min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) {
                for (zz: int32, 0, 2) "unroll" {
                  for (yy: int32, 0, 2) "unroll" {
                    for (oc_block: int32, 0, 4) "unroll" {
                      compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz*112) + (threadIdx.y*16)) + (yy*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)], dtype=int32)
                      compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz*112) + (threadIdx.y*16)) + (yy*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32)
                    }
                  }
                }
              }
              for (oc_chunk_1: int32, 0, (max(((((blockIdx.z*4) + floordiv(((threadIdx.z*2) + 1), 4)) - floordiv(threadIdx.z, 2)) - 29), -1) + 1)) {
                for (zz_1: int32, 0, 2) "unroll" {
                  for (yy_1: int32, 0, 2) "unroll" {
                    for (oc_block_1: int32, 0, 4) "unroll" {
                      if (((((blockIdx.z*4) + floordiv(threadIdx.z, 2)) + min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) + oc_chunk_1) < 32) {
                        compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)], dtype=int32)
                        compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32)
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (c.inner.inner.inner: int32, 0, 2) "unroll" {
        for (z.inner.inner.inner: int32, 0, 2) "unroll" {
          for (h.inner.inner.inner: int32, 0, 2) "unroll" {
            output_unpack_2[(((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2))] = (int32*)compute[(((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) - (floordiv(threadIdx.z, 2)*16))]
            output_unpack_2[((((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2)) + 1)] = (int32*)compute[(((((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (floordiv(((threadIdx.z*2) + 1), 4)*16)) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) + 16) - (floordiv(threadIdx.z, 2)*32))]
          }
        }
      }
    }
  }
}

#[metadata]
{
  "root": 1, 
  "nodes": [
    {
      "type_key": ""
    }, 
    {
      "type_key": "Map", 
      "keys": [
        "IntImm"
      ], 
      "data": [2]
    }, 
    {
      "type_key": "Array", 
      "data": [3]
    }, 
    {
      "type_key": "IntImm", 
      "attrs": {
        "dtype": "bool", 
        "value": "1"
      }
    }
  ], 
  "b64ndarrays": [], 
  "attrs": {"tvm_version": "0.8.dev0"}
}