TVM generated OpenCL kernels

varunnaw · September 16, 2024, 11:30pm

How did you get the work group sizings from tvm for the opencl target on Adreno GPU?

I saw your samples here: qualcomm/apps/OpenCLCppRunner at master · Deelvin/qualcomm · GitHub

I see that you obtained work group sizings for a tvm generated kernel here: Deelvin tvm generated kernel example

I am trying to do the same thing, to compare tvm kernel performance to my own kernel performance.

Here’s how I get the kernel source complete python script to generate tvm conv kernel

import os

import numpy as np

import tvm
from tvm import relay, autotvm
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.utils import tempdir
import tvm.contrib.graph_executor as runtime

target_str = "opencl"
# target_str = "opencl -device=adreno"
target = tvm.target.Target(target_str, host="llvm -mtriple=aarch64-linux-android")

input_name = "input"
filter_name = "weight"

input_shape=(1, 25, 25, 64)
filter_shape=(3, 3, 64, 96)
filter = np.random.rand(*filter_shape).astype(dtype)

input = tvm.relay.var("input", shape=input_shape, dtype=dtype)
weight = tvm.relay.var("weight", shape=filter_shape, dtype=dtype)
D = relay.nn.conv2d(input, weight, padding=(0, 0), data_layout="NHWC", kernel_layout="HWIO", out_dtype=dtype)

mod = relay.Function([input, weight], D)
params = {
   "weight": tvm.nd.array(filter)
}

with tvm.transform.PassContext(opt_level=3):
   graph, lib, params = relay.build_module.build(mod, target, params=params)

print(lib.imported_modules[0].get_source())

Here is the kernel source output from above python script

// Function: tvmgen_default_fused_nn_conv2d_kernel
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supported by OpenCL implementation on your device." 
#endif

__kernel void tvmgen_default_fused_nn_conv2d_kernel(__global half* restrict conv2d_nhwc, __global half* restrict p0, __global half* restrict p1);
__kernel void tvmgen_default_fused_nn_conv2d_kernel(__global half* restrict conv2d_nhwc, __global half* restrict p0, __global half* restrict p1) {
  half conv2d_nhwc_local[2];
  __local half pad_temp_shared[24];
  __local half p1_shared[256];
  half pad_temp_shared_local[1];
  half p1_shared_local[2];
  for (int yy = 0; yy < 23; ++yy) {
    vstore2(((half2)((half)0.000000e+00f, (half)0.000000e+00f)), 0, conv2d_nhwc_local + 0);
    for (int rc_outer = 0; rc_outer < 4; ++rc_outer) {
      for (int ry = 0; ry < 3; ++ry) {
        for (int rx = 0; rx < 3; ++rx) {
          barrier(CLK_LOCAL_MEM_FENCE);
          pad_temp_shared[(((convert_int(get_local_id(1))) * 4) + (convert_int(get_local_id(0))))] = p0[(((((((yy * 1600) + (ry * 1600)) + ((convert_int(get_group_id(0))) * 64)) + (rx * 64)) + (rc_outer * 16)) + ((convert_int(get_local_id(1))) * 4)) + (convert_int(get_local_id(0))))];
          for (int ax2_ax3_fused_outer_outer_outer = 0; ax2_ax3_fused_outer_outer_outer < 8; ++ax2_ax3_fused_outer_outer_outer) {
            p1_shared[((((ax2_ax3_fused_outer_outer_outer * 32) + (((convert_int(get_local_id(1))) >> 1) * 16)) + (((convert_int(get_local_id(1))) & 1) * 4)) + (convert_int(get_local_id(0))))] = p1[((((((((ry * 18432) + (rx * 6144)) + (rc_outer * 1536)) + (ax2_ax3_fused_outer_outer_outer * 192)) + (((convert_int(get_local_id(1))) >> 1) * 96)) + ((convert_int(get_group_id(2))) * 8)) + (((convert_int(get_local_id(1))) & 1) * 4)) + (convert_int(get_local_id(0))))];
          }
          barrier(CLK_LOCAL_MEM_FENCE);
          for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
            if ((convert_int(get_local_id(1))) < 1) {
              pad_temp_shared_local[0] = pad_temp_shared[(((convert_int(get_local_id(1))) * 24) + rc_inner)];
            }
            for (int ax3 = 0; ax3 < 2; ++ax3) {
              p1_shared_local[ax3] = p1_shared[(((rc_inner * 16) + ((convert_int(get_local_id(0))) * 2)) + ax3)];
            }
            if ((convert_int(get_local_id(1))) < 1) {
              vstore2((vload2(0, conv2d_nhwc_local + 0) + (((half2)(pad_temp_shared_local[0], pad_temp_shared_local[0])) * vload2(0, p1_shared_local + 0))), 0, conv2d_nhwc_local + 0);
            }
          }
        }
      }
    }
    for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) {
      if ((convert_int(get_local_id(1))) < 1) {
        conv2d_nhwc[(((((((convert_int(get_local_id(1))) * 50784) + (yy * 2208)) + ((convert_int(get_group_id(0))) * 96)) + ((convert_int(get_group_id(2))) * 8)) + ((convert_int(get_local_id(0))) * 2)) + ff_outer_inner)] = conv2d_nhwc_local[ff_outer_inner];
      }
    }
  }
}