Generated CUDA Function Input

gkolhe · September 20, 2022, 7:44am

I have used matrix multiplication code from 4. Matrix Multiplication — Dive into Deep Learning Compiler 0.1 documentation (d2l.ai). However, instead of normal matrix multiplication, I have modified it to be a batched matrix multiplication.

The resulting code runs perfectly, but I want to run the resulting cuda code manually in C++ rather than compiling to pytorch function and use it python. To do this, I have been able to get the cuda code as well, but the resulting cuda kernel has many inputs, and I am not sure what would be the correct assignment.

For example, the resulting cuda code for batched multiplication has signature

default_function_kernel0(float* __restrict__ A, float* __restrict__ B, float* __restrict__ R, int d3, int d1, int stride, int bsz, int stride_1, int stride_2, int d2, int stride_3, int stride_4, int stride_5, int stride_6, int stride_7, int stride_8)

Here it’s pretty simple that A, B and C are matrix with d1, d2, d3 being its shape i.e. (m, n, k) and we have batch size. However, I am not sure what strides are. There are 9 of such strides and I have not used them while writing the TVM code.

Is there a way to know, when this function is executed through python what are the values? Because, once I know it, I can use those values and have the function executed on C++ as well.

The TVM code is as follows.

import pandas as pd
from pandas import DataFrame
import time
import tvm
from tvm.contrib import dlpack
from tvm import te
import torch
import numpy as np

b_blocksz, y_blocksz, x_blocksz, t_b, t_y, t_x, t_k = 1, 8, 16, 1, 8, 4, 16

def _compile_function():

    bsz = te.var('bsz') # Batch size
    d1 = te.var('d1')   # D1 -> # of rows of first matrix
    d2 = te.var('d2')   # D2 -> # of columns of first matrix
    d3 = te.var('d3')   # D3 -> # of Columns of first matrix and # of rows of second matrix

    A = te.placeholder((bsz, d1, d3), name='A', dtype='float32')  # first tensor
    B = te.placeholder((bsz, d3, d2), name='B', dtype='float32')  # second tensor

    k = te.reduce_axis((0, d3), name='k')  # dimension to sum over

    output_shape = (bsz, d1, d2)  # shape of the result tensor

    algorithm = lambda l, i, j: te.sum(A[l, i, k] * B[l, k, j], axis=k)
    R = te.compute(output_shape, algorithm, name='R')
    s = te.create_schedule(R.op)

    A_shared = s.cache_read(A, "shared", [R])
    A_local  = s.cache_read(A_shared, "local", [R])
    B_shared = s.cache_read(B, "shared", [R])
    B_local  = s.cache_read(B_shared, "local", [R])
    C_local  = s.cache_write(R, "local")
    

    batch, y, x = s[R].op.axis

    yo, yi           = s[R].split(y      , t_y) 
    yb, yo           = s[R].split(yo     , y_blocksz)
    xo, xi           = s[R].split(x      , t_x) 
    xb, xo           = s[R].split(xo     , x_blocksz)

    s[R].reorder(batch, yb, xb, yo, xo, yi, xi)

    s[R].bind(xb, te.thread_axis("blockIdx.x"))
    s[R].bind(yb, te.thread_axis("blockIdx.y"))
    s[R].bind(xo, te.thread_axis("threadIdx.x"))
    s[R].bind(yo, te.thread_axis("threadIdx.y"))

    s[C_local].compute_at(s[R], xo)
    batch_i, y_i, x_i = s[C_local].op.axis
    k,              = s[C_local].op.reduce_axis
    ko, ki          = s[C_local].split(k, t_k)

    s[C_local].reorder(batch_i, ko, ki, y_i, x_i)


    def optimize_read_cache(shared, local):
        s[shared].compute_at(s[C_local], ko)
        s[local ].compute_at(s[C_local], ki)
        

        b, y, x = s[shared].op.axis

        bo, bi  = s[shared].split(b, nparts=b_blocksz)
        yo, yi  = s[shared].split(y, nparts=y_blocksz)
        xo, xi  = s[shared].split(x, nparts=x_blocksz)
        

        s[shared].reorder(bo, bi, yo, xo, yi, xi)

        s[shared].bind(xo, te.thread_axis("threadIdx.x"))
        s[shared].bind(yo, te.thread_axis("threadIdx.y"))
        s[shared].bind(bo, te.thread_axis("threadIdx.z"))
        


    optimize_read_cache(A_shared, A_local)
    optimize_read_cache(B_shared, B_local)
    print('\n ===================== \n{}'.format(tvm.lower(s, [A, B, R], simple_mode=True)))
    return tvm.build(s, [A, B, R], target='cuda')

if __name__ == "__main__":
    custom_bmm = _compile_function()
    custom_bmm_pytorch = dlpack.to_pytorch_func(custom_bmm)  # wrap it as a pytorch function
    print(custom_bmm.imported_modules[0].get_source()) # to get cuda code