Hi!
I’m trying to use TVM autoscheduler to generate cuda source code and use it elsewhere. But I don’t know how to get the shape of some tensors built by TVM.
I successfully get the source code of my workload, but the result is in the form of 3 different kernels.(it looks like this)
extern "C" __global__ void __launch_bounds__(32) myop_kernel2(float* __restrict__ output_node, float* __restrict__ extracted_reduction) {...}
extern "C" __global__ void __launch_bounds__(240) myop_kernel0(float* __restrict__ input_node, float* __restrict__ mid_node) {...}
extern "C" __global__ void __launch_bounds__(32) myop_kernel1(float* __restrict__ extracted_reduction, float* __restrict__ mid_node) {...}
To launch these kernels, the shapes of extracted_reduction
and mid_node
are necessary(When call cudaMalloc). But I have no idea how to get these information.
It seems that such a module contains an llvm module and some cuda modules, and the llvm module is build with an IRmodule. But I can’t find any useful tool to get these information from either llvm module nor IRmodule.
Here is how I tried to solve this problem myself(but failed):
I have tried to read the llvm source code of the same module, it looks like this.(here are the first part and last part of this llvm module)
%0 = type { double }
%1 = type { i8*, %2, i32, %3, i64*, i64*, i64 }
; Function Attrs: noinline
define internal fastcc i32 @myop_compute_(%0* noalias, i8* noalias align 128, i32* noalias, i8* noalias align 128, i32) unnamed_addr #1 {
entry:
%5 = alloca i8*, align 8
%6 = alloca i8*, align 8
... ...
%52 = load i8*, i8** @.tvm_func.myop_kernel2, align 8
... ...
handle_init_end16: ; preds = %call_end14, %call_end18
%58 = phi i8* [ %52, %call_end14 ], [ %62, %call_end18 ]
%59 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !tbaa !20
%60 = call i32 %59(i8* %58, %0* nonnull %0, i32* nonnull %2, i32 4, %0* nonnull %20, i32* nonnull %21)
It seemes that this llvm module loads myop_kernel2
into %52 and %58. It calls __TVMFuncCall
and uses %58 as first argument. I read the definition of TVMFuncCall
int TVMFuncCall(TVMFunctionHandle func, TVMValue* args, int* arg_type_codes, int num_args, TVMValue* ret_val, int* ret_type_code)
It seems that the second argument of TVMFuncCall TVMValue* args
should be the argument passed to func(which is @.tvm_func.myop_kernel2 here). But the llvm source code uses %0* nonnull %0
as second argument. The %0*
represents double* because of %0 = type { double }
I gauss.
But I don’t know what does nonnull %0
refer to. I guess it refers to the frist argument of myop_compute_
(define i32 @myop_compute_(%0* noalias, ...
, does it mean %0* noalias %0
by default? )
If that is correct, %0 comes from .sub
in the caller of myop_compute_
.
define dllexport i32 @myop(i8* noalias nocapture readonly, i8* noalias nocapture readonly, i32, i8* noalias nocapture readnone, i8* noalias nocapture readnone, i8* noalias nocapture readnone) local_unnamed_addr !dbg !5 {
entry:
... ...
%7 = alloca [7 x %0], align 8, !dbg !18
%.sub = getelementptr inbounds [7 x %0], [7 x %0]* %7, i64 0, i64 0
... ...
%172 = load i8*, i8** @.tvm_func.__tvm_set_device, align 8, !dbg !18
... ...
%181 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !dbg !18, !tbaa !20
%182 = call i32 %181(i8* %172, %0* nonnull %.sub, i32* nonnull %.sub56, i32 2, %0* nonnull %179, i32* nonnull %180), !dbg !18
... ...
%185 = call fastcc i32 @myop_compute_(%0* nonnull %.sub, i8* %30, i32* nonnull %.sub56, i8* %22, i32 %28), !dbg !18
So .sub
should be a [7 * double*] array allocaed locally and initialized by @.tvm_func.__tvm_set_device
(maybe?) But I can’t get anything useful in __tvm_set_device
so I have no idea how to reach my goal now.
complete llvm source code is below
%0 = type { double }
%1 = type { i8*, %2, i32, %3, i64*, i64*, i64 }
%2 = type { i32, i32 }
%3 = type { i8, i8, i16 }
@.str = private constant [140 x i8] c"Assert fail: (num_args == 2), winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0: num_args should be 2\00", align 1
@.tvm_func.__tvm_set_device = internal unnamed_addr global i8* null, align 8
@.str.22 = private constant [17 x i8] c"__tvm_set_device\00", align 1
@__TVMBackendAllocWorkspace = linkonce dllexport local_unnamed_addr global i8* (i32, i32, i64, i32, i32)* null, align 8
@.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel0 = internal unnamed_addr global i8* null, align 8
@.str.23 = private constant [96 x i8] c"winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel0\00", align 1
@.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel1 = internal unnamed_addr global i8* null, align 8
@.str.24 = private constant [96 x i8] c"winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel1\00", align 1
@.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel2 = internal unnamed_addr global i8* null, align 8
@.str.25 = private constant [96 x i8] c"winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel2\00", align 1
@__TVMBackendFreeWorkspace = linkonce dllexport local_unnamed_addr global i32 (i32, i32, i8*)* null, align 8
@tvm_main = weak dllexport local_unnamed_addr constant [88 x i8] c"winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0\00", align 1
@llvm.global_ctors = appending global [0 x { i32, void (), i8 }] zeroinitializer
define dllexport i32 @winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0(i8* noalias nocapture readonly, i8* noalias nocapture readonly, i32, i8* noalias nocapture readnone, i8* noalias nocapture readnone, i8* noalias nocapture readnone) local_unnamed_addr !dbg !5 {
entry:
call void @llvm.dbg.value(metadata i8* %0, metadata !12, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.value(metadata i8* %1, metadata !13, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.value(metadata i32 %2, metadata !14, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.value(metadata i8* %3, metadata !15, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.value(metadata i8* %4, metadata !16, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.value(metadata i8* %5, metadata !17, metadata !DIExpression()), !dbg !18
%6 = alloca i8*, align 8, !dbg !18
%7 = alloca [7 x %0], align 8, !dbg !18
%8 = alloca [7 x i32], align 4, !dbg !18
%.sub56 = getelementptr inbounds [7 x i32], [7 x i32]* %8, i64 0, i64 0
%.sub = getelementptr inbounds [7 x %0], [7 x %0]* %7, i64 0, i64 0
%9 = icmp eq i32 %2, 2, !dbg !18
br i1 %9, label %assert_end, label %assert_fail, !dbg !18, !prof !19
assert_fail: ; preds = %entry
%10 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %10(i8* getelementptr inbounds ([140 x i8], [140 x i8]* @.str, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end: ; preds = %entry
%11 = bitcast i8* %0 to %1**, !dbg !18
%12 = load %1*, %1** %11, align 8, !dbg !18
%13 = bitcast i8* %1 to i32*, !dbg !18
%14 = load i32, i32* %13, align 4, !dbg !18, !tbaa !23
%15 = getelementptr inbounds i8, i8* %0, i64 8, !dbg !18
%16 = bitcast i8* %15 to %1**, !dbg !18
%17 = load %1*, %1** %16, align 8, !dbg !18
%18 = getelementptr inbounds i8, i8* %1, i64 4, !dbg !18
%19 = bitcast i8* %18 to i32*, !dbg !18
%20 = load i32, i32* %19, align 4, !dbg !18, !tbaa !37
%21 = getelementptr inbounds %1, %1* %12, i64 0, i32 0, !dbg !18
%22 = load i8*, i8** %21, align 8, !dbg !18
%ptrint = ptrtoint i8* %22 to i64, !dbg !18
%maskedptr = and i64 %ptrint, 127, !dbg !18
%maskcond = icmp eq i64 %maskedptr, 0, !dbg !18
tail call void @llvm.assume(i1 %maskcond), !dbg !18
%23 = getelementptr inbounds %1, %1* %12, i64 0, i32 4, !dbg !18
%24 = load i64*, i64** %23, align 8, !dbg !18
%25 = getelementptr inbounds %1, %1* %12, i64 0, i32 5, !dbg !18
%26 = load i64*, i64** %25, align 8, !dbg !18
%27 = getelementptr inbounds %1, %1* %12, i64 0, i32 1, i32 1, !dbg !18
%28 = load i32, i32* %27, align 4, !dbg !18
%29 = getelementptr inbounds %1, %1* %17, i64 0, i32 0, !dbg !18
%30 = load i8*, i8** %29, align 8, !dbg !18
%ptrint1 = ptrtoint i8* %30 to i64, !dbg !18
%maskedptr2 = and i64 %ptrint1, 127, !dbg !18
%maskcond3 = icmp eq i64 %maskedptr2, 0, !dbg !18
tail call void @llvm.assume(i1 %maskcond3), !dbg !18
%31 = getelementptr inbounds %1, %1* %17, i64 0, i32 4, !dbg !18
%32 = load i64*, i64** %31, align 8, !dbg !18
%33 = getelementptr inbounds %1, %1* %17, i64 0, i32 5, !dbg !18
%34 = load i64*, i64** %33, align 8, !dbg !18
switch i32 %14, label %assert_fail4 [
i32 13, label %assert_end5
i32 7, label %assert_end5
i32 4, label %assert_end5
i32 3, label %assert_end5
], !dbg !18
assert_fail4: ; preds = %assert_end
%35 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %35(i8* getelementptr inbounds ([215 x i8], [215 x i8]* @.str.1, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end5: ; preds = %assert_end, %assert_end, %assert_end, %assert_end
switch i32 %20, label %assert_fail6 [
i32 13, label %assert_end7
i32 7, label %assert_end7
i32 4, label %assert_end7
i32 3, label %assert_end7
], !dbg !18
assert_fail6: ; preds = %assert_end5
%36 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %36(i8* getelementptr inbounds ([215 x i8], [215 x i8]* @.str.2, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end7: ; preds = %assert_end5, %assert_end5, %assert_end5, %assert_end5
%37 = getelementptr inbounds %1, %1* %12, i64 0, i32 2, !dbg !18
%38 = load i32, i32* %37, align 4, !dbg !18
%39 = icmp eq i32 %38, 4, !dbg !18
br i1 %39, label %assert_end11, label %assert_fail8, !dbg !18, !prof !19
assert_fail8: ; preds = %assert_end7
%40 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %40(i8* getelementptr inbounds ([85 x i8], [85 x i8]* @.str.3, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end11: ; preds = %assert_end7
%41 = getelementptr inbounds %1, %1* %12, i64 0, i32 3, i32 2, !dbg !18
%42 = load i16, i16* %41, align 2, !dbg !18
%43 = icmp eq i16 %42, 1, !dbg !18
%44 = getelementptr inbounds %1, %1* %12, i64 0, i32 3, i32 1, !dbg !18
%45 = load i8, i8* %44, align 1, !dbg !18
%46 = icmp eq i8 %45, 32, !dbg !18
%47 = getelementptr inbounds %1, %1* %12, i64 0, i32 3, i32 0, !dbg !18
%48 = load i8, i8* %47, align 1, !dbg !18
%49 = icmp eq i8 %48, 2, !dbg !18
%50 = and i1 %46, %49, !dbg !18
%51 = and i1 %43, %50, !dbg !18
br i1 %51, label %assert_end13, label %assert_fail12, !dbg !18, !prof !19
assert_fail12: ; preds = %assert_end11
%52 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %52(i8* getelementptr inbounds ([198 x i8], [198 x i8]* @.str.4, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end13: ; preds = %assert_end11
%53 = load i64, i64* %24, align 8, !dbg !18, !tbaa !39
%54 = trunc i64 %53 to i32, !dbg !18
%55 = icmp eq i32 %54, 5, !dbg !18
br i1 %55, label %assert_end15, label %assert_fail14, !dbg !18, !prof !19
assert_fail14: ; preds = %assert_end13
%56 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %56(i8* getelementptr inbounds ([124 x i8], [124 x i8]* @.str.5, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end15: ; preds = %assert_end13
%57 = getelementptr inbounds i64, i64* %24, i64 1, !dbg !18
%58 = load i64, i64* %57, align 8, !dbg !18, !tbaa !53
%59 = trunc i64 %58 to i32, !dbg !18
%60 = icmp eq i32 %59, 3, !dbg !18
br i1 %60, label %assert_end17, label %assert_fail16, !dbg !18, !prof !19
assert_fail16: ; preds = %assert_end15
%61 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %61(i8* getelementptr inbounds ([124 x i8], [124 x i8]* @.str.6, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end17: ; preds = %assert_end15
%62 = getelementptr inbounds i64, i64* %24, i64 2, !dbg !18
%63 = load i64, i64* %62, align 8, !dbg !18, !tbaa !55
%64 = trunc i64 %63 to i32, !dbg !18
%65 = icmp eq i32 %64, 128, !dbg !18
br i1 %65, label %assert_end19, label %assert_fail18, !dbg !18, !prof !19
assert_fail18: ; preds = %assert_end17
%66 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %66(i8* getelementptr inbounds ([128 x i8], [128 x i8]* @.str.7, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end19: ; preds = %assert_end17
%67 = getelementptr inbounds i64, i64* %24, i64 3, !dbg !18
%68 = load i64, i64* %67, align 8, !dbg !18, !tbaa !58
%69 = trunc i64 %68 to i32, !dbg !18
%70 = icmp eq i32 %69, 128, !dbg !18
br i1 %70, label %assert_end21, label %assert_fail20, !dbg !18, !prof !19
assert_fail20: ; preds = %assert_end19
%71 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %71(i8* getelementptr inbounds ([128 x i8], [128 x i8]* @.str.8, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end21: ; preds = %assert_end19
%72 = icmp eq i64* %26, null, !dbg !18
br i1 %72, label %if_end, label %if_then, !dbg !18, !prof !60
if_then: ; preds = %assert_end21
%73 = load i64, i64* %26, align 8, !dbg !18, !tbaa !61
%74 = trunc i64 %73 to i32, !dbg !18
%75 = icmp eq i32 %74, 49152, !dbg !18
%76 = getelementptr inbounds i64, i64* %26, i64 1, !dbg !18
%77 = load i64, i64* %76, align 8, !dbg !18, !tbaa !75
%78 = trunc i64 %77 to i32, !dbg !18
%79 = icmp eq i32 %78, 16384, !dbg !18
%80 = getelementptr inbounds i64, i64* %26, i64 2, !dbg !18
%81 = load i64, i64* %80, align 8, !dbg !18, !tbaa !77
%82 = trunc i64 %81 to i32, !dbg !18
%83 = icmp eq i32 %82, 128, !dbg !18
%84 = getelementptr inbounds i64, i64* %26, i64 3, !dbg !18
%85 = load i64, i64* %84, align 8, !dbg !18, !tbaa !80
%86 = trunc i64 %85 to i32, !dbg !18
%87 = icmp eq i32 %86, 1, !dbg !18
%88 = and i1 %83, %87, !dbg !18
%89 = and i1 %79, %88, !dbg !18
%90 = and i1 %75, %89, !dbg !18
br i1 %90, label %if_end, label %assert_fail22, !dbg !18, !prof !19
if_end: ; preds = %assert_end21, %if_then
%91 = getelementptr inbounds %1, %1* %12, i64 0, i32 6, !dbg !18
%92 = load i64, i64* %91, align 8, !dbg !18
%93 = icmp eq i64 %92, 0, !dbg !18
br i1 %93, label %assert_end25, label %assert_fail24, !dbg !18, !prof !19
assert_fail22: ; preds = %if_then
%94 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %94(i8* getelementptr inbounds ([202 x i8], [202 x i8]* @.str.9, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_fail24: ; preds = %if_end
%95 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %95(i8* getelementptr inbounds ([163 x i8], [163 x i8]* @.str.10, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end25: ; preds = %if_end
%96 = getelementptr inbounds %1, %1* %12, i64 0, i32 1, i32 0, !dbg !18
%97 = load i32, i32* %96, align 4, !dbg !18
%98 = icmp eq i32 %97, 2, !dbg !18
br i1 %98, label %assert_end27, label %assert_fail26, !dbg !18, !prof !19
assert_fail26: ; preds = %assert_end25
%99 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %99(i8* getelementptr inbounds ([149 x i8], [149 x i8]* @.str.11, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end27: ; preds = %assert_end25
%100 = getelementptr inbounds %1, %1* %17, i64 0, i32 2, !dbg !18
%101 = load i32, i32* %100, align 4, !dbg !18
%102 = icmp eq i32 %101, 4, !dbg !18
br i1 %102, label %assert_end31, label %assert_fail28, !dbg !18, !prof !19
assert_fail28: ; preds = %assert_end27
%103 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %103(i8* getelementptr inbounds ([85 x i8], [85 x i8]* @.str.12, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end31: ; preds = %assert_end27
%104 = getelementptr inbounds %1, %1* %17, i64 0, i32 3, i32 2, !dbg !18
%105 = load i16, i16* %104, align 2, !dbg !18
%106 = icmp eq i16 %105, 1, !dbg !18
%107 = getelementptr inbounds %1, %1* %17, i64 0, i32 3, i32 1, !dbg !18
%108 = load i8, i8* %107, align 1, !dbg !18
%109 = icmp eq i8 %108, 32, !dbg !18
%110 = getelementptr inbounds %1, %1* %17, i64 0, i32 3, i32 0, !dbg !18
%111 = load i8, i8* %110, align 1, !dbg !18
%112 = icmp eq i8 %111, 2, !dbg !18
%113 = and i1 %109, %112, !dbg !18
%114 = and i1 %106, %113, !dbg !18
br i1 %114, label %assert_end33, label %assert_fail32, !dbg !18, !prof !19
assert_fail32: ; preds = %assert_end31
%115 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %115(i8* getelementptr inbounds ([198 x i8], [198 x i8]* @.str.13, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end33: ; preds = %assert_end31
%116 = load i64, i64* %32, align 8, !dbg !18, !tbaa !82
%117 = trunc i64 %116 to i32, !dbg !18
%118 = icmp eq i32 %117, 6, !dbg !18
br i1 %118, label %assert_end35, label %assert_fail34, !dbg !18, !prof !19
assert_fail34: ; preds = %assert_end33
%119 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %119(i8* getelementptr inbounds ([124 x i8], [124 x i8]* @.str.14, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end35: ; preds = %assert_end33
%120 = getelementptr inbounds i64, i64* %32, i64 1, !dbg !18
%121 = load i64, i64* %120, align 8, !dbg !18, !tbaa !96
%122 = trunc i64 %121 to i32, !dbg !18
%123 = icmp eq i32 %122, 6, !dbg !18
br i1 %123, label %assert_end37, label %assert_fail36, !dbg !18, !prof !19
assert_fail36: ; preds = %assert_end35
%124 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %124(i8* getelementptr inbounds ([124 x i8], [124 x i8]* @.str.15, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end37: ; preds = %assert_end35
%125 = getelementptr inbounds i64, i64* %32, i64 2, !dbg !18
%126 = load i64, i64* %125, align 8, !dbg !18, !tbaa !98
%127 = trunc i64 %126 to i32, !dbg !18
%128 = icmp eq i32 %127, 3, !dbg !18
br i1 %128, label %assert_end39, label %assert_fail38, !dbg !18, !prof !19
assert_fail38: ; preds = %assert_end37
%129 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %129(i8* getelementptr inbounds ([124 x i8], [124 x i8]* @.str.16, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end39: ; preds = %assert_end37
%130 = getelementptr inbounds i64, i64* %32, i64 3, !dbg !18
%131 = load i64, i64* %130, align 8, !dbg !18, !tbaa !101
%132 = trunc i64 %131 to i32, !dbg !18
%133 = icmp eq i32 %132, 5120, !dbg !18
br i1 %133, label %assert_end41, label %assert_fail40, !dbg !18, !prof !19
assert_fail40: ; preds = %assert_end39
%134 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %134(i8* getelementptr inbounds ([130 x i8], [130 x i8]* @.str.17, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end41: ; preds = %assert_end39
%135 = icmp eq i64* %34, null, !dbg !18
br i1 %135, label %if_end43, label %if_then42, !dbg !18, !prof !60
if_then42: ; preds = %assert_end41
%136 = load i64, i64* %34, align 8, !dbg !18, !tbaa !103
%137 = trunc i64 %136 to i32, !dbg !18
%138 = icmp eq i32 %137, 92160, !dbg !18
%139 = getelementptr inbounds i64, i64* %34, i64 1, !dbg !18
%140 = load i64, i64* %139, align 8, !dbg !18, !tbaa !117
%141 = trunc i64 %140 to i32, !dbg !18
%142 = icmp eq i32 %141, 15360, !dbg !18
%143 = getelementptr inbounds i64, i64* %34, i64 2, !dbg !18
%144 = load i64, i64* %143, align 8, !dbg !18, !tbaa !119
%145 = trunc i64 %144 to i32, !dbg !18
%146 = icmp eq i32 %145, 5120, !dbg !18
%147 = getelementptr inbounds i64, i64* %34, i64 3, !dbg !18
%148 = load i64, i64* %147, align 8, !dbg !18, !tbaa !122
%149 = trunc i64 %148 to i32, !dbg !18
%150 = icmp eq i32 %149, 1, !dbg !18
%151 = and i1 %146, %150, !dbg !18
%152 = and i1 %142, %151, !dbg !18
%153 = and i1 %138, %152, !dbg !18
br i1 %153, label %if_end43, label %assert_fail44, !dbg !18, !prof !19
if_end43: ; preds = %assert_end41, %if_then42
%154 = getelementptr inbounds %1, %1* %17, i64 0, i32 6, !dbg !18
%155 = load i64, i64* %154, align 8, !dbg !18
%156 = icmp eq i64 %155, 0, !dbg !18
br i1 %156, label %assert_end47, label %assert_fail46, !dbg !18, !prof !19
assert_fail44: ; preds = %if_then42
%157 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %157(i8* getelementptr inbounds ([203 x i8], [203 x i8]* @.str.18, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_fail46: ; preds = %if_end43
%158 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %158(i8* getelementptr inbounds ([163 x i8], [163 x i8]* @.str.19, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end47: ; preds = %if_end43
%159 = getelementptr inbounds %1, %1* %17, i64 0, i32 1, i32 0, !dbg !18
%160 = load i32, i32* %159, align 4, !dbg !18
%161 = icmp eq i32 %160, 2, !dbg !18
br i1 %161, label %assert_end49, label %assert_fail48, !dbg !18, !prof !19
assert_fail48: ; preds = %assert_end47
%162 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %162(i8* getelementptr inbounds ([149 x i8], [149 x i8]* @.str.20, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end49: ; preds = %assert_end47
%163 = getelementptr inbounds %1, %1* %17, i64 0, i32 1, i32 1, !dbg !18
%164 = load i32, i32* %163, align 4, !dbg !18
%165 = icmp eq i32 %28, %164, !dbg !18
br i1 %165, label %assert_end51, label %assert_fail50, !dbg !18, !prof !19
assert_fail50: ; preds = %assert_end49
%166 = load void (i8*)*, void (i8*)** @__TVMAPISetLastError, align 8, !dbg !18, !tbaa !20
tail call void %166(i8* getelementptr inbounds ([155 x i8], [155 x i8]* @.str.21, i64 0, i64 0)), !dbg !18
ret i32 -1, !dbg !18
assert_end51: ; preds = %assert_end49
%167 = bitcast [7 x %0]* %7 to i64*, !dbg !18
store i64 2, i64* %167, align 8, !dbg !18
store i32 0, i32* %.sub56, align 4, !dbg !18, !tbaa !124
%168 = sext i32 %28 to i64, !dbg !18
%169 = getelementptr inbounds [7 x %0], [7 x %0]* %7, i64 0, i64 1, !dbg !18
%170 = bitcast %0* %169 to i64*, !dbg !18
store i64 %168, i64* %170, align 8, !dbg !18
%171 = getelementptr inbounds [7 x i32], [7 x i32]* %8, i64 0, i64 1, !dbg !18
store i32 0, i32* %171, align 4, !dbg !18, !tbaa !138
%172 = load i8*, i8** @.tvm_func.__tvm_set_device, align 8, !dbg !18
%173 = icmp eq i8* %172, null, !dbg !18
br i1 %173, label %handle_init, label %handle_init_end, !dbg !18, !prof !60
handle_init: ; preds = %assert_end51
%174 = load i8*, i8** @__tvm_module_ctx, align 8, !dbg !18, !tbaa !20
%175 = load i32 (i8*, i8*, i8**)*, i32 (i8*, i8*, i8**)** @__TVMBackendGetFuncFromEnv, align 8, !dbg !18, !tbaa !20
%176 = call i32 %175(i8* %174, i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str.22, i64 0, i64 0), i8** nonnull %6), !dbg !18
%177 = icmp eq i32 %176, 0, !dbg !18
br i1 %177, label %call_end, label %call_fail, !dbg !18, !prof !19
handle_init_end: ; preds = %assert_end51, %call_end
%178 = phi i8* [ %172, %assert_end51 ], [ %184, %call_end ], !dbg !18
%179 = getelementptr inbounds [7 x %0], [7 x %0]* %7, i64 0, i64 2, !dbg !18
%180 = getelementptr inbounds [7 x i32], [7 x i32]* %8, i64 0, i64 2, !dbg !18
%181 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !dbg !18, !tbaa !20
%182 = call i32 %181(i8* %178, %0* nonnull %.sub, i32* nonnull %.sub56, i32 2, %0* nonnull %179, i32* nonnull %180), !dbg !18
%183 = icmp eq i32 %182, 0, !dbg !18
br i1 %183, label %call_end53, label %call_fail, !dbg !18, !prof !19
call_fail: ; preds = %call_end53, %handle_init_end, %handle_init
%merge = phi i32 [ %176, %handle_init ], [ %182, %handle_init_end ], [ %185, %call_end53 ]
ret i32 %merge, !dbg !18
call_end: ; preds = %handle_init
%184 = load i8*, i8** %6, align 8, !dbg !18
store i8* %184, i8** @.tvm_func.__tvm_set_device, align 8, !dbg !18
br label %handle_init_end, !dbg !18
call_end53: ; preds = %handle_init_end
%185 = call fastcc i32 @winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_compute_(%0* nonnull %.sub, i8* %30, i32* nonnull %.sub56, i8* %22, i32 %28), !dbg !18
br label %call_fail
}
; Function Attrs: nounwind
declare void @llvm.assume(i1) #0
; Function Attrs: noinline
define internal fastcc i32 @winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_compute_(%0* noalias, i8* noalias align 128, i32* noalias, i8* noalias align 128, i32) unnamed_addr #1 {
entry:
%5 = alloca i8*, align 8
%6 = alloca i8*, align 8
%7 = alloca i8*, align 8
%8 = load i8* (i32, i32, i64, i32, i32)*, i8* (i32, i32, i64, i32, i32)** @__TVMBackendAllocWorkspace, align 8, !tbaa !20
%9 = tail call i8* %8(i32 2, i32 %4, i64 2211840, i32 2, i32 32)
%ptrint = ptrtoint i8* %9 to i64
%maskedptr = and i64 %ptrint, 127
%maskcond = icmp eq i64 %maskedptr, 0
tail call void @llvm.assume(i1 %maskcond)
%10 = icmp eq i8* %9, null
br i1 %10, label %if_then, label %if_end, !prof !19
if_then: ; preds = %call_end20, %if_end22, %handle_init_end16, %handle_init15, %handle_init_end10, %handle_init9, %handle_init_end, %handle_init, %if_end, %entry
%merge = phi i32 [ -1, %if_end ], [ -1, %entry ], [ %28, %handle_init ], [ %34, %handle_init_end ], [ %43, %handle_init9 ], [ %47, %handle_init_end10 ], [ %56, %handle_init15 ], [ %60, %handle_init_end16 ], [ -1, %call_end20 ], [ %., %if_end22 ]
ret i32 %merge
if_end: ; preds = %entry
%11 = load i8* (i32, i32, i64, i32, i32)*, i8* (i32, i32, i64, i32, i32)** @__TVMBackendAllocWorkspace, align 8, !tbaa !20
%12 = tail call i8* %11(i32 2, i32 %4, i64 983040, i32 2, i32 32)
%ptrint1 = ptrtoint i8* %12 to i64
%maskedptr2 = and i64 %ptrint1, 127
%maskcond3 = icmp eq i64 %maskedptr2, 0
tail call void @llvm.assume(i1 %maskcond3)
%13 = icmp eq i8* %12, null
br i1 %13, label %if_then, label %if_end5, !prof !19
if_end5: ; preds = %if_end
%14 = bitcast %0* %0 to i8**
store i8* %1, i8** %14, align 8
%15 = getelementptr inbounds %0, %0* %0, i64 1
%16 = bitcast %0* %15 to i8**
store i8* %9, i8** %16, align 8
%17 = getelementptr inbounds %0, %0* %0, i64 2
%18 = bitcast %0* %17 to <2 x i64>*
store <2 x i64> <i64 128, i64 240>, <2 x i64>* %18, align 8
%19 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> <i32 3, i32 3, i32 0, i32 0>, <4 x i32>* %19, align 4, !tbaa !140
%20 = getelementptr inbounds %0, %0* %0, i64 4
%21 = getelementptr inbounds i32, i32* %2, i64 4
store i32 0, i32* %21, align 4, !tbaa !141
%22 = bitcast %0* %20 to <2 x i64>*
store <2 x i64> <i64 240, i64 240>, <2 x i64>* %22, align 8
%23 = getelementptr inbounds i32, i32* %2, i64 5
store i32 0, i32* %23, align 4, !tbaa !145
%24 = load i8*, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel0, align 8
%25 = icmp eq i8* %24, null
br i1 %25, label %handle_init, label %handle_init_end, !prof !60
handle_init: ; preds = %if_end5
%26 = load i8*, i8** @__tvm_module_ctx, align 8, !tbaa !20
%27 = load i32 (i8*, i8*, i8**)*, i32 (i8*, i8*, i8**)** @__TVMBackendGetFuncFromEnv, align 8, !tbaa !20
%28 = call i32 %27(i8* %26, i8* getelementptr inbounds ([96 x i8], [96 x i8]* @.str.23, i64 0, i64 0), i8** nonnull %7)
%29 = icmp eq i32 %28, 0
br i1 %29, label %call_end, label %if_then, !prof !19
handle_init_end: ; preds = %if_end5, %call_end
%30 = phi i8* [ %24, %if_end5 ], [ %36, %call_end ]
%31 = getelementptr inbounds %0, %0* %0, i64 6
%32 = getelementptr inbounds i32, i32* %2, i64 6
%33 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !tbaa !20
%34 = call i32 %33(i8* %30, %0* nonnull %0, i32* nonnull %2, i32 6, %0* nonnull %31, i32* nonnull %32)
%35 = icmp eq i32 %34, 0
br i1 %35, label %call_end8, label %if_then, !prof !19
call_end: ; preds = %handle_init
%36 = load i8*, i8** %7, align 8
store i8* %36, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel0, align 8
br label %handle_init_end
call_end8: ; preds = %handle_init_end
store i8* %12, i8** %14, align 8
store i8* %9, i8** %16, align 8
%37 = bitcast %0* %17 to <2 x i64>*
store <2 x i64> <i64 7680, i64 32>, <2 x i64>* %37, align 8
%38 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> <i32 3, i32 3, i32 0, i32 0>, <4 x i32>* %38, align 4, !tbaa !140
%39 = load i8*, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel1, align 8
%40 = icmp eq i8* %39, null
br i1 %40, label %handle_init9, label %handle_init_end10, !prof !60
handle_init9: ; preds = %call_end8
%41 = load i8*, i8** @__tvm_module_ctx, align 8, !tbaa !20
%42 = load i32 (i8*, i8*, i8**)*, i32 (i8*, i8*, i8**)** @__TVMBackendGetFuncFromEnv, align 8, !tbaa !20
%43 = call i32 %42(i8* %41, i8* getelementptr inbounds ([96 x i8], [96 x i8]* @.str.24, i64 0, i64 0), i8** nonnull %6)
%44 = icmp eq i32 %43, 0
br i1 %44, label %call_end12, label %if_then, !prof !19
handle_init_end10: ; preds = %call_end8, %call_end12
%45 = phi i8* [ %39, %call_end8 ], [ %49, %call_end12 ]
%46 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !tbaa !20
%47 = call i32 %46(i8* %45, %0* nonnull %0, i32* nonnull %2, i32 4, %0* nonnull %20, i32* nonnull %21)
%48 = icmp eq i32 %47, 0
br i1 %48, label %call_end14, label %if_then, !prof !19
call_end12: ; preds = %handle_init9
%49 = load i8*, i8** %6, align 8
store i8* %49, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel1, align 8
br label %handle_init_end10
call_end14: ; preds = %handle_init_end10
store i8* %3, i8** %14, align 8
store i8* %12, i8** %16, align 8
%50 = bitcast %0* %17 to <2 x i64>*
store <2 x i64> <i64 7680, i64 32>, <2 x i64>* %50, align 8
%51 = bitcast i32* %2 to <4 x i32>*
store <4 x i32> <i32 3, i32 3, i32 0, i32 0>, <4 x i32>* %51, align 4, !tbaa !140
%52 = load i8*, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel2, align 8
%53 = icmp eq i8* %52, null
br i1 %53, label %handle_init15, label %handle_init_end16, !prof !60
handle_init15: ; preds = %call_end14
%54 = load i8*, i8** @__tvm_module_ctx, align 8, !tbaa !20
%55 = load i32 (i8*, i8*, i8**)*, i32 (i8*, i8*, i8**)** @__TVMBackendGetFuncFromEnv, align 8, !tbaa !20
%56 = call i32 %55(i8* %54, i8* getelementptr inbounds ([96 x i8], [96 x i8]* @.str.25, i64 0, i64 0), i8** nonnull %5)
%57 = icmp eq i32 %56, 0
br i1 %57, label %call_end18, label %if_then, !prof !19
handle_init_end16: ; preds = %call_end14, %call_end18
%58 = phi i8* [ %52, %call_end14 ], [ %62, %call_end18 ]
%59 = load i32 (i8*, %0*, i32*, i32, %0*, i32*)*, i32 (i8*, %0*, i32*, i32, %0*, i32*)** @__TVMFuncCall, align 8, !tbaa !20
%60 = call i32 %59(i8* %58, %0* nonnull %0, i32* nonnull %2, i32 4, %0* nonnull %20, i32* nonnull %21)
%61 = icmp eq i32 %60, 0
br i1 %61, label %call_end20, label %if_then, !prof !19
call_end18: ; preds = %handle_init15
%62 = load i8*, i8** %5, align 8
store i8* %62, i8** @.tvm_func.winograd_convolution_op_float_i5_3_128_128_w64_3_3_3_o5_64_128_128_ws1_1_wd1_1_p1_1_op0_kernel2, align 8
br label %handle_init_end16
call_end20: ; preds = %handle_init_end16
%63 = load i32 (i32, i32, i8*)*, i32 (i32, i32, i8*)** @__TVMBackendFreeWorkspace, align 8, !tbaa !20
%64 = call i32 %63(i32 2, i32 %4, i8* nonnull %12)
%65 = icmp eq i32 %64, 0
br i1 %65, label %if_end22, label %if_then, !prof !60
if_end22: ; preds = %call_end20
%66 = load i32 (i32, i32, i8*)*, i32 (i32, i32, i8*)** @__TVMBackendFreeWorkspace, align 8, !tbaa !20
%67 = call i32 %66(i32 2, i32 %4, i8* nonnull %9)
%68 = icmp ne i32 %67, 0
%. = sext i1 %68 to i32
br label %if_then
}