Hello.
When i create conv2d using TOPI causes problems in the OpenCL codegen process in mali gpu. it generates code such as vstore4 (float44).
__kernel void default_function_kernel1(__global float4* restrict kernel_vec, __global void* restrict P1) {
int4 _1 = (int4)(((((((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) / 576) * 2304) + (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) % 576)))+(576*0), ((((((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) / 576) * 2304) + (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) % 576)))+(576*1), ((((((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) / 576) * 2304) + (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) % 576)))+(576*2), ((((((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) / 576) * 2304) + (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) % 576)))+(576*3));
vstore4(((float44)(((__global float*)P1)[_1.s0],((__global float*)P1)[_1.s1],((__global float*)P1)[_1.s2],((__global float*)P1)[_1.s3])), 0, (__global float*)kernel_vec + ((((int)get_group_id(0)) * 1024) + (((int)get_local_id(0)) * 4)));
}
I think that code(float44) is not intended and When i running the module using the above code, the following error occurs.
File "zz.py", line 38, in <module>
mod(tvm_input,tvm_p1)
File "/home/firefly/Desktop/TVM/tvm/python/tvm/runtime/module.py", line 110, in __call__
return self.entry_func(*args)
File "/home/firefly/Desktop/TVM/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 219, in __call__
raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (1) /home/firefly/Desktop/TVM/tvm/build/libtvm.so(TVMFuncCall+0x70) [0x7fa4ae46c0]
[bt] (0) /home/firefly/Desktop/TVM/tvm/build/libtvm.so(+0x1462080) [0x7fa4af6080]
^
vstore4(((float44)(((__global float*)P1)[_1.s0],((__global float*)P1)[_1.s1],((__global float*)P1)[_1.s2],((__global float*)P1)[_1.s3])), 0, (__global float*)kernel_vec + ((((int)get_group_id(0)) * 1024) + (((int)get_local_id(0)) * 4)));
File "/home/firefly/Desktop/TVM/tvm/src/runtime/opencl/opencl_module.cc", line 234
File "/home/firefly/Desktop/TVM/tvm/src/runtime/library_module.cc", line 89
TVMError: Check failed: ret == 0 (-1 vs. 0) : OpenCL build error for device=0x7fa36117f0<source>:3:13: error: use of undeclared identifier 'float44'
error: Compiler frontend failed (error code 59)
terminate called after throwing an instance of 'dmlc::Error'
what(): [12:18:56] /home/firefly/Desktop/TVM/tvm/src/runtime/workspace_pool.cc:115: Check failed: allocated_.size() == 1 (4 vs. 1) :
Stack trace:
[bt] (0) /home/firefly/Desktop/TVM/tvm/build/libtvm.so(tvm::runtime::WorkspacePool::Pool::Release(DLContext, tvm::runtime::DeviceAPI*)+0x4d0) [0x7fa4b34d08]
[bt] (1) /home/firefly/Desktop/TVM/tvm/build/libtvm.so(tvm::runtime::WorkspacePool::~WorkspacePool()+0x48) [0x7fa4b336f8]
[bt] (2) /home/firefly/Desktop/TVM/tvm/build/libtvm.so(tvm::runtime::cl::OpenCLThreadEntry::~OpenCLThreadEntry()+0x18) [0x7fa4b74650]
[bt] (3) /lib/aarch64-linux-gnu/libc.so.6(__call_tls_dtors+0x48) [0x7fa953e620]
The problem is that opencl code is generated normally in the case of the old version, but the current version gives the above error.
I think the problem is occured when TVM generates OpenCL code using Mali GPU. Is this an internal TVM issue?
In addition, the above error occurs when running the code below in the current version of TVM.
import tvm
import topi
import numpy as np
target = tvm.target.create('opencl -device=mali')
target_host = 'llvm -target=aarch64-linux-gnu'
ctx = tvm.runtime.opencl(0)
dtype='float32'
## Setting shape
input_size = (1,64,224,224)
p1_size = (64,64,3,3)
## Make Placeholder
input_data = tvm.te.placeholder( shape = input_size , dtype = "float32", name="Input" )
param1 = tvm.te.placeholder( shape= p1_size , dtype = "float32", name="P1" )
## Build Module
with tvm.target.mali():
conv = topi.mali.conv2d_nchw_spatial_pack( input_data
,param1
,[1,1]
,[1,1,1,1]
,[1,1]
,"float32" )
sch = topi.mali.schedule_conv2d_nchw_spatial_pack([conv])
mod = tvm.build(sch, [input_data,param1] , target, target_host)
data = np.random.uniform(-1,1, size=input_size ).astype("float32")
p1 = np.random.uniform(-1,1,size=p1_size ).astype("float32")
tvm_input = tvm.nd.array( data , ctx )
tvm_p1 = tvm.nd.array( p1, ctx )
## Running
mod(tvm_input,tvm_p1)
ctx.sync()