I have a statement like the one below in my operator:
tvm::cast(Int(32), z_), where z_ is a float-valued expression.
The expression works fine for the LLVM (CPU) target.
But when I compile for the OpenCL target, the statement evaluates to a blank right-hand side, like below:
float _30 = ;
NOTE: the statement above is output from the .so code generated after tvm.build.
Can anyone help me solve this issue?
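For reference, here is a minimal sketch of the same cast pattern in the Python API (the names and shapes are illustrative, not from my real operator, and this isolated form alone does not reproduce the failure):

import tvm

n = 16
X = tvm.placeholder((n,), 'float32', 'X')
# z_ stands for some float-valued expression; here it is X[i] * 2.0
Z = tvm.compute((n,), lambda i: (X[i] * 2.0).astype('int32'), name='Z')
s = tvm.create_schedule(Z.op)
f = tvm.build(s, [X, Z], 'llvm')  # builds fine on the LLVM (CPU) target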
Can you post your code so we can reproduce it?
@merrymercy: Thanks for your interest! I am sorry that I am not able to share the complete code here.
But I will try to reproduce the same issue with some dummy code and share it soon.
The issue occurs when I take an index from the output of one tensor and use that index to access particular elements of another tensor.
Below is a test operator that reproduces the issue:
inline tvm::Tensor test_operator(const tvm::Tensor& input,
                                 const tvm::Tensor& input1,
                                 const tvm::Tensor& input2,
                                 std::string name = "test_operator",
                                 std::string tag = kInjective) {
  int h = topi::detail::GetConstInt(input2->shape[1]);
  int w = topi::detail::GetConstInt(input2->shape[2]);
  int bs = topi::detail::GetConstInt(input1->shape[0]);
  int gd = topi::detail::GetConstInt(input1->shape[3]);
  int coeffs_chans = topi::detail::GetConstInt(input1->shape[4]);
  int input_chans = topi::detail::GetConstInt(input->shape[3]);
  int output_chans = coeffs_chans / (input_chans + 1);

  // Flatten the inputs to 1-D so they can be indexed with a single index.
  tvm::Array<tvm::Expr> axes;
  axes.push_back(tvm::make_const(tvm::Int(32), 0));
  Tensor input_ft = cast(squeeze(topi::nn::flatten(input), axes), input->dtype);
  Tensor input1_ft = cast(squeeze(topi::nn::flatten(input1), axes), input->dtype);
  Tensor input2_ft = cast(squeeze(topi::nn::flatten(input2), axes), input->dtype);

  int total_count = bs * h * w * output_chans;

  auto test_operator_kernel = [&](const Array<Var>& indices) {
    auto idx = indices[0];
    // Compute a float value from one tensor, cast it to int32, and use it
    // to index another tensor -- this is the pattern that fails.
    auto x = tvm::cast(Int(32), (idx / output_chans)) % w;
    auto gz = input2_ft[x] * gd;
    auto coeff_sample = Expr(0.0f);
    coeff_sample += input1_ft[tvm::cast(Int(32), gz)];
    return coeff_sample * input_ft[input_chans];
  };

  Array<Expr> out_shape;
  out_shape.push_back(total_count);
  Tensor output = compute(out_shape, test_operator_kernel, name, tag);
  Array<Expr> intermediate_shape = {bs, h, w, output_chans};
  return reshape(output, intermediate_shape);
}
OpenCL output:
__kernel void myadd_kernel0(__global float* restrict tensor, __global float* restrict S, __global float* restrict G, __global float* restrict A) {
  for (int ax1 = 0; ax1 < 1200; ++ax1) {
    for (int ax2 = 0; ax2 < 1600; ++ax2) {
      for (int ax3 = 0; ax3 < 3; ++ax3) {
        float _1 = G[ax2];
        float _2 = _1 * 8.000000e+00f;
        int _3 = (int)_2;
        float _4 = ;  // <-- the empty RHS is generated here (presumably this should read S[_3])
        float _5 = A[3];
        float _6 = _4 * _5;
        int _7 = ax1 * 1600;
        int _8 = _7 + ax2;
        int _9 = _8 * 3;
        int _10 = _9 + ax3;
        tensor[_10] = _6;
      }
    }
  }
}
@merrymercy: I have posted a dummy operator above that reproduces the issue. You need to compile for the "opencl" target. If you face any issue reproducing it, please let me know. Thanks!
I am not familiar with the C++ API. Can you provide a self-contained, runnable C++ example?
I tried a similar test case in Python, but I cannot get anything like your output. It works well.
import tvm
from tvm.contrib.util import get_lower_ir

n = 10
A = tvm.placeholder((n,), 'float32', 'A')
B = tvm.placeholder((n,), 'float32', 'B')

input_chans = 3
output_chans = 12
w = 20
gd = 0.2

def func(*indices):
    idx = indices[0]
    x = ((idx / output_chans) % w).astype('int32')
    gz = B[x] * gd
    coeff_sample = 0.0
    coeff_sample += A[gz.astype('int32')]
    return coeff_sample * A[input_chans]

C = tvm.compute((n, n), func)
s = tvm.create_schedule([C.op])
s[C].bind(s[C].op.axis[0], tvm.thread_axis("blockIdx.x"))
func = tvm.build(s, [A, B, C], 'opencl')
print(func.imported_modules[0].get_source())
Output:
__kernel void default_function_kernel0(__global float* restrict compute, __global float* restrict A, __global float* restrict B) {
  for (int i1 = 0; i1 < 10; ++i1) {
    compute[((((int)get_group_id(0)) * 10) + i1)] = (A[((int)(B[0] * 2.000000e-01f))] * A[3]);
  }
}
If you are willing to debug, you can take a look at https://github.com/dmlc/tvm/blob/master/src/codegen/codegen_c.cc or https://github.com/dmlc/tvm/blob/master/src/codegen/codegen_opencl.cc
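Before digging into the codegen, it can also help to check whether the indexed load is still present in the lowered IR; a minimal sketch, reusing s, A, B and C from my snippet above:

# If the load is already missing here, the bug is in lowering;
# if it is still present, the bug is in the OpenCL codegen.
print(tvm.lower(s, [A, B, C], simple_mode=True))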
The code below will reproduce the issue.
I think the issue is with the scheduler. If I use auto-inline, as the default injective scheduler does, the issue occurs; but if I bind each op individually, there is no problem (see the sketch after the repro below).
import tvm
from tvm.contrib.util import get_lower_ir
import topi

n = 10
A = tvm.placeholder((1, n), 'float32', 'A')
B = tvm.placeholder((1, n), 'float32', 'B')
E = topi.cast(topi.squeeze(topi.nn.flatten(A), 0), A.dtype)
F = topi.cast(topi.squeeze(topi.nn.flatten(B), 0), B.dtype)

input_chans = 3
output_chans = 12
w = 20
gd = 0.2

def func(*indices):
    idx = indices[0]
    x = ((idx / output_chans) % w).astype('int32')
    gz = F[x] * gd
    coeff_sample = 0.0
    coeff_sample += E[gz.astype('int32')]
    return coeff_sample * E[input_chans]

C = tvm.compute((n, n), func)
with tvm.target.create("opencl"):
    s = topi.generic.schedule_injective(C)
func = tvm.build(s, [A, B, C], 'opencl')
print(func.imported_modules[0].get_source())
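For reference, this is roughly what I mean by binding each op individually: a minimal sketch against the tensors above, scheduling E and F explicitly instead of letting schedule_injective auto-inline them (binding only the outer axis is just illustrative):

# Schedule every stage explicitly so E and F are materialized
# instead of being inlined into C by the injective scheduler.
s = tvm.create_schedule(C.op)
for t in [E, F, C]:
    s[t].bind(s[t].op.axis[0], tvm.thread_axis("blockIdx.x"))
func = tvm.build(s, [A, B, C], 'opencl')
print(func.imported_modules[0].get_source())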
@merrymercy: Thanks a lot for your quick fix; the issue is now resolved. You are awesome!!!