I am using TVM 0.19.0 and need to implement an operator in relay. Below are the original operator, written in C++, and the relay op I translated it into.
CPP operator:
template <typename Scalar_I, typename Scalar_O>
class TsRank {
public:
    void calc(Eigen::Ref<RowMatrix<Scalar_I>> arr, int64_t window, Eigen::Ref<RowMatrix<Scalar_O>> arrO) {
        if (window == 0) {
            arrO.setConstant(1);
            return;
        }
        int rows = arr.rows();
        int cols = arr.cols();
        for (int j = window - 1; j < cols; ++j) {
            for (int i = 0; i < rows; ++i) {
                if (std::isnan(arr(i, j))) {
                    arrO(i, j) = NAN;
                    continue;
                }
                int64_t sum = 0;
                for (int k = j + 1 - window; k < j + 1; ++k) {
                    if (arr(i, k) <= arr(i, j)) {
                        sum++;
                    }
                }
                arrO(i, j) = sum / (double)window;
            }
        }
        arrO.block(0, 0, rows, std::min(window - 1, (int64_t)cols)).setConstant(NAN);
    }
};
Relay op I wrote:
import tvm
from tvm import te


def my_ts_rank(data, period):
    """Compute the rank of each element within a window sliding along each row.

    Parameters
    ----------
    data : tvm.te.Tensor
        The 2-D input tensor.
    period : int
        The window size.

    Returns
    -------
    out : tvm.te.Tensor
        Tensor of the same shape as data holding the rank values.
    """
    assert len(data.shape) == 2
    B, N = data.shape
    window_size = period
    nan64 = tvm.tir.const(float("nan"), "float64")

    def valid(j):
        # only columns with a full window behind them produce a value
        return tvm.tir.all(j >= window_size - 1)

    k = te.reduce_axis((0, window_size), name="k")

    def rank_body(i, j):
        idx = j + 1 - window_size + k
        val = data[i, j]
        return te.if_then_else(
            valid(j),
            te.if_then_else(
                tvm.tir.isnan(data[i, idx]),
                tvm.tir.const(0.0, "float64"),
                te.if_then_else(
                    data[i, idx] <= val,
                    tvm.tir.const(1.0 / window_size, "float64"),
                    tvm.tir.const(0.0, "float64"),
                ),
            ),
            tvm.tir.const(0.0, "float64"),
        )

    # te.sum has to be the whole body of the compute, so the reduction lives
    # in its own stage that materializes a full (B, N) buffer
    sum_tensor = te.compute(
        (B, N),
        lambda i, j: te.sum(rank_body(i, j), axis=k),
        name="rank_sum",
    )

    def final_rank(i, j):
        val = data[i, j]
        return te.if_then_else(
            tvm.tir.any(window_size == 0),
            tvm.tir.const(1.0, "float64"),
            te.if_then_else(
                tvm.tir.any(j < window_size - 1),
                nan64,
                te.if_then_else(tvm.tir.isnan(val), nan64, sum_tensor[i, j]),
            ),
        )

    return te.compute(data.shape, final_rank, name="rank_out")
The problem is that the relay op I wrote performs poorly: it takes about 4 times as long as the original C++ one.
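
For reference, the TE version can be built and timed on its own roughly like this (the shapes, target, and the plain default schedule below are placeholders for illustration, not my real workload):

import numpy as np
import tvm
from tvm import te

B, N, window = 256, 1024, 10                      # placeholder sizes
data = te.placeholder((B, N), name="data", dtype="float64")
out = my_ts_rank(data, window)

s = te.create_schedule(out.op)                    # default schedule, no fusion
func = tvm.build(s, [data, out], target="llvm")

dev = tvm.cpu()
a = tvm.nd.array(np.random.rand(B, N), dev)
b = tvm.nd.array(np.zeros((B, N)), dev)
timer = func.time_evaluator(func.entry_name, dev, number=100)
print("mean time (s):", timer(a, b).mean)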
My question
te.sum is only allowed to appear at the top level of the compute expression, which forced me to materialize an extra tensor via sum_tensor = te.compute(..., te.sum(...)). I think this restriction is what causes the performance to deteriorate. Are there any suggestions to make it better?
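
For example, is fusing the two stages through the schedule the right direction? Something like the sketch below is what I have in mind (the stage lookup and the compute_at placement are just my guess, and I have not verified that it actually helps):

# reusing `out` from the timing snippet above
s = te.create_schedule(out.op)
# grab the intermediate reduction stage (named "rank_sum" in my_ts_rank)
rank_sum = [t for t in out.op.input_tensors if t.op.name == "rank_sum"][0]
i, j = out.op.axis
# compute the reduction inside the j loop of rank_out instead of
# materializing the whole (B, N) rank_sum buffer first
s[rank_sum].compute_at(s[out], j)
s[out].parallel(i)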