Performance difference between a Relay op and a handwritten op

I am using TVM 0.19.0, and I have an operator to implement in Relay. Below is the original operator, written in C++, and the new Relay op I translated it into.

C++ operator:

template <typename Scalar_I, typename Scalar_O>
class TsRank {
public:
    void calc(Eigen::Ref<RowMatrix<Scalar_I>> arr, int64_t window, Eigen::Ref<RowMatrix<Scalar_O>> arrO) {
        if (window == 0) {
            arrO.setConstant(1);
            return;
        }
        int rows = arr.rows();
        int cols = arr.cols();
        for (int j = window - 1; j < cols; ++j) {
            for (int i = 0; i < rows; ++i) {
                if (std::isnan(arr(i, j))) {
                    arrO(i, j) = NAN;
                    continue;
                }
                // Rank of arr(i, j) within the trailing window [j + 1 - window, j].
                int64_t sum = 0;
                for (int k = j + 1 - window; k < j + 1; ++k) {
                    if (arr(i, k) <= arr(i, j)) {
                        sum++;
                    }
                }
                arrO(i, j) = sum / (double)window;
            }
        }

        // The first window - 1 columns have no full window; fill them with NaN.
        arrO.block(0, 0, rows, std::min(window - 1, (int64_t)cols)).setConstant(NAN);
    }
};
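
For reference, the intended semantics are easy to reproduce in plain NumPy (a quick sketch of my own, handy as ground truth when checking the TVM version; ts_rank_ref is just an illustrative name):

import numpy as np

def ts_rank_ref(arr, window):
    """NumPy reference: fraction of the trailing window that is <= the current value."""
    rows, cols = arr.shape
    out = np.full((rows, cols), np.nan)
    if window == 0:
        out[:] = 1.0
        return out
    for j in range(window - 1, cols):
        win = arr[:, j + 1 - window : j + 1]
        out[:, j] = np.sum(win <= arr[:, [j]], axis=1) / window
        out[np.isnan(arr[:, j]), j] = np.nan
    return out

# e.g. ts_rank_ref(np.array([[1.0, 3.0, 2.0, 5.0]]), 3) -> [[nan, nan, 0.667, 1.0]]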

The Relay op I wrote:

import tvm
from tvm import te


def my_ts_rank(data, period):
    """Compute the rank of each element within a trailing window sliding along each row.

    Parameters
    ----------
    data : tvm.te.Tensor
        The input 2-D tensor.
    period : int
        The window size.

    Returns
    -------
    out : tvm.te.Tensor
        Tensor of the same shape holding the per-window rank, with NaN for the
        first period - 1 columns and for NaN inputs.
    """
    assert len(data.shape) == 2
    B, N = data.shape
    window_size = period
    nan64 = tvm.tir.const(float("nan"), "float64")

    def valid(idx, j):
        # A full window only exists once j >= window_size - 1
        # (which also keeps idx = j + 1 - window_size + k non-negative).
        return tvm.tir.all(j >= window_size - 1)

    k = te.reduce_axis((0, window_size), name="k")

    def rank_body(i, j):
        idx = j + 1 - window_size + k
        val = data[i, j]
        return te.if_then_else(
            valid(idx, j),
            te.if_then_else(
                tvm.tir.isnan(data[i, idx]),
                tvm.tir.const(0.0, "float64"),
                te.if_then_else(
                    data[i, idx] <= val,
                    tvm.tir.const(1.0 / window_size, "float64"),
                    tvm.tir.const(0.0, "float64"),
                ),
            ),
            tvm.tir.const(0.0, "float64"),
        )

    # te.sum may only appear at the top level of a compute expression,
    # so the reduction has to live in its own intermediate tensor.
    sum_tensor = te.compute(
        (B, N),
        lambda i, j: te.sum(rank_body(i, j), axis=k),
        name="rank_sum",
    )

    def final_rank(i, j):
        val = data[i, j]
        return te.if_then_else(
            tvm.tir.any(window_size == 0),
            tvm.tir.const(1.0, "float64"),
            te.if_then_else(
                tvm.tir.any(j < window_size - 1),
                nan64,
                te.if_then_else(
                    tvm.tir.isnan(val),
                    nan64,
                    sum_tensor[i, j],
                ),
            ),
        )

    return te.compute(data.shape, final_rank, name="rank_out")
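
To make the comparison concrete, here is roughly how the compute above can be built and timed as a standalone TE function (just a sketch with placeholder shapes and an llvm target; my real pipeline goes through Relay, and I am not sure this is the preferred API in 0.19):

import numpy as np
import tvm
from tvm import te

B, N, window = 8, 4096, 10
data = te.placeholder((B, N), dtype="float64", name="data")
out = my_ts_rank(data, window)

# Lower the two-stage compute (rank_sum + rank_out) and build it for CPU.
mod = tvm.build(te.create_prim_func([data, out]), target="llvm")

dev = tvm.cpu()
a = tvm.nd.array(np.random.rand(B, N), dev)
b = tvm.nd.array(np.empty((B, N), dtype="float64"), dev)
mod(a, b)
print(mod.time_evaluator(mod.entry_name, dev, number=20)(a, b))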

The problem is that the Relay op I wrote performs poorly: it takes about four times as long as the original C++ operator.

My question

te.sum is only allowed to appear at the top level of a compute expression, which forced me to create another tensor via sum_tensor = te.compute(..., te.sum(...)). I think this restriction, which materializes the whole intermediate rank_sum tensor instead of accumulating each value in a register the way the C++ code does, is what causes the performance to deteriorate. Are there any suggestions to make it better?
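
Concretely, since a reduction block cannot be inlined into rank_out, I have been wondering whether it would help to compute rank_sum at rank_out's inner loop so the intermediate result is produced right where it is consumed. Something along these lines (only a sketch; I am not even sure compute_at is legal for this reduction block, or that it is the right fix):

import tvm
from tvm import te

data = te.placeholder((8, 4096), dtype="float64", name="data")
out = my_ts_rank(data, 10)

sch = tvm.tir.Schedule(te.create_prim_func([data, out]))
rank_sum = sch.get_block("rank_sum")
rank_out = sch.get_block("rank_out")

# Try to compute each rank_sum element right where rank_out consumes it,
# instead of materializing the whole (B, N) intermediate buffer first.
i, j = sch.get_loops(rank_out)
sch.compute_at(rank_sum, j)
sch.parallel(i)

mod = tvm.build(sch.mod, target="llvm")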