Hi everyone! I'm working on adding a new backend that generates C code containing RISC-V RVV intrinsics, but I have some confusion about TIR — mainly about how to change the array allocation method and how to use a user-defined data type. I would be grateful for any advice.
The following is a detailed description of my problem.
1. Code of the case
compute and schedule code
# compute
# Packed GEMM: C[N, M] = A[N, L] x B, where B is pre-packed into the
# (M // packw_bn, L, packw_bn) layout so that the innermost packw_bn
# lanes of B are contiguous for vectorization.
N, M, L = 12, 20, 38
packw_bn = 4  # pack width: number of output columns packed together
packw_shape = (M // packw_bn, L, packw_bn)
out_dtype = "float32"
idxdiv = tvm.tir.indexdiv
idxmod = tvm.tir.indexmod
A = te.placeholder((N, L), name="A")
B = te.placeholder(packw_shape, name="B")
k = te.reduce_axis((0, L), name="k")
C = te.compute(
    (N, M),
    lambda y, x: te.sum(
        A[y, k].astype(out_dtype)
        # Packed-layout indexing: output column x lives at
        # B[x // packw_bn, k, x % packw_bn].
        * B[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype),
        axis=k,
    ),
    tag="dense_pack",
)
# schedule
s = te.create_schedule(C.op)
# Fixed tiling configuration (SplitEntity is TVM's autotvm split entity;
# presumably imported elsewhere in the script -- not shown here).
tile_y = SplitEntity([5, 1, 4])
tile_x = SplitEntity([3, 1, 4])
tile_k = SplitEntity([38, 1])
tile_inner = SplitEntity([5, 4])
# Accumulate into a separate "global"-scope cache, then write back to C.
CC = s.cache_write(C, "global")
y, x = s[C].op.axis
(k,) = s[CC].op.reduce_axis
yt, yo, yi = tile_y.apply(s, C, y)
xt, xo, xi = tile_x.apply(s, C, x)
factor = tile_x.size[-1]  # innermost x split: the vector length (4)
s[C].reorder(xt, yt, yo, xo, yi, xi)
xyt = s[C].fuse(xt, yt)
xyo = s[C].fuse(yo, xo)
s[C].unroll(yi)
s[C].vectorize(xi)
# Intended replacement of the vectorize above with the RVV load intrinsic:
# load = intrin_load(factor)
# s[C].tensorize(xi, load)
s[CC].compute_at(s[C], xyo)
y, x = s[CC].op.axis
ko, ki = tile_k.apply(s, CC, k)
s[CC].reorder(ko, ki, y, x)
s[CC].vectorize(x)
# Intended replacement of the vectorize above with the RVV MACC intrinsic:
# macc = intrin_macc_vf(factor)
# s[CC].tensorize(x, macc)
tile_inner = tile_inner.size[-1]
yo, yi = s[CC].split(y, tile_inner)
s[CC].reorder(ko, yo, ki, yi, x)
s[CC].unroll(yo)
s[CC].unroll(ki)
s[CC].unroll(yi)
The code above defines the compute and schedule of a packed GEMM.
2. Original generated C code and target generated C code
original generated c code
float4 compute_global[4];
for (int32_t x_outer_y_outer_fused = 0; x_outer_y_outer_fused < 15; ++x_outer_y_outer_fused) {
int32_t cse_var_1 = (((x_outer_y_outer_fused % 3) * 80) + ((x_outer_y_outer_fused / 3) * 4));
compute_global[0] = ((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f));
compute_global[1] = ((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f));
compute_global[2] = ((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f));
compute_global[3] = ((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f));
for (int32_t k_outer = 0; k_outer < 38; ++k_outer) {
int32_t cse_var_3 = (((x_outer_y_outer_fused % 3) * 152) + k_outer);
int32_t cse_var_2 = (((x_outer_y_outer_fused / 3) * 152) + (k_outer * 4));
int32_t4 _1 = (int4)((cse_var_2)+(1*0), (cse_var_2)+(1*1), (cse_var_2)+(1*2), (cse_var_2)+(1*3));
compute_global[0] = (compute_global[0] + (((float4)(((float*)A)[cse_var_3], ((float*)A)[cse_var_3], ((float*)A)[cse_var_3], ((float*)A)[cse_var_3])) * ((float4)(((float*)B)[_1.s0],((float*)B)[_1.s1],((float*)B)[_1.s2],((float*)B)[_1.s3]))));
compute_global[1] = (compute_global[1] + (((float4)(((float*)A)[(cse_var_3 + 38)], ((float*)A)[(cse_var_3 + 38)], ((float*)A)[(cse_var_3 + 38)], ((float*)A)[(cse_var_3 + 38)])) * ((float4)(((float*)B)[_1.s0],((float*)B)[_1.s1],((float*)B)[_1.s2],((float*)B)[_1.s3]))));
compute_global[2] = (compute_global[2] + (((float4)(((float*)A)[(cse_var_3 + 76)], ((float*)A)[(cse_var_3 + 76)], ((float*)A)[(cse_var_3 + 76)], ((float*)A)[(cse_var_3 + 76)])) * ((float4)(((float*)B)[_1.s0],((float*)B)[_1.s1],((float*)B)[_1.s2],((float*)B)[_1.s3]))));
compute_global[3] = (compute_global[3] + (((float4)(((float*)A)[(cse_var_3 + 114)], ((float*)A)[(cse_var_3 + 114)], ((float*)A)[(cse_var_3 + 114)], ((float*)A)[(cse_var_3 + 114)])) * ((float4)(((float*)B)[_1.s0],((float*)B)[_1.s1],((float*)B)[_1.s2],((float*)B)[_1.s3]))));
}
*(float4*)(((float*)compute) + cse_var_1) = compute_global[0];
*(float4*)(((float*)compute) + (cse_var_1 + 20)) = compute_global[1];
*(float4*)(((float*)compute) + (cse_var_1 + 40)) = compute_global[2];
*(float4*)(((float*)compute) + (cse_var_1 + 60)) = compute_global[3];
}
My goal is to generate C code with RVV intrinsics by replacing `vectorize` with `tensorize` calls that contain the intrinsics. The desired generated code is as follows:
target generated c code
target generated c code
vfloat32m1_t compute_global_0, compute_global_4, compute_global_8, compute_global_12;
for (int32_t x_outer_y_outer_fused = 0; x_outer_y_outer_fused < 15; ++x_outer_y_outer_fused) {
int32_t cse_var_1 = (((x_outer_y_outer_fused % 3) * 80) + ((x_outer_y_outer_fused / 3) * 4));
compute_global_0 = vfmv_v_f_f32m1(0.000000e+00f, 4);
compute_global_4 = vfmv_v_f_f32m1(0.000000e+00f, 4);
compute_global_8 = vfmv_v_f_f32m1(0.000000e+00f, 4);
compute_global_12 = vfmv_v_f_f32m1(0.000000e+00f, 4);
for (int32_t k_outer = 0; k_outer < 38; ++k_outer) {
int32_t cse_var_3 = (((x_outer_y_outer_fused % 3) * 152) + k_outer);
int32_t cse_var_2 = (((x_outer_y_outer_fused / 3) * 152) + (k_outer * 4));
vfloat32m1_t vec = vle32_v_f32m1((&(((float*)B)[cse_var_2])), 4);
compute_global_0 = vfmacc_vf_f32m1(compute_global_0, ((float*)A)[cse_var_3], vec, 4);
vfloat32m1_t vec1 = vle32_v_f32m1((&(((float*)B)[cse_var_2])), 4);
compute_global_4 = vfmacc_vf_f32m1(compute_global_4, ((float*)A)[(cse_var_3 + 38)], vec1, 4);
vfloat32m1_t vec2 = vle32_v_f32m1((&(((float*)B)[cse_var_2])), 4);
compute_global_8 = vfmacc_vf_f32m1(compute_global_8, ((float*)A)[(cse_var_3 + 76)], vec2, 4);
vfloat32m1_t vec3 = vle32_v_f32m1((&(((float*)B)[cse_var_2])), 4);
compute_global_12 = vfmacc_vf_f32m1(compute_global_12, ((float*)A)[(cse_var_3 + 114)], vec3, 4);
}
vse32_v_f32m1((&(((float*)compute)[cse_var_1])), compute_global_0, 4);
vse32_v_f32m1((&(((float*)compute)[(cse_var_1 + 20)])), compute_global_4, 4);
vse32_v_f32m1((&(((float*)compute)[(cse_var_1 + 40)])), compute_global_8, 4);
vse32_v_f32m1((&(((float*)compute)[(cse_var_1 + 60)])), compute_global_12, 4);
}
The `tensorize` helper functions are as follows:
register custom dtype
# Register "vfloat32m1_t" as a TVM custom datatype with type code 129, and
# lower Call nodes of this dtype on the "c" target as plain extern calls.
tvm.target.datatype.register("vfloat32m1_t", 129)
tvm.target.datatype.register_op(
    tvm.target.datatype.lower_call_pure_extern,
    "Call",
    "c",
    "vfloat32m1_t",
    intrinsic_name="tir.call_extern",
)
intrin_macc_vf
def intrin_macc_vf(l):
    """Tensor intrinsic matching ``c[0] = sum(a[k] * b[0], axis=k)``.

    Intended to lower the inner multiply-accumulate to the RVV
    ``vfmacc_vf_f32m1`` intrinsic with vector length ``l``.
    """
    a = te.placeholder((l,), name="a")
    b = te.placeholder((1,), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((1,), lambda i: te.sum(a[k] * b[0], axis=k), name="macc")
    # Buffer declarations: b gets a symbolic stride so it can bind to a
    # non-contiguous slice at the tensorize site.
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1")])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])
    dtype_vec = "float32"
    vl = te.const(l, "int32")  # vector-length argument passed to the intrinsics

    def intrin_func(ins, outs):
        aa, bb = ins
        cc = outs[0]
        vec_c = cc.vload([0], dtype_vec)

        def _body():
            ib = tvm.tir.ir_builder.create()
            a = aa.vload([0], dtype_vec)
            vec_b = bb.access_ptr("r")
            # d = a * b + c
            # load vec_b as the custom 128-bit "vfloat32m1_t" dtype
            load_vecb = tvm.tir.call_extern("custom[vfloat32m1_t]128", "vle32_v_f32m1", vec_b, vl)
            load_vecb = ib.let("vec", load_vecb)
            # do macc
            # NOTE(review): the return dtype is declared "float32" although
            # vfmacc_vf_f32m1 actually returns vfloat32m1_t (see question 3.2
            # below).
            vmmla = tvm.tir.call_pure_extern(
                "float32", "vfmacc_vf_f32m1", vec_c, a, load_vecb, vl
            )
            ib.emit(cc.vstore([0], vmmla))
            return ib.get()

        def _reduce_reset():
            # Zero-initialize the accumulator via vfmv_v_f_f32m1(0, l).
            ib = tvm.tir.ir_builder.create()
            init_value = te.const(0, dtype_vec)
            # NOTE(review): same dtype mismatch -- vfmv_v_f_f32m1 returns
            # vfloat32m1_t, not float32 (see question 3.2 below).
            intrin_load = tvm.tir.call_extern("float32", "vfmv_v_f_f32m1", init_value, l)
            ib.emit(cc.vstore([0], intrin_load))
            return ib.get()

        def _reduce_update():
            # The update step is identical to the full body.
            return _body()

        return _body(), _reduce_reset(), _reduce_update()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
intrin_load
def intrin_load(l):
    """Tensor intrinsic matching ``c[i] = a[i]``.

    Intended to lower the vectorized copy to the RVV ``vse32_v_f32m1``
    store intrinsic with vector length ``l``.
    """
    a = te.placeholder((l,), name="a")
    c = te.compute((l,), lambda i: a[i], name="load")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])
    dtype_vec = "float32"

    def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]
        ib = tvm.tir.ir_builder.create()
        # Read the input value, then store it through the output pointer
        # with the RVV store intrinsic.
        vec_a = aa.vload([0], dtype_vec)
        # NOTE(review): the destination pointer is requested with "r"
        # (read) access although vse32 writes through it -- presumably
        # "w" is intended; confirm against the access_ptr semantics.
        vec_c = cc.access_ptr("r")
        vl = te.const(l, "int32")
        ib.emit(tvm.tir.call_extern("float32", "vse32_v_f32m1", vec_c, vec_a, vl))
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})
3. My question
3.1 How can I use TIR to convert the allocation of a vector array into separate single-vector allocations?
In the original generated C code, a vector array `float4 compute_global[4]`
is used to store the intermediate results,
while in the target generated C code, because the compiler does not support arrays of RVV vector types, we need four separate vector variables `vfloat32m1_t compute_global_0, compute_global_4, compute_global_8, compute_global_12`
to store the results. My question is how to describe this situation in TIR.
3.2 How can I use a custom dtype?
My second question follows from the first one: because we cannot initialize an array with a custom dtype, when we later use the data stored in that array we cannot load it with the custom dtype either.
def _reduce_reset():
    # Zero-initialize the accumulator via vfmv_v_f_f32m1(0, l).
    ib = tvm.tir.ir_builder.create()
    init_value = te.const(0, dtype_vec)
    # The declared return dtype here is "float32", although
    # vfmv_v_f_f32m1 actually returns vfloat32m1_t.
    intrin_load = tvm.tir.call_extern("float32", "vfmv_v_f_f32m1", init_value, l)
    ib.emit(cc.vstore([0], intrin_load))
    return ib.get()
As in the reduce-reset initialization code above, the return type of the function `vfmv_v_f_f32m1`
is `vfloat32m1_t`.
Because of question 3.1, we cannot use the right dtype here.
I would be grateful for any advice.