Hi, I am using LLVM ARM intrinsics. I use “llvm.aarch64.neon.fmulx.v4f32” to multiply 4 float32 elements, but some problems occurs. Here is my code
import tvm
def fmull( dtype='float32'):
num_f32_elements=4
A=tvm.placeholder((num_f32_elements,), dtype =dtype, name='A' )
B=tvm.placeholder((num_f32_elements,), dtype =dtype, name='B' )
C=tvm.compute((num_f32_elements,),
lambda i: A[i].astype('float32')*
B[i].astype('float32'),
name='C')
a_buffer=tvm.decl_buffer(A.shape, dtype=dtype,name ='a_buffer',offset_factor=1)
b_buffer=tvm.decl_buffer(B.shape, dtype=dtype,name ='b_buffer',offset_factor=1)
c_buffer=tvm.decl_buffer(C.shape, dtype=C.dtype,name ='c_buffer',offset_factor=1)
def _intrin_func(ins,outs):
def _instr(index):
xx,yy=ins
zz=outs[0]
ib=tvm.ir_builder.create()
if index==1:
ib.emit(outs[0].vstore(0,tvm.const(0,'float32x4')))
return ib.get()
vec_x=xx.vload([0],dtype='float32x4')
vec_y=yy.vload([0],dtype='float32x4')
vec_z=zz.vload([0],dtype='float32x4')
inst='llvm.aarch64.neon.fmulx.v4f32'
fmulx=tvm.call_llvm_intrin('float32x4',
inst,
tvm.const(0,'uint32'),
vec_z,vec_x,vec_y)
ib.emit(zz.vstore(0,fmulx))
return ib.get()
#body reset update
return _instr(0), _instr(1), _instr(2)
with tvm.build_config(offset_factor=1,partition_const_loop=True):
return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:a_buffer,
B:b_buffer, C:c_buffer})
def cal():
dtype='float32'
num_f32_elements=64
A=tvm.placeholder((num_f32_elements,), dtype =dtype, name='A' )
B=tvm.placeholder((num_f32_elements,), dtype =dtype, name='B' )
C=tvm.compute((num_f32_elements,),
lambda i: A[i].astype('float32')*
B[i].astype('float32'),
name='C')
s=tvm.create_schedule(C.op)
x0,xi=s[C].split(C.op.axis[0],factor=4)
intrin=fmull(dtype='float32')
s[C].tensorize(xi,intrin)
target = 'llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon'
#target = 'llvm -device=arm_cpu -model=bcm2837 -target=arm64-none-linux-gnu -mattr=+neon'
print(tvm.lower(s,[A,B,C],simple_mode=True))
print("start to build module")
func=tvm.build(s,[A,B,C],target=target,name='element-wise')
assembly = func.get_source('asm')
print(assembly)
cal()
Segmentation fault occurs when running tvm.build. I am confused and I wonder whether my description of _intrin_func is right.