How can I call the assembly function with DLTensor directly?

Hi, my friends.

First, I generate the assembly code with Module.get_source("asm").
Then I compile the asm code into a *.o file and link it with a main function like this (this .o file can be loaded by tvm.runtime.load_module and run):
#include "tvm/runtime/c_runtime_api.h"
#include "tvm/runtime/c_backend_api.h"
#include "dlpack/dlpack.h"
// Entry point exported by the compiled object file. All pointer parameters
// are declared here as untyped void*.
// NOTE(review): presumably this mirrors TVM's packed C function signature
// (TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_value,
// int* out_ret_tcode, void* resource_handle) -- confirm against
// tvm/runtime/c_backend_api.h for the TVM version in use.
extern "C" TVM_DLL int32_t default_function(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle);

int main()

{
     float *a = new float[1024*1024];
        for(uint32_t i=0; i<1024*1024; i++) {
           a[i] = i;
        }
        float *b = new float[1024*1024];
        int64_t shapeCommon[] = {1024,1024};
        int64_t strideCommon[] = {1024, 1};
        DLTensor input1;
        input1.data = (void*)a;
        input1.ctx = {kDLCPU, 0};
        input1.ndim = 2;
        input1.dtype = {2, 32, 1};
        input1.shape = shapeCommon;
        input1.strides = strideCommon;
        input1.byte_offset = 0;
        DLTensor input2;
        input2.data = (void*)a;
        input2.ctx = {kDLCPU, 0};
        input2.ndim = 2;
        input2.dtype = {2, 32, 1};
        input2.shape = shapeCommon;
        input2.strides = strideCommon;
        input2.byte_offset = 0;

        DLTensor output;
        output.data = (void*)b;
        output.ctx = {kDLCPU, 0};
        output.ndim = 2;
        output.dtype = {2, 32, 1};
        output.shape = shapeCommon;
        output.strides = strideCommon;
        output.byte_offset = 0;

      void* args[] = {(void*)(&input1), (void*)(&input2), (void*)(&output)};
      int32_t arg_type_ids[] = {kTVMDLTensorHandle,kTVMDLTensorHandle,kTVMDLTensorHandle};
      default_function(args, arg_type_ids, 3, nullptr, nullptr, nullptr);
}
It always fails with a segmentation fault.
I’m sure that the tensor shape is correct.
How can I call the asm function without PackedFunc and libtvm.so?
What is the function prototype of the assembly function?