Hi there, the code is fairly complex, so I'm only posting the relevant snippets below. Please feel free to follow up if you have any questions!
C++ code:
// Load the input and filter tensors from .npy files with the `cnpy` library.
cnpy::NpyArray input_1_npy = cnpy::npy_load(input_1_name);
cnpy::NpyArray filter_1_npy = cnpy::npy_load(filter_1_name);

// On the Python side the generated code is saved in .asm format
// (`code = func.get_source("asm")`, written to a file). That file is then
// assembled and linked into kernel.so (see the Makefile), which is loaded here.
tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile("kernel.so");
tvm::runtime::PackedFunc func = mod.GetFunction("func");

// Tensor handles; filled in by TVMArrayAlloc below.
DLTensor *input = nullptr, *filter = nullptr, *output = nullptr;

// All tensors are float32 scalars (lanes == 1) on CPU device 0.
int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
int device_type = kDLCPU;
int device_id = 0;

/**
* Define input_batch, input_channel, vlen1, vlen2, etc here
**/

// Number of channel chunks = ceil(channels / vector length).
// The division is done in double on purpose: with integer operands,
// `input_channel / vlen1` truncates *before* std::ceil sees it, which
// under-counts the chunks whenever channels is not a multiple of vlen.
int64_t input_shape_tuple[5] = {
    input_batch,
    int64_t(std::ceil(double(input_channel) / vlen1)),
    input_height,
    input_width,
    vlen1};
int64_t filter_shape_tuple[6] = {oc_chunk, ic_chunk, filter_height, filter_width, ic, oc};
int64_t output_shape_tuple[5] = {
    output_batch,
    int64_t(std::ceil(double(filter_out_channel) / vlen2)),
    output_height,
    output_width,
    vlen2};

// Allocate the three tensors with the dtype/device settings above.
TVMArrayAlloc(input_shape_tuple, 5, dtype_code, dtype_bits, dtype_lanes,
              device_type, device_id, &input);
TVMArrayAlloc(filter_shape_tuple, 6, dtype_code, dtype_bits, dtype_lanes,
              device_type, device_id, &filter);
TVMArrayAlloc(output_shape_tuple, 5, dtype_code, dtype_bits, dtype_lanes,
              device_type, device_id, &output);

// Copy the raw .npy payloads into the TVM buffers. The arrays must be the ones
// loaded above (input_1_npy / filter_1_npy).
// NOTE(review): this is a flat byte copy, so it assumes the .npy files already
// hold float32 data in the same packed layout as the shape tuples - confirm.
memcpy(input->data, input_1_npy.data<float>(),
       input_batch * input_height * input_width * input_channel * sizeof(float));
memcpy(filter->data, filter_1_npy.data<float>(),
       filter_height * filter_width * filter_in_channel * filter_out_channel * sizeof(float));

// Be careful of the order of tensor arguments
func(input, filter, output);

// NOTE(review): once `output` has been consumed, the three tensors should be
// released with TVMArrayFree(); not done here because the snippet's remaining
// code is not shown.
Makefile:
# Builds two artefacts:
#   kernel.so  - the TVM-generated kernel, assembled from $(KERNEL_ASM)
#   $(TARGET)  - the host driver compiled from $(TARGET).cpp, which loads
#                kernel.so at run time
# TVM_HOME and DMLC_CORE are expected to come from the environment.

# Remove a half-written target if its recipe fails, so it never looks up to date.
.DELETE_ON_ERROR:

CXX := icpx
ASXX := as
TARGET := foobar

CXXFLAGS := -std=c++14 -O2 -fPIC \
	-I${TVM_HOME}/include \
	-I${TVM_HOME}/3rdparty/dlpack/include \
	-I${DMLC_CORE}/include

LDFLAGS := \
	-L${TVM_HOME}/build \
	-L/usr/local/lib \
	-ltvm_runtime -ldl -lpthread -lcnpy -lz

# Put all potentially needed extensions here.
# (Kept on its own line: a trailing comment would leave whitespace in the value.)
ASFLAGS ?= -march=corei7+fma+avx+avx2+sse3+avx512f+avx512cd+avx512vl+avx512dq
KERNEL_ASM ?= func.asm

# `all` and `clean` are commands, not files. The special target name is
# case-sensitive: a lowercase `.phony` is just an ordinary target.
.PHONY: all clean

all: kernel.so $(TARGET)

# Assemble the generated kernel.
kernel.o: $(KERNEL_ASM)
	$(ASXX) $(ASFLAGS) $< -o $@

# Wrap the kernel object in a shared library that the driver can load.
kernel.so: kernel.o
	$(CXX) $< -shared -fPIC -o $@

# Host driver; libraries must follow the source file on the link line.
$(TARGET): $(TARGET).cpp
	$(CXX) $(CXXFLAGS) \
		$< \
		-o $@ \
		$(LDFLAGS)

# $(RM) is `rm -f`, so missing files are not an error.
clean:
	$(RM) $(TARGET) kernel.o kernel.so