But when using the add & mul pass, we get an incorrect result.
We found some differences between using the add & mul pass and not using it.
For example, below is part of the model.
If the add & mul pass is not used, the layers (Add / Quantize / Concatenation) are integrated into one function (red box);
if it is used, the layers are separated into two functions (blue boxes).
We compared the final outputs of the blue boxes against the red box; they should be identical (byte for byte).
But some of the values differ, and the difference is always 1 (e.g. -88 vs -89, or -127 vs -126, etc.).
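(For reference, a minimal sketch of this kind of byte-for-byte comparison; the buffer names and length parameter are placeholders of ours, not from the model.)

#include <stdint.h>
#include <stdio.h>

// Compare two int8 output buffers element by element and report mismatches.
static void compare_outputs(const int8_t *red_out, const int8_t *blue_out, int32_t len)
{
    for (int32_t i = 0; i < len; ++i) {
        if (red_out[i] != blue_out[i]) {
            printf("index %d: red=%d blue=%d diff=%d\n",
                   (int)i, red_out[i], blue_out[i], (int)red_out[i] - (int)blue_out[i]);
        }
    }
}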
Do you mean using the tflite runtime to get the outputs of these layers, to identify which of red or blue is correct?
On the STM32, the model is represented as a byte array.
I don't know how to get a specific layer's output yet; I'll look into it.
Judging from the final result, does that mean the red box is correct?
This is subjective, but TVM as an optimizing compiler should not produce results different from tflite.
Therefore, depending on which of the boxes (red vs blue) produces a result matching tflite, we'd need to fix the one that does not.
If input A is -109 and input B is -85, we can calculate by hand and get an output of -83 (-83.491283 before rounding).
Then we use "arm_elementwise_add_s8":
arm_elementwise_add_s8(input_0_, input_1_, 128, 1073741824, 0, 128, 1806905801, -1, 20, output_, -128, 1732166406, -19, -128, 127, 18432);
This call was generated by TVM, and its output is also -83 (we modified the last argument, the block size, from 18432 to 1 to run just this single element).
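For reference, here is a minimal, self-contained sketch of the per-element fixed-point arithmetic as we understand it from the CMSIS-NN reference implementation of this kernel (gemmlowp-style rounding). The helper name is ours, and the rounding nudge is simplified to round-half-up, which is exact for the non-negative intermediates in this example:

#include <stdint.h>
#include <stdio.h>

// round(val * mult / 2^31), then rounding right-shift by -shift (shift <= 0).
static int32_t requantize(int32_t val, int32_t mult, int32_t shift)
{
    int64_t high = ((int64_t)val * mult + (1LL << 30)) >> 31;
    if (shift < 0) {
        high = (high + (1LL << (-shift - 1))) >> (-shift);
    }
    return (int32_t)high;
}

int main(void)
{
    int8_t a = -109, b = -85;

    int32_t in1 = ((int32_t)a + 128) << 20;        // input_1_offset = 128, left_shift = 20
    int32_t in2 = ((int32_t)b + 128) << 20;        // input_2_offset = 128

    in1 = requantize(in1, 1073741824, 0);          // input_1 mult / shift
    in2 = requantize(in2, 1806905801, -1);         // input_2 mult / shift

    int32_t sum = requantize(in1 + in2, 1732166406, -19);  // out mult / shift
    sum += -128;                                   // out_offset
    if (sum < -128) sum = -128;                    // out_activation_min
    if (sum > 127) sum = 127;                      // out_activation_max

    printf("%d\n", sum);                           // prints -83
    return 0;
}

Note that in this path both inputs are carried with 20 extra bits of precision (left_shift = 20), and rounding down to the int8 grid happens only once, at the output requantize.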
Now we use the composite function; the loops below were also generated by TVM.
###########################################################
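// (the comments below are our annotations; the loops themselves are verbatim TVM output)
// cast input A ("placeholder", int8) to int32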
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused = 0; ax0_ax1_fused_ax2_fused_ax3_fused < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused) {
((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused] = ((int32_t)placeholder[ax0_ax1_fused_ax2_fused_ax3_fused]);
}
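// subtract fused_cast_constant_0[0] (== -0x80), i.e. add the zero-point offset of 128 to input A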
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused1 = 0; ax0_ax1_fused_ax2_fused_ax3_fused1 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused1) {
((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused1] = (((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused1] - ((int32_t*)fused_cast_constant_0)[0]);
}
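// fixed-point requantize of input A: multiply by 1732166406 with rounding, then >> 31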
for (int32_t i0_i1_fused_i2_fused_i3_fused = 0; i0_i1_fused_i2_fused_i3_fused < 18432; ++i0_i1_fused_i2_fused_i3_fused) {
((int32_t*)T_cast)[i0_i1_fused_i2_fused_i3_fused] = ((int32_t)(((((0 != 0) ? (((int64_t)((int32_t*)T_cast)[i0_i1_fused_i2_fused_i3_fused]) << ((int64_t)0)) : ((int64_t)((int32_t*)T_cast)[i0_i1_fused_i2_fused_i3_fused])) * (int64_t)1732166406) + ((int64_t)1 << ((int64_t)((0 + 31) - 1)))) >> ((int64_t)(0 + 31))));
}
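// subtract 128 from the requantized input A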
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused2 = 0; ax0_ax1_fused_ax2_fused_ax3_fused2 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused2) {
((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused2] = (((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused2] - 128);
}
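// cast input B ("placeholder1", int8) to int32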
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused3 = 0; ax0_ax1_fused_ax2_fused_ax3_fused3 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused3) {
((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused3] = ((int32_t)placeholder1[ax0_ax1_fused_ax2_fused_ax3_fused3]);
}
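// add the zero-point offset of 128 to input B (same constant as above)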
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused4 = 0; ax0_ax1_fused_ax2_fused_ax3_fused4 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused4) {
((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused4] = (((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused4] - ((int32_t*)fused_cast_constant_0)[0]);
}
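// fixed-point requantize of input B: multiply by 1457455348 with rounding, then >> 31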
for (int32_t i0_i1_fused_i2_fused_i3_fused1 = 0; i0_i1_fused_i2_fused_i3_fused1 < 18432; ++i0_i1_fused_i2_fused_i3_fused1) {
((int32_t*)T_cast1)[i0_i1_fused_i2_fused_i3_fused1] = ((int32_t)(((((0 != 0) ? (((int64_t)((int32_t*)T_cast1)[i0_i1_fused_i2_fused_i3_fused1]) << ((int64_t)0)) : ((int64_t)((int32_t*)T_cast1)[i0_i1_fused_i2_fused_i3_fused1])) * (int64_t)1457455348) + ((int64_t)1 << ((int64_t)((0 + 31) - 1)))) >> ((int64_t)(0 + 31))));
}
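// subtract 128 from the requantized input B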
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused5 = 0; ax0_ax1_fused_ax2_fused_ax3_fused5 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused5) {
((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused5] = (((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused5] - 128);
}
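// elementwise add of the two requantized inputs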
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused6 = 0; ax0_ax1_fused_ax2_fused_ax3_fused6 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused6) {
((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused6] = (((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused6] + ((int32_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused6]);
}
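// add 128 back (the three offset adjustments net to -128, matching the out_offset of the arm call)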
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused7 = 0; ax0_ax1_fused_ax2_fused_ax3_fused7 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused7) {
((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused7] = (((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused7] + 128);
}
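// clamp to the int8 range [-128, 127]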
for (int32_t i0_i1_fused_i2_fused_i3_fused2 = 0; i0_i1_fused_i2_fused_i3_fused2 < 18432; ++i0_i1_fused_i2_fused_i3_fused2) {
int32_t _1 = ((int32_t*)T_cast)[i0_i1_fused_i2_fused_i3_fused2];
int32_t _2 = (_1) < (127) ? (_1) : (127);
((int32_t*)T_cast)[i0_i1_fused_i2_fused_i3_fused2] = ((_2) > (-128) ? (_2) : (-128));
}
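// cast the clamped result to int8; T_cast1 is reused as the output buffer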
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused8 = 0; ax0_ax1_fused_ax2_fused_ax3_fused8 < 18432; ++ax0_ax1_fused_ax2_fused_ax3_fused8) {
((int8_t*)T_cast1)[ax0_ax1_fused_ax2_fused_ax3_fused8] = ((int8_t)((int32_t*)T_cast)[ax0_ax1_fused_ax2_fused_ax3_fused8]);
}
#########################################################################
"placeholder " and “placeholder1” are input A and input B buffers.
“T_cast” and “T_cast1” are temporary buffers.
fused_cast_constant_0)[0] is -0x80
the output is -84
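To see where the off-by-one comes from, here is the composite path above distilled to the same single element (a sketch with our own variable names). Each input is requantized and rounded to an integer on its own before the add, so the two fractional parts (about 0.33 and 0.18) are dropped instead of accumulating to about 0.51:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int8_t a = -109, b = -85;

    // cast and subtract fused_cast_constant_0[0] (== -0x80)
    int32_t ta = (int32_t)a - (-0x80);                                 // 19
    int32_t tb = (int32_t)b - (-0x80);                                 // 43

    // per-input fixed-point multiply with rounding, then >> 31, as in the loops
    ta = (int32_t)(((int64_t)ta * 1732166406 + (1LL << 30)) >> 31);    // round(15.33) = 15
    tb = (int32_t)(((int64_t)tb * 1457455348 + (1LL << 30)) >> 31);    // round(29.18) = 29

    // offset adjustments, add, clamp, as in the loops
    int32_t sum = (ta - 128) + (tb - 128) + 128;                       // 15 + 29 - 128 = -84
    if (sum < -128) sum = -128;
    if (sum > 127) sum = 127;

    printf("%d\n", sum);                                               // prints -84
    return 0;
}

The arm path rounds the same real value of about 44.51 only once (45 - 128 = -83), which would explain why the difference between the two paths is always exactly 1: it shows up whenever the per-input rounding moves the sum across a .5 boundary.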
These two paths give different outputs, and it seems the arm API produces the correct one (judging by the hand calculation).
But the end-to-end inference result is correct when using the composite function, and incorrect when using the arm add API.