[BYOC] How can I use Arm Compute Library Integration flow for QNN?

Hi all,

This is my TVM version (v0.7):

commit 728b829575e5e690870b111ae2256cbe0f3dbe6f
Author: ziheng <ziheng@apache.org>
Date:   Fri Oct 2 09:45:23 2020 -0700

    [RELEASE] Update NEWS.md for v0.7 (#6613)

The example and source code are from https://tvm.apache.org/docs/deploy/arm_compute_lib.html and https://tvm.apache.org/docs/tutorials/frontend/from_tflite.html.

My code:

...
# from from_tflite.html
import os
import tflite
from tvm import relay

tflite_model_file = os.path.join(model_dir, "mobilenet_v1_1.0_224_quant.tflite")
tflite_model_buf = open(tflite_model_file, "rb").read()
# from_tflite expects the parsed TFLite model object, not the file path
tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
# fill in the input name/shape/dtype as in the tutorial
mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=..., dtype_dict=...)
print(mod.astext())
# from arm_compute_lib.html
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
mod = partition_for_arm_compute_lib(mod)
print(mod.astext())
...
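For reference, the remainder of the flow in arm_compute_lib.html builds and exports the partitioned module. A minimal sketch; the target triple and the cross-compiler name are assumptions for an AArch64 setup:

import tvm
from tvm import relay

target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
    lib = relay.build(mod, target=target, params=params)

# Assumed cross-compiler; adjust to your toolchain.
lib.export_library("lib_acl.so", cc="aarch64-linux-gnu-gcc")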

The relevant patterns:

# in arm_compute_lib.py
    def qnn_conv_pattern():
        pattern = is_op("nn.pad")(wildcard()) | wildcard()
        pattern = is_op("qnn.conv2d")(
            pattern, is_constant(), is_constant(), is_constant(), is_constant(), is_constant()
        )
        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
        pattern = pattern.optional(is_op("nn.relu"))
        pattern = is_op("qnn.requantize")(
            pattern, wildcard(), wildcard(), is_constant(), is_constant()
        )
        return pattern

    def avg_pool2d_pattern():
        pattern = is_op("cast")(wildcard())
        pattern = is_op("nn.avg_pool2d")(pattern) | is_op("nn.global_avg_pool2d")(pattern)
        pattern = is_op("cast")(pattern)
        return pattern
...
_register_external_op_helper("reshape")
...

This is the original Relay IR:

def @main(...)
{
....
%78 = qnn.conv2d(%77, %v_param_53, 0 /* ty=int32 */, 95 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0.0180482f /* ty=float32 */, padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1], data_layout="NHWC", kernel_layout="HWIO", out_dtype="int32") /* ty=Tensor[(1, 7, 7, 1024), int32] */;
%79 = nn.bias_add(%78, %v_param_54, axis=3) /* ty=Tensor[(1, 7, 7, 1024), int32] */;
%80 = qnn.requantize(%79, 0.000424646f /* ty=float32 */, 0 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0 /* ty=int32 */, axis=3, out_dtype="uint8") /* ty=Tensor[(1, 7, 7, 1024), uint8] */;
%81 = cast(%80, dtype="int32") /* ty=Tensor[(1, 7, 7, 1024), int32] */;
%82 = nn.avg_pool2d(%81, pool_size=[7, 7], strides=[2, 2], padding=[0, 0, 0, 0], layout="NHWC") /* ty=Tensor[(1, 1, 1, 1024), int32] */;
%83 = cast(%82, dtype="uint8") /* ty=Tensor[(1, 1, 1, 1024), uint8] */;
%84 = qnn.conv2d(%83, %v_param_55, 0 /* ty=int32 */, 74 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0.0049866f /* ty=float32 */, padding=[0, 0, 0, 0], channels=1001, kernel_size=[1, 1], data_layout="NHWC", kernel_layout="HWIO", out_dtype="int32") /* ty=Tensor[(1, 1, 1, 1001), int32] */;
%85 = nn.bias_add(%84, %v_param_56, axis=3) /* ty=Tensor[(1, 1, 1, 1001), int32] */;
%86 = qnn.requantize(%85, 0.000117327f /* ty=float32 */, 0 /* ty=int32 */, 0.166099f /* ty=float32 */, 66 /* ty=int32 */, axis=3, out_dtype="uint8") /* ty=Tensor[(1, 1, 1, 1001), uint8] */;
%87 = reshape(%86, newshape=[1, 1001]) /* ty=Tensor[(1, 1001), uint8] */;
%88 = qnn.dequantize(%87, 0.166099f /* ty=float32 */, 66 /* ty=int32 */) /* ty=Tensor[(1, 1001), float32] */;
%89 = nn.softmax(%88, axis=1) /* ty=Tensor[(1, 1001), float32] */;
qnn.quantize(%89, 0.00390625f /* ty=float32 */, 0 /* ty=int32 */, out_dtype="uint8") /* ty=Tensor[(1, 1001), uint8] */
}

Relay IR after partition_for_arm_compute_lib:

#[version = "0.0.5"]
def @arm_compute_lib_0(%arm_compute_lib_0_i0: Tensor[(1, 7, 7, 1024), uint8], global_symbol="arm_compute_lib_0", Primitive=1, Compiler="arm_compute_lib", Inline=1) -> Tensor[(1, 1, 1, 1024), uint8] {
  %2 = fn (%FunctionVar_0_0: Tensor[(1, 7, 7, 1024), uint8], PartitionedFromPattern="cast_nn.avg_pool2d_cast_", Composite="arm_compute_lib.avg_pool2d") -> Tensor[(1, 1, 1, 1024), uint8] {
    %0 = cast(%FunctionVar_0_0, dtype="int32") /* ty=Tensor[(1, 7, 7, 1024), int32] */;
    %1 = nn.avg_pool2d(%0, pool_size=[7, 7], strides=[2, 2], padding=[0, 0, 0, 0], layout="NHWC") /* ty=Tensor[(1, 1, 1, 1024), int32] */;
    cast(%1, dtype="uint8") /* ty=Tensor[(1, 1, 1, 1024), uint8] */
  };
  %2(%arm_compute_lib_0_i0) /* ty=Tensor[(1, 1, 1, 1024), uint8] */
}

def @arm_compute_lib_1(%arm_compute_lib_1_i0: Tensor[(1, 1, 1, 1001), uint8], global_symbol="arm_compute_lib_1", Primitive=1, Compiler="arm_compute_lib", Inline=1) -> Tensor[(1, 1001), uint8] {
  reshape(%arm_compute_lib_1_i0, newshape=[1, 1001]) /* ty=Tensor[(1, 1001), uint8] */
}

def @main(...)
{
...
%81 = qnn.conv2d(%80, %v_param_53, 0 /* ty=int32 */, 95 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0.0180482f /* ty=float32 */, padding=[0, 0, 0, 0], channels=1024, kernel_size=[1, 1], data_layout="NHWC", kernel_layout="HWIO", out_dtype="int32") /* ty=Tensor[(1, 7, 7, 1024), int32] */;
%82 = nn.bias_add(%81, %v_param_54, axis=3) /* ty=Tensor[(1, 7, 7, 1024), int32] */;
%83 = qnn.requantize(%82, 0.000424646f /* ty=float32 */, 0 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0 /* ty=int32 */, axis=3, out_dtype="uint8") /* ty=Tensor[(1, 7, 7, 1024), uint8] */;
%84 = @arm_compute_lib_0(%83) /* ty=Tensor[(1, 1, 1, 1024), uint8] */;
%85 = qnn.conv2d(%84, %v_param_55, 0 /* ty=int32 */, 74 /* ty=int32 */, 0.0235285f /* ty=float32 */, 0.0049866f /* ty=float32 */, padding=[0, 0, 0, 0], channels=1001, kernel_size=[1, 1], data_layout="NHWC", kernel_layout="HWIO", out_dtype="int32") /* ty=Tensor[(1, 1, 1, 1001), int32] */;
%86 = nn.bias_add(%85, %v_param_56, axis=3) /* ty=Tensor[(1, 1, 1, 1001), int32] */;
%87 = qnn.requantize(%86, 0.000117327f /* ty=float32 */, 0 /* ty=int32 */, 0.166099f /* ty=float32 */, 66 /* ty=int32 */, axis=3, out_dtype="uint8") /* ty=Tensor[(1, 1, 1, 1001), uint8] */;
%88 = @arm_compute_lib_1(%87) /* ty=Tensor[(1, 1001), uint8] */;
%89 = qnn.dequantize(%88, 0.166099f /* ty=float32 */, 66 /* ty=int32 */) /* ty=Tensor[(1, 1001), float32] */;
%90 = nn.softmax(%89, axis=1) /* ty=Tensor[(1, 1001), float32] */;
qnn.quantize(%90, 0.00390625f /* ty=float32 */, 0 /* ty=int32 */, out_dtype="uint8") /* ty=Tensor[(1, 1001), uint8] */
}

My problem:

Why are qnn.conv2d, nn.bias_add, and qnn.requantize not combined? Other patterns match, such as avg_pool2d_pattern, but the [qnn.conv2d, nn.bias_add, qnn.requantize] pattern is never hit.
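One way to check the nn.bias_add sub-pattern in isolation is with the dataflow pattern API directly. This is only a sketch; the toy shapes and variable names are assumptions:

import numpy as np
from tvm import relay
from tvm.relay.dataflow_pattern import is_op, is_constant, wildcard

bias_pat = is_op("nn.bias_add")(wildcard(), is_constant())

x = relay.var("x", shape=(1, 8), dtype="float32")
bias_var = relay.var("b", shape=(8,), dtype="float32")   # a free Var, like %v_param_54
bias_const = relay.const(np.zeros(8, dtype="float32"))   # a Relay Constant

print(bias_pat.match(relay.nn.bias_add(x, bias_var)))    # False: bias is a Var
print(bias_pat.match(relay.nn.bias_add(x, bias_const)))  # True: bias is a Constant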

Many thanks.

Best regards.

Please correct the title: it should be "Arm Compute Library". Also, please use the latest TVM to make sure you have all the available changes and features.

nn.bias_add wasn't matched because its second input is not a constant. In the pattern:

pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))

this means that only an nn.bias_add whose second argument is a constant will be matched. In your IR, the bias (e.g. %v_param_54) is still a free parameter Var rather than a Constant, so the composite pattern cannot match. You could use mod["main"] = bind_params_by_name(mod["main"], params) to bind the constant parameters to your model.
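A minimal sketch of this fix, assuming mod and params come from relay.frontend.from_tflite as in the original post:

from tvm.relay.build_module import bind_params_by_name
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

# Turn the free parameter Vars (e.g. %v_param_54) into Constants so that
# is_constant() in qnn_conv_pattern can match the weights and bias.
mod["main"] = bind_params_by_name(mod["main"], params)
mod = partition_for_arm_compute_lib(mod)
print(mod.astext())  # the qnn.conv2d chains should now sit inside @arm_compute_lib_* functions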

I have corrected the title and will try this method. Many thanks!