Sorry, I gave an inappropriate example. In the latest code, hardswish and swish (including sigmoid) use a lookup table, while leakyrelu does not.
In the examples below, I noticed qnn::leakyrelu
and qnn::hardswish
are replaced with some meta exprs. I guess they cannot be executed together with conv because the TOpPattern
of some Exprs
(like layout_transform
) no longer belongs to kElemWise
. Is that true?
If true, does it mean that it is impossible to do complete int8 activation fusion as in fp32?
# conv
relay_primfuncs={llvm -keys=cpu -link-params=0 -mcpu=core-avx2: fn (%p0: Tensor[(128, 1, 640, 64, 4), uint8] /* ty=Tensor[(128, 1, 640, 64, 4), uint8] */, %p1: Tensor[(8, 1, 2, 64, 1, 16, 4), int8] /* ty=Tensor[(8, 1, 2, 64, 1, 16, 4), int8] */, %p2: Tensor[(1, 8, 1, 1, 16), int32] /* ty=Tensor[(1, 8, 1, 1, 16), int32] */, %p3: Tensor[(1, 8, 1, 1, 16), int32] /* ty=Tensor[(1, 8, 1, 1, 16), int32] */, %p4: Tensor[(1, 8, 1, 1, 16), float32] /* ty=Tensor[(1, 8, 1, 1, 16), float32] */, target=meta[Target][0], prim_funcs={'tvmgen_default_fused_nn_contrib_conv2d_NCHWc_subtract_add_cast_multiply_add_floor_cast_clip'=meta[tir.PrimFunc][0]}, out_layout="NCHW16c", data_layout="NCHW4c", hash="19115e71f466e294", kernel_layout="OIHW1i16o4i", prim_fn_var='tvmgen_default_fused_nn_contrib_conv2d_NCHWc_subtract_add_cast_multiply_add_floor_cast_clip', Primitive=1) -> Tensor[(128, 8, 639, 1, 16), int32] {
%0 = nn.contrib_conv2d_NCHWc(%p0, %p1, padding=[0, 0, 0, 0], channels=128, kernel_size=[2, 64], data_layout="NCHW4c", kernel_layout="OIHW1i16o4i", out_layout="NCHW16c", out_dtype="int32") /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%1 = subtract(%0, %p2) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%2 = add(%1, %p3) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%3 = cast(%2, dtype="float32") /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%4 = multiply(%3, %p4) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%5 = add(%4, 103.5f /* ty=float32 */) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%6 = floor(%5) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%7 = cast(%6, dtype="int32") /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
clip(%7, a_min=0f, a_max=255f) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */
} /* ty=fn (Tensor[(128, 1, 640, 64, 4), uint8], Tensor[(8, 1, 2, 64, 1, 16, 4), int8], Tensor[(1, 8, 1, 1, 16), int32], Tensor[(1, 8, 1, 1, 16), int32], Tensor[(1, 8, 1, 1, 16), float32]) -> Tensor[(128, 8, 639, 1, 16), int32] */
# leakyrelu
relay_primfuncs={llvm -keys=cpu -link-params=0 -mcpu=core-avx2: fn (%p0: Tensor[(128, 8, 637, 1, 16), int32] /* ty=Tensor[(128, 8, 637, 1, 16), int32] */, %p1: int32 /* ty=int32 */, %p2: int32 /* ty=int32 */, src_layout="NCHW16c", hash="1e03279bb39a6741", prim_funcs={'tvmgen_default_fused_less_layout_transform_fixed_point_multiply_add_layout_transform_layout_tra_fd75043815579448__2'=meta[tir.PrimFunc][0]}, dst_layout="NCHW", Primitive=1, prim_fn_var='tvmgen_default_fused_less_layout_transform_fixed_point_multiply_add_layout_transform_layout_tra_fd75043815579448__2', target=meta[Target][0]) -> Tensor[(128, 128, 637, 1), uint8] {
%0 = less(%p0, %p1) /* ty=Tensor[(128, 8, 637, 1, 16), bool] */;
%1 = fixed_point_multiply(%p0, multiplier=1374389535, shift=-6) /* ty=Tensor[(128, 8, 637, 1, 16), int32] */;
%2 = add(%1, %p2) /* ty=Tensor[(128, 8, 637, 1, 16), int32] */;
%3 = layout_transform(%0, src_layout="NCHW16c", dst_layout="NCHW") /* ty=Tensor[(128, 128, 637, 1), bool] */;
%4 = layout_transform(%2, src_layout="NCHW16c", dst_layout="NCHW") /* ty=Tensor[(128, 128, 637, 1), int32] */;
%5 = layout_transform(%p0, src_layout="NCHW16c", dst_layout="NCHW") /* ty=Tensor[(128, 128, 637, 1), int32] */;
%6 = where(%3, %4, %5) /* ty=Tensor[(128, 128, 637, 1), int32] */;
%7 = clip(%6, a_min=0f, a_max=255f) /* ty=Tensor[(128, 128, 637, 1), int32] */;
cast(%7, dtype="uint8") /* ty=Tensor[(128, 128, 637, 1), uint8] */
} /* ty=fn (Tensor[(128, 8, 637, 1, 16), int32], int32, int32) -> Tensor[(128, 128, 637, 1), uint8] */
# conv
relay_primfuncs={llvm -keys=cpu -link-params=0 -mcpu=core-avx2: fn (%p0: Tensor[(128, 1, 640, 64, 4), uint8] /* ty=Tensor[(128, 1, 640, 64, 4), uint8] */, %p1: Tensor[(8, 1, 2, 64, 1, 16, 4), int8] /* ty=Tensor[(8, 1, 2, 64, 1, 16, 4), int8] */, %p2: Tensor[(1, 8, 1, 1, 16), int32] /* ty=Tensor[(1, 8, 1, 1, 16), int32] */, %p3: Tensor[(1, 8, 1, 1, 16), int32] /* ty=Tensor[(1, 8, 1, 1, 16), int32] */, %p4: Tensor[(1, 8, 1, 1, 16), float32] /* ty=Tensor[(1, 8, 1, 1, 16), float32] */, target=meta[Target][0], prim_funcs={'tvmgen_default_fused_nn_contrib_conv2d_NCHWc_subtract_add_cast_multiply_add_floor_cast_clip_cas_db782882eb0ba5d2_'=meta[tir.PrimFunc][0]}, out_layout="NCHW16c", data_layout="NCHW4c", hash="036bd901056bd0fa", kernel_layout="OIHW1i16o4i", prim_fn_var='tvmgen_default_fused_nn_contrib_conv2d_NCHWc_subtract_add_cast_multiply_add_floor_cast_clip_cas_db782882eb0ba5d2_', Primitive=1) -> Tensor[(128, 8, 639, 1, 16), uint8] {
%0 = nn.contrib_conv2d_NCHWc(%p0, %p1, padding=[0, 0, 0, 0], channels=128, kernel_size=[2, 64], data_layout="NCHW4c", kernel_layout="OIHW1i16o4i", out_layout="NCHW16c", out_dtype="int32") /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%1 = subtract(%0, %p2) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%2 = add(%1, %p3) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%3 = cast(%2, dtype="float32") /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%4 = multiply(%3, %p4) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%5 = add(%4, 68.5f /* ty=float32 */) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%6 = floor(%5) /* ty=Tensor[(128, 8, 639, 1, 16), float32] */;
%7 = cast(%6, dtype="int32") /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
%8 = clip(%7, a_min=0f, a_max=255f) /* ty=Tensor[(128, 8, 639, 1, 16), int32] */;
# hardswish
%9 = cast(%8, dtype="uint8") /* ty=Tensor[(128, 8, 639, 1, 16), uint8] */;
reinterpret(%9, dtype="uint8") /* ty=Tensor[(128, 8, 639, 1, 16), uint8] */
relay_primfuncs={llvm -keys=cpu -link-params=0 -mcpu=core-avx2: fn (%p0: Tensor[(128, 8, 639, 1, 16), uint8] /* ty=Tensor[(128, 8, 639, 1, 16), uint8] */, %p1: Tensor[(256), uint8] /* ty=Tensor[(256), uint8] */, src_layout="NCHW16c", hash="8292eeffb256f4b0", prim_funcs={'tvmgen_default_fused_layout_transform_take'=meta[tir.PrimFunc][0]}, dst_layout="NCHW", Primitive=1, prim_fn_var='tvmgen_default_fused_layout_transform_take', target=meta[Target][0]) -> Tensor[(128, 128, 639, 1), uint8] {
%0 = layout_transform(%p0, src_layout="NCHW16c", dst_layout="NCHW") /* ty=Tensor[(128, 128, 639, 1), uint8] */;
take(%p1, %0, axis=0, mode="fast") /* ty=Tensor[(128, 128, 639, 1), uint8] */
} /* ty=fn (Tensor[(128, 8, 639, 1, 16), uint8], Tensor[(256), uint8]) -> Tensor[(128, 128, 639, 1), uint8] */