Hi, I am trying to dive into the op fusion and graph optimization mechanisms in TVM. I constructed two simple nets, each with three operators.
from tvm import relay

def layer_norm(data, gamma=None, beta=None, **kwargs):
    name = kwargs.pop("name")
    if not gamma:
        gamma = relay.var(name + "_gamma")
    if not beta:
        beta = relay.var(name + "_beta")
    return relay.nn.layer_norm(data, gamma=gamma, beta=beta, **kwargs)

def mat_mul(data1, data2, **kwargs):
    kwargs.pop("name")  # strip the name so it is not passed to relay.nn.matmul
    return relay.nn.matmul(data1, data2, **kwargs)

# matmul + layer_norm + ReLU
def simplenet(data1, data2, name):
    matmul = mat_mul(data1, data2, name=name + '_mm')
    ln = layer_norm(matmul, name=name + '_ln')
    act = relay.nn.relu(data=ln)
    return act
and
# BN
def batch_norm(data, gamma=None, beta=None, moving_mean=None, moving_var=None, **kwargs):
    name = kwargs.pop("name")
    if not gamma:
        gamma = relay.var(name + "_gamma")
    if not beta:
        beta = relay.var(name + "_beta")
    if not moving_mean:
        moving_mean = relay.var(name + "_moving_mean")
    if not moving_var:
        moving_var = relay.var(name + "_moving_var")
    return relay.nn.batch_norm(data, gamma=gamma, beta=beta,
                               moving_mean=moving_mean, moving_var=moving_var, **kwargs)[0]

# conv2d
def conv2d(data, weight=None, **kwargs):
    name = kwargs.pop("name")
    if not weight:
        weight = relay.var(name + "_weight")
    return relay.nn.conv2d(data, weight, **kwargs)

# conv2d + BN + ReLU
def simplenet(data, name, channels, kernel_size=(3, 3), strides=(1, 1), padding=(1, 1), epsilon=1e-5):
    conv = conv2d(
        data=data,
        channels=channels,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        data_layout='NCHW',
        name=name + '_conv')
    bn = batch_norm(data=conv, epsilon=epsilon, name=name + '_bn')
    act = relay.nn.relu(data=bn)
    return act
When I ran relay.transform.FuseOps() on the first net, I got:
Attribute TOpPattern has not been registered for nn.layer_norm
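For reference, this is roughly how I build the module and run the pass (a minimal sketch; the input shapes and names are just placeholders I picked for testing):

import tvm
from tvm import relay

# Placeholder shapes, only so the module can be typed for this test
data1 = relay.var("data1", shape=(8, 16))
data2 = relay.var("data2", shape=(16, 32))
net = simplenet(data1, data2, "net1")
func = relay.Function(relay.analysis.free_vars(net), net)
mod = tvm.IRModule.from_expr(func)

# Run type inference first, then op fusion
mod = relay.transform.InferType()(mod)
mod = relay.transform.FuseOps(fuse_opt_level=2)(mod)
print(mod)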
FuseOps on the second net with batch_norm works fine. I have noticed that RELAY_REGISTER_OP("nn.batch_norm") sets the TOpPattern attribute to kOutEWiseFusable for batch_norm, while RELAY_REGISTER_OP("nn.layer_norm") does not register a TOpPattern at all.
I have read https://discuss.tvm.apache.org/t/why-doesnt-nn-layer-norm-have-toppattern/7046; it suggests that registering a TOpPattern for layer_norm would keep later optimizations from splitting (decomposing) the op. But why doesn't the same concern apply to batch_norm? What is the main difference between the two ops here?
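For what it's worth, a possible stopgap seems to be registering the attribute from Python before running the pass (just a sketch; I am not sure OUT_ELEMWISE_FUSABLE is actually the right pattern for layer_norm, I only mirrored what batch_norm uses):

import tvm
from tvm.relay.op import OpPattern

# Assumption: mirroring batch_norm's kOutEWiseFusable; OPAQUE (no fusion) might be
# the safer choice if layer_norm is really meant to be decomposed before fusion.
tvm.ir.register_op_attr("nn.layer_norm", "TOpPattern", OpPattern.OUT_ELEMWISE_FUSABLE)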
Any help would be appreciated!