Thanks for your reply~ The reason for using a large graph is that the build time is too long when the LSTM computation is declared as many small subgraphs: building that version takes more than 3723 s, because the resulting Relay graph is huge (more than 20000 lines of printed text), which is far too time-consuming. The small-subgraph computation code (developed by me, in tvm/relay/frontend/pytorch.py) is:
```python
def _lstm():
    def lstm_cell(unbind_input, input_hidden, cell_param):
        '''
        unbind_input: 2D-Tensor
        input_hidden: tuple(2D-Tensor, 2D-Tensor)
        cell_param: a CellParams object
        return: a tuple (2D-Tensor, 2D-Tensor)
        '''
        hx = input_hidden[0]  # hx is a 2D tensor
        cx = input_hidden[1]  # cx is a 2D tensor
        linear_ih = cell_param.linear_ih(unbind_input)
        linear_hh = cell_param.linear_hh(hx)
        gates = _op.add(linear_ih, linear_hh)
        chunked_gates = _op.split(gates, indices_or_sections=4, axis=1)
        assert len(chunked_gates) == 4
        in_gate = _op.sigmoid(chunked_gates[0])
        forget_gate = _op.sigmoid(chunked_gates[1])
        cell_gate = _op.tanh(chunked_gates[2])
        out_gate = _op.sigmoid(chunked_gates[3])
        cy = _op.add(_op.multiply(forget_gate, cx), _op.multiply(in_gate, cell_gate))
        hy = _op.multiply(out_gate, _op.tanh(cy))
        return hy, cy

    def full_layer(_input_unbind_list, input_hidden, cell_param):
        '''
        _input_unbind_list: a list of 2D-Tensors [2D-Tensor, 2D-Tensor, ..., 2D-Tensor]
        input_hidden: tuple(2D-Tensor, 2D-Tensor)
        cell_param: a CellParams object
        return: step_outputs, hidden
        '''
        step_outputs = []  # a list of 2D-tensors, one per timestep
        hidden = input_hidden
        for i in range(len(_input_unbind_list)):
            hy, cy = lstm_cell(_input_unbind_list[i], hidden, cell_param)
            hidden = (hy, cy)
            step_outputs.append(hy)
        return step_outputs, hidden

    def apply_layer_stack(_input_unbind_list, hiddens, cell_param_list, num_layers):
        '''
        _input_unbind_list: a list of Tensors [[1, 240], [1, 240], ..., [1, 240]]
        hiddens: a list [tuple(2D-tensor, 2D-tensor), tuple(2D-tensor, 2D-tensor)]
        cell_param_list: a list of CellParams; its length equals num_layers
        num_layers: int
        return: layer_input_list is a list of 2D-tensors; final_hiddens is a list whose elements are (2D-tensor, 2D-tensor)
        '''
        assert len(hiddens) == num_layers
        assert len(cell_param_list) == num_layers
        layer_input_list = _input_unbind_list
        final_hiddens = []
        for i in range(num_layers):
            step_output_tensor_list, hidden = full_layer(layer_input_list, hiddens[i], cell_param_list[i])
            final_hiddens.append(hidden)
            layer_input_list = step_output_tensor_list
        return layer_input_list, final_hiddens

    def _lstm_impl(_input, cell_param_list, hx, cx, num_layers, dropout_p, train, bidirectional):
        '''
        _input: 3D Tensor [158, 1, 2048]
        cell_param_list: a list of CellParams; its length equals num_layers
        hx: a 3D-Tensor [2, 1, 1024]
        cx: a 3D-Tensor [2, 1, 1024]
        num_layers: int
        '''
        _input_unbind_list = unbind_func(_input)
        layer_hx = unbind_func(hx)  # layer_hx is a list of 2D tensors
        layer_cx = unbind_func(cx)  # layer_cx is a list of 2D tensors
        assert len(layer_hx) == len(layer_cx)
        assert len(cell_param_list) == len(layer_cx)
        assert len(cell_param_list) == num_layers
        total_layers = len(layer_hx)
        # hiddens is a list [(2D-tensor, 2D-tensor), (2D-tensor, 2D-tensor)]
        hiddens = []
        for i in range(total_layers):
            hiddens.append((layer_hx[i], layer_cx[i]))
        layer_output_list, final_hiddens = apply_layer_stack(_input_unbind_list, hiddens, cell_param_list, num_layers)
        layer_output = _op.stack(layer_output_list, axis=0)
        assert len(final_hiddens) == num_layers
        hy = []
        cy = []
        for i in range(len(final_hiddens)):
            hy.append(final_hiddens[i][0])
            cy.append(final_hiddens[i][1])
        hy_stack = _op.stack(hy, axis=0)
        cy_stack = _op.stack(cy, axis=0)
        return layer_output, hy_stack, cy_stack

    def _impl(inputs, input_types):
        _input = inputs[0]         # 3D-Tensor [316, 1, 240]
        hx = inputs[1]             # TensorList [(2, 1, 1024), (2, 1, 1024)], each a 3D-Tensor
        _params = inputs[2]        # TensorList
        has_bias = inputs[3]       # bool
        num_layers = inputs[4]     # int64_t
        dropout_p = inputs[5]      # double
        train = inputs[6]          # bool
        bidirectional = inputs[7]  # bool
        batch_first = inputs[8]    # bool
        assert len(hx) == 2, "lstm expects two hidden states"
        cell_param_list = gather_params(_params, has_bias)
        results = _lstm_impl(_input, cell_param_list, hx[0], hx[1], num_layers, dropout_p, train, bidirectional)
        return results
    return _impl
```
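For reference, here is my rough estimate of why the unrolled form gets so large (the per-cell op count below is my own assumption, not measured from the real module): every `lstm_cell` call emits on the order of a dozen Relay ops, and the cell body is duplicated for every timestep of every layer, so the printed module grows with sequence_length × num_layers.

```python
# Back-of-envelope estimate of the unrolled Relay graph size (assumed numbers).
ops_per_cell = 13   # assumed: 2 dense, 2 add, 1 split, 3 sigmoid, 2 tanh, 3 multiply
seq_len = 158       # timesteps, from the example input shape [158, 1, 2048]
num_layers = 2
print(ops_per_cell * seq_len * num_layers)
# ~4100 Relay calls, before bias adds, unbind/stack, and type annotations in the printed text
```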
In order to decrease the Relay graph size and the build time, I redefined the LSTM computation declaration as a large graph (one coarse-grained op per layer), like this:
```python
def _lstm_new():
    def _lstm_impl(_input, cell_param_list, hx, cx, num_layers):
        '''
        _input: 3D Tensor [158, 1, 2048]
        cell_param_list: a list of CellParams; its length equals num_layers
        hx: a 3D-Tensor [2, 1, 1024]
        cx: a 3D-Tensor [2, 1, 1024]
        num_layers: int
        return:
            data: 3D Tensor
            final_hidden: a list of 2D-Tensors [hy, cy]
        '''
        layer_hx = unbind_func(hx)  # layer_hx is a list of 2D tensors
        layer_cx = unbind_func(cx)  # layer_cx is a list of 2D tensors
        assert len(layer_hx) == len(layer_cx)
        assert len(layer_hx) == num_layers
        assert len(cell_param_list) == num_layers
        data = _input
        final_hiddens = []
        for i in range(num_layers):
            # one coarse-grained op per layer instead of an unrolled chain of cells
            out_data = _op.nn.lstm_layer(data, layer_hx[i], layer_cx[i],
                                         cell_param_list[i].w_ih, cell_param_list[i].w_hh,
                                         cell_param_list[i].b_ih, cell_param_list[i].b_hh, num_layers)
            data = out_data
        return data, None, None

    def _impl(inputs, input_types):
        _input = inputs[0]         # 3D-Tensor [316, 1, 240]
        hx = inputs[1]             # TensorList [(2, 1, 1024), (2, 1, 1024)], each a 3D-Tensor
        _params = inputs[2]        # TensorList
        has_bias = inputs[3]       # bool
        num_layers = inputs[4]     # int64_t
        dropout_p = inputs[5]      # double
        train = inputs[6]          # bool
        bidirectional = inputs[7]  # bool
        batch_first = inputs[8]    # bool
        assert len(hx) == 2, "lstm expects two hidden states"
        cell_param_list = gather_params(_params, has_bias)
        results = _lstm_impl(_input, cell_param_list, hx[0], hx[1], num_layers)
        return results
    return _impl
```
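To sanity-check this version, I compare the size of the imported Relay module before and after the change, roughly like this (a sketch only: the model/input names and shapes below are illustrative, and it assumes my custom lstm_layer op is registered in my TVM build):

```python
import torch
import tvm
from tvm import relay

# Trace the PyTorch model with a dummy input of the expected shape (illustrative).
scripted_model = torch.jit.trace(model, dummy_input).eval()
shape_list = [("input0", (158, 1, 2048))]   # name/shape are assumptions for illustration
mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

# Compare graph size: the small-subgraph version printed to >20000 lines for me.
print(len(mod.astext(show_meta_data=False).splitlines()), "lines of Relay text")
```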
In this way, the Relay graph becomes much smaller and the build is about three times faster than before. Next, I want to use the auto-scheduler to generate an optimal schedule and complete the import of the RNNT model in TVM. It seems that the large graph is necessary for defining the computation declaration of the LSTM op. So, could you please give me some guidance on the next step? Thank you very much.
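For the auto-scheduler step, this is the flow I currently plan to try (a sketch under my own assumptions, not yet verified: it assumes the new lstm_layer op has a te.compute-based compute registered so that tasks can be extracted from it, and mod/params come from relay.frontend.from_pytorch as above; the target and log-file name are placeholders):

```python
import tvm
from tvm import relay, auto_scheduler

target = tvm.target.Target("llvm")   # placeholder target
log_file = "rnnt_lstm_tuning.json"   # placeholder log path

# Extract tuning tasks from the Relay module (one task per auto-schedulable subgraph).
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=2000,   # adjust to the available time budget
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)
tuner.tune(tune_option)

# Build with the best schedules found during tuning.
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3,
                                   config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=params)
```

Whether extract_tasks can actually see my lstm_layer op depends on how its compute and strategy are registered, which is exactly the part I am unsure about.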