Device: NVIDIA T4, CUDA version 10.2
# Load BERT (FP32), trace it on CPU, and import the trace into TVM Relay.
import os

import torch
import tvm
import tvm.relay
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

model = BertForMaskedLM.from_pretrained('bert-large-uncased')
# BUG FIX: the original assigned the model to `model` but then referenced the
# undefined name `bert_model_origin` (NameError). Bind it explicitly here.
bert_model_origin = model.to("cpu")
# Put the model in eval mode BEFORE tracing, otherwise dropout is active and
# the traced graph (and any calibration done on it) is non-deterministic.
bert_model_origin.eval()

# Dummy token-id input: batch of 1, sequence length 256, ids in [0, 100).
example_tensor = torch.randint(0, 100, (1, 256))
trace_model_fp32 = torch.jit.trace(bert_model_origin, [example_tensor.to("cpu")])

# Build (input_name, shape) pairs for every graph input except `self` (index 0).
shape_list = [(i.debugName().split('.')[0], i.type().sizes())
              for i in list(trace_model_fp32.graph.inputs())[1:]]
mod_bert_fp32, params_bert_fp32 = tvm.relay.frontend.pytorch.from_pytorch(trace_model_fp32, shape_list)
def test_int8_bert_model_tvm(mod_bert, params_bert, dataset, infer_count=1000):
    """Quantize a Relay BERT module to INT8, build it for CUDA, and benchmark it.

    The pasted original had all indentation stripped and was not valid Python;
    this restores the intended structure unchanged.

    Args:
        mod_bert: Relay module imported from the traced FP32 model.
        params_bert: Relay parameter dict for ``mod_bert``.
        dataset: indexable calibration/eval dataset; each record is assumed to
            be a dict with an ``'index_tokens'`` tensor — TODO confirm schema.
        infer_count: number of inference runs; ``0`` skips inference entirely.
    """
    target = tvm.target.cuda()
    dev = tvm.device(target.kind.name, 0)

    def calibrate_dataset(calibrate_num=100, input_name='input_ids'):
        # Yield calibration feed-dicts, one record at a time, as TVM NDArrays.
        for i in range(calibrate_num):
            record = dataset[i]
            index_tokens = record['index_tokens'].to("cpu")
            index_tokens_tvm = tvm.nd.array(index_tokens.numpy(), dev)
            yield {input_name: index_tokens_tvm}

    # NOTE(review): quantizing every layer of BERT with a global KL/max scheme
    # can collapse accuracy (the reported 0.003 vs 0.886 FP32); consider
    # skipping sensitive layers (e.g. qconfig skip options) — verify.
    with relay.quantize.qconfig(calibrate_mode="kl_divergence", weight_scale="max"):
        mod = relay.quantize.quantize(mod_bert, params_bert, dataset=calibrate_dataset())

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target.kind.name, params=params_bert)
    lib.export_library(os.path.realpath("bert_tvm_int8_cuda.tar"))

    if infer_count != 0:
        tvm_inference(lib, dataset, target, infer_count=infer_count)
The final INT8 model scores an unbelievably low 0.003 accuracy, while the FP32 model reaches 0.886.