Hello,
I am trying to run auto-tuning with MPS (the CUDA Multi-Process Service).
Auto-tuning works fine without MPS, but after I start MPS with
nvidia-cuda-mps-control -d
and echo set_default_active_thread_percentage 10 |nvidia-cuda-mps-control
errors occur during auto-tuning. The output is:
[Task 1/16] Current/Best: 76.56/3196.54 GFLOPS | Progress: (32/1000) | 62.25 s/home/ly/.local/lib/python3.6/site-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated. See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html
warnings.warn(f'Old style callback is deprecated. See: {link}', UserWarning)
[Task 1/16] Current/Best: 0.00/3196.54 GFLOPS | Progress: (64/1000) | 219.53 s
[Task 1/16] Current/Best: 0.00/3196.54 GFLOPS | Progress: (96/1000) | 365.95 s
[Task 1/16] Current/Best: 0.00/3196.54 GFLOPS | Progress: (128/1000) | 590.82 s
[Task 1/16] Current/Best: 0.00/3196.54 GFLOPS | Progress: (160/1000) | 709.84 s
[Task 1/16] Current/Best: 0.00/3196.54 GFLOPS | Progress: (192/1000) | 917.30 sWARNING:autotvm:Too many errors happen in the tuning. Switching to debug mode.
DEBUG:autotvm:No: 193 GFLOPS: 0.00/3196.54 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n 80: 0xffffffffffffffff\n 79: 0x000000000041da3f\n 78: __libc_start_main\n 77: main\n at ./Programs/python.c:69\n 76: Py_Main\n at Modules/main.c:751\n 75: RunModule\n at Modules/main.c:215\n 74: PyObject_Call\n at Objects/abstract.c:2261\n 73: function_call\n at Objects/funcobject.c:604\n 72: PyEval_EvalCodeEx\n at Python/ceval.c:4187\n 71: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 70: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 69: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 68: call_function\n at Python/ceval.c:4872\n 67: fast_function\n at Python/ceval.c:4992\n 66: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 65: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 64: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 63: call_function\n at Python/ceval.c:4851\n 62: _PyCFunction_FastCallKeywords\n at Objects/methodobject.c:294\n 61: _PyCFunction_FastCallDi',),), error_no=MeasureErrorNo.RUNTIME_DEVICE, all_cost=5.854816436767578, timestamp=1656854655.196452) [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 2, 1, 28]), ('tile_x', [-1, 2, 2, 1]), ('tile_rc', [-1, 1]), ('tile_ry', [-1, 1]), ('tile_rx', [-1, 7]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,59410786
DEBUG:autotvm:No: 194 GFLOPS: 0.00/3196.54 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n 80: 0xffffffffffffffff\n 79: 0x000000000041da3f\n 78: __libc_start_main\n 77: main\n at ./Programs/python.c:69\n 76: Py_Main\n at Modules/main.c:751\n 75: RunModule\n at Modules/main.c:215\n 74: PyObject_Call\n at Objects/abstract.c:2261\n 73: function_call\n at Objects/funcobject.c:604\n 72: PyEval_EvalCodeEx\n at Python/ceval.c:4187\n 71: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 70: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 69: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 68: call_function\n at Python/ceval.c:4872\n 67: fast_function\n at Python/ceval.c:4992\n 66: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 65: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 64: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 63: call_function\n at Python/ceval.c:4851\n 62: _PyCFunction_FastCallKeywords\n at Objects/methodobject.c:294\n 61: _PyCFunction_FastCallDi',),), error_no=MeasureErrorNo.RUNTIME_DEVICE, all_cost=6.979259014129639, timestamp=1656854659.4935715) [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 2, 1, 28]), ('tile_x', [-1, 2, 2, 1]), ('tile_rc', [-1, 1]), ('tile_ry', [-1, 1]), ('tile_rx', [-1, 7]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,72581986
DEBUG:autotvm:No: 195 GFLOPS: 0.00/3196.54 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n 80: 0xffffffffffffffff\n 79: 0x000000000041da3f\n 78: __libc_start_main\n 77: main\n at ./Programs/python.c:69\n 76: Py_Main\n at Modules/main.c:751\n 75: RunModule\n at Modules/main.c:215\n 74: PyObject_Call\n at Objects/abstract.c:2261\n 73: function_call\n at Objects/funcobject.c:604\n 72: PyEval_EvalCodeEx\n at Python/ceval.c:4187\n 71: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 70: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 69: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 68: call_function\n at Python/ceval.c:4872\n 67: fast_function\n at Python/ceval.c:4992\n 66: _PyEval_EvalCodeWithName\n at Python/ceval.c:4166\n 65: PyEval_EvalFrameEx\n at Python/ceval.c:754\n 64: _PyEval_EvalFrameDefault\n at Python/ceval.c:3335\n 63: call_function\n at Python/ceval.c:4851\n 62: _PyCFunction_FastCallKeywords\n at Objects/methodobject.c:294\n 61: _PyCFunction_FastCallDi',),), error_no=MeasureErrorNo.RUNTIME_DEVICE, all_cost=5.633193492889404, timestamp=1656854663.790965) [('tile_f', [-1, 1, 16, 1]), ('tile_y', [-1, 2, 1, 28]), ('tile_x', [-1, 2, 2, 1]), ('tile_rc', [-1, 1]), ('tile_ry', [-1, 1]), ('tile_rx', [-1, 7]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,19897186
In the log file generated by auto-tuning, many of the entries after the 32nd have error code 4:
RUNTIME_DEVICE = 4 # error when run program on device
I noticed that an MPS option exists in TVM's build/config.cmake:
# Whether use MPS:
set(USE_MPS OFF)
I set it to ON and recompiled TVM, but the problem remains.
Code demo:
# Network to build and tune, in the "<family>-<num_layers>" form parsed by get_model.
NETWORK = "resnet-18"
# File the tuning records are logged to and, when requested, read back from.
TUNING_PATH = "resnet-18-10%.json"
# Compilation target; for a CPU run use "llvm -mcpu=skylake-avx512" instead.
TARGET = "cuda"
# Element type passed to the workload constructor.
dtype = "float32"
import tvm.relay as relay
import tvm
import tvm.relay.testing
from tvm.contrib import graph_executor
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm
def get_model(name, batch_size):
    """Return a pre-defined Relay workload together with its I/O shapes.

    Parameters
    ----------
    name : str
        Network identifier of the form ``"resnet-<num_layers>"``,
        e.g. ``"resnet-18"``.
    batch_size : int
        Batch dimension used for the workload and for both returned shapes.

    Returns
    -------
    tuple
        ``(mod, params, input_shape, output_shape)`` where ``mod`` and
        ``params`` are what ``relay.testing.resnet.get_workload`` returns.

    Raises
    ------
    ValueError
        If ``name`` is not a resnet identifier or carries no integer layer
        count after the dash.
    """
    # Reject unsupported names up front; otherwise ``mod``/``params`` would be
    # unbound at the return statement below.
    if "resnet" not in name:
        raise ValueError("Unsupported network: {!r}".format(name))
    try:
        n_layer = int(name.split("-")[1])
    except (IndexError, ValueError):
        raise ValueError(
            "Expected a name like 'resnet-18', got {!r}".format(name)
        ) from None

    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    # Load the pre-defined network from tvm.relay.testing.
    mod, params = relay.testing.resnet.get_workload(
        num_layers=n_layer, batch_size=batch_size, dtype=dtype
    )
    return mod, params, input_shape, output_shape
def model_compile(tuning=False):
    """Build NETWORK for TARGET and wrap the result in a graph executor.

    Parameters
    ----------
    tuning : bool
        When true, compile inside ``autotvm.apply_history_best(TUNING_PATH)``
        so the records in that file are applied; otherwise compile without
        any tuning records.

    Returns
    -------
    tuple
        ``(module, mod, params)``: the ``GraphModule`` created on device 0,
        the Relay module and its parameters.
    """
    # Load the model; the shapes returned by get_model are not needed here.
    mod, params, _, _ = get_model(NETWORK, batch_size=1)

    # No task extraction here: its result was never used by the build, and
    # model_tuning extracts the tasks it tunes itself.

    # Optionally apply the tuning records while building.
    if tuning:
        with autotvm.apply_history_best(TUNING_PATH):
            with tvm.transform.PassContext(opt_level=3, config={}):
                lib = relay.build(mod, target=TARGET, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=TARGET, params=params)

    dev = tvm.device(str(TARGET), 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    return module, mod, params
def model_tuning(mod, params):
    """Tune the conv2d tasks of ``mod`` one after another with the XGBoost tuner.

    Parameters
    ----------
    mod
        Relay module whose ``"main"`` function the tasks are extracted from.
    params
        Parameters passed to task extraction alongside ``mod``.

    Every measurement is appended to TUNING_PATH through the
    ``log_to_file`` callback. Nothing is returned.
    """
    runner = autotvm.LocalRunner(
        number=10,
        repeat=3,
        timeout=4,
        min_repeat_ms=150,
        # enable_cpu_cache_flush=True,
    )
    tuning_option = {
        "tuner": "xgb",
        "trials": 200,
        "early_stopping": 500,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10), runner=runner
        ),
        "tuning_records": TUNING_PATH,
    }

    # Begin by extracting the conv2d tasks from the program.
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=TARGET,
        params=params,
        ops=(relay.op.get("nn.conv2d"),),
    )

    # Tune the extracted tasks sequentially.
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        # Clamp once so the progress bar total matches the number of trials
        # that tune() actually runs when the config space is small.
        n_trial = min(tuning_option["trials"], len(task.config_space))
        tuner_obj = XGBTuner(task, loss_type="rank")
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=tuning_option["early_stopping"],
            measure_option=tuning_option["measure_option"],
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(tuning_option["tuning_records"]),
            ],
        )
# Run the build-then-tune flow only when executed as a script, so importing
# this file does not start a GPU build and a tuning run.
if __name__ == "__main__":
    # Build once without tuning records to obtain the Relay module and its
    # parameters, then tune them; the returned executor is not used here.
    module, mod, params = model_compile()
    model_tuning(mod, params)
tvm:v0.8.0
OS: CentOS Linux release 7.9.2009
target hardware device: A100-PCIE-40GB
cuda:11