Raise RuntimeError during tuning

llunncai · May 13, 2020, 7:53am

Hi All,

I’m trying to use XGBTuner to tune a TensorFlow GraphDef. During tuning, the program raises a RuntimeError:

[Task  1/29]  Current/Best:  855.03/ 855.03 GFLOPS | Progress: (10/10) | 19.83 s Done.
[Task  2/29]  Current/Best:  900.18/ 900.18 GFLOPS | Progress: (10/10) | 14.18 s Done.
[Task  3/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 1.55 s Done.
[Task  4/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 1.54 s Done.
[Task  5/29]  Current/Best:  456.57/ 456.57 GFLOPS | Progress: (10/10) | 11.43 s Done.
[Task  6/29]  Current/Best:   58.65/  58.65 GFLOPS | Progress: (10/10) | 6.24 s Done.
[Task  7/29]  Current/Best:  181.31/1047.29 GFLOPS | Progress: (10/10) | 11.71 s Done.
[Task  8/29]  Current/Best:  137.09/2866.60 GFLOPS | Progress: (10/10) | 16.37 s Done.
[Task  9/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 13.41 s Done.
[Task 10/29]  Current/Best:   61.00/  69.72 GFLOPS | Progress: (10/10) | 7.20 s Done.
[Task 11/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 25.81 s Done.
[Task 12/29]  Current/Best: 2322.48/2322.48 GFLOPS | Progress: (10/10) | 8.31 s Done.
[Task 13/29]  Current/Best: 1629.79/1761.38 GFLOPS | Progress: (10/10) | 17.89 s Done.
[Task 14/29]  Current/Best: 1316.37/1316.37 GFLOPS | Progress: (10/10) | 14.23 s Done.
[Task 15/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 41.41 s Done.
[Task 16/29]  Current/Best:  124.88/ 124.88 GFLOPS | Progress: (10/10) | 14.47 s Done.
[Task 17/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 18.85 s Done.
[Task 18/29]  Current/Best:  248.92/ 248.92 GFLOPS | Progress: (10/10) | 18.38 s Done.
[Task 19/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (10/10) | 13.27 s Done.
[Task 20/29]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
Traceback (most recent call last):

  File "convert_model.py", line 172, in <module>
    tune_and_optimize()

  File "convert_model.py", line 85, in tune_and_optimize
    tune_config=tune_config

  File "/root/code/tf_tvm/tf_tvm/tf_tvm_converter.py", line 352, in optimize_sub_graph
    log_filename=tvm_log_file_path

  File "/root/code/tf_tvm/tf_tvm/tf_tvm_converter.py", line 412, in tune_tasks
    autotvm.callback.log_to_file(tmp_log_file)])

  File "/root/code/git/tvm/python/tvm/autotvm/tuner/xgboost_tuner.py", line 90, in tune
    super(XGBTuner, self).tune(*args, **kwargs)

  File "/root/code/git/tvm/python/tvm/autotvm/tuner/tuner.py", line 131, in tune
    results = measure_batch(inputs)

  File "/root/code/git/tvm/python/tvm/autotvm/measure/measure.py", line 262, in measure_batch
    results = runner.run(measure_inputs, build_results)

  File "/root/code/git/tvm/python/tvm/autotvm/measure/measure_methods.py", line 278, in run
    raise Exception(f'encountered exception during measurement: {results}')

Exception: encountered exception during measurement: [MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n  
[bt] (5) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f272e626d11]\n  
[bt] (4) /root/code/git/tvm/build/libtvm.so(+0x15ae132) [0x7f272e6a2132]\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x21f) [0x7f272e6a5b8f]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)+0x57) [0x7f272e690e37]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)+0x36e) [0x7f272e688f9e]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x64) [0x7f272dcb01e4]\n  
File "/root/code/git/tvm/src/runtime/rpc/rpc_endpoint.cc", line 799\n
TVMError: Check failed: code == RPCCode::kReturn: code=1'),), error_no=4, all_cost=5.984677791595459, timestamp=1589350788.8437154), MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  
[bt] (4) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f272e626d11]\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(+0xc01bb1) [0x7f272dcf5bb1]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ca) [0x7f272dcf464a]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x50d) [0x7f272e0538ed]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(+0x152f83b) [0x7f272e62383b]\n  
File "/root/code/git/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 78, in cfun\n    
rv = local_pyfunc(*pyargs)\n  
File "/root/code/git/tvm/python/tvm/autotvm/measure/measure_methods.py", line 623, in verify_pass\n    
raise InstantiationError("Skipped because of invalid gpu kernel")\n
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel'),), error_no=1, all_cost=0.14818263053894043, timestamp=1589350780.9490929), MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n  
[bt] (5) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f272e626d11]\n  
[bt] (4) /root/code/git/tvm/build/libtvm.so(+0x15ae132) [0x7f272e6a2132]\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x21f) [0x7f272e6a5b8f]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)+0x57) [0x7f272e690e37]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)+0x36e) [0x7f272e688f9e]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x64) [0x7f272dcb01e4]\n  
File "/root/code/git/tvm/src/runtime/rpc/rpc_endpoint.cc", line 799\nTVMError: Check failed: code == RPCCode::kReturn: code=1'),), error_no=4, all_cost=6.203210830688477, timestamp=1589350794.2938511), MeasureResult(costs=(InstantiationError('Traceback (most recent call last):\n  [bt] (4) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f272e626d11]\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(+0xc01bb1) [0x7f272dcf5bb1]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x3ca) [0x7f272dcf464a]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const+0x50d) [0x7f272e0538ed]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(+0x152f83b) [0x7f272e62383b]\n  File "/root/code/git/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 78, in cfun\n    
rv = local_pyfunc(*pyargs)\n  
File "/root/code/git/tvm/python/tvm/autotvm/measure/measure_methods.py", line 623, in verify_pass\n    
raise InstantiationError("Skipped because of invalid gpu kernel")\n
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel'),), error_no=1, all_cost=0.14766669273376465, timestamp=1589350781.224462), MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n  
[bt] (5) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f272e626d11]\n  
[bt] (4) /root/code/git/tvm/build/libtvm.so(+0x15ae132) [0x7f272e6a2132]\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x21f) [0x7f272e6a5b8f]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)+0x57) [0x7f272e690e37]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)+0x36e) [0x7f272e688f9e]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x64) [0x7f272dcb01e4]\n  File "/root/code/git/tvm/src/runtime/rpc/rpc_endpoint.cc", line 799\n
TVMError: Check failed: code == RPCCode::kReturn: code=1'),), error_no=4, all_cost=5.874715089797974, timestamp=1589350799.7380528), MeasureResult(costs=('Traceback (most recent call last):\n  
[bt] (3) /root/code/git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f5a02e22d11]\n  
[bt] (2) /root/code/git/tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::PackedFunc tvm::runtime::detail::PackFuncVoidAddr_<4, tvm::runtime::CUDAWrappedFunc>(tvm::runtime::CUDAWrappedFunc, std::vector<tvm::runtime::detail::ArgConvertCode, std::allocator<tvm::runtime::detail::ArgConvertCode> > const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0xbc) [0x7f5a02ec646c]\n  
[bt] (1) /root/code/git/tvm/build/libtvm.so(tvm::runtime::CUDAWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*, void**) const+0x665) [0x7f5a02ec5f45]\n  
[bt] (0) /root/code/git/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x64) [0x7f5a024ac1e4]\n  File "/root/code/git/tvm/src/runtime/cuda/cuda_module.cc", line 190\n  File "/root/code/git/tvm/src/runtime/rpc/rpc_endpoint.cc", line 370\nRPCError: Error caught from RPC call:\n[14:20:00] /root/code/git/tvm/src/runtime/library_module.cc:78: Check failed: ret == 0 (-1 vs. 0) : TVMError: CUDALaunch Error: CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\n 
grid=(1,4,16),  block=(29,16,1)\n
// func_name=default_function_kernel1\n
// CUDA Source\n
// -----------\n
//\n
// Generated by NVIDIA NVVM Compiler\n
//\n
// Compiler Build ID: CL-24817639\n
// Cuda compilation tools, release 10.0, V10.0.130\n
// Based on LLVM 3.4svn\n
//\n\n
.version 6.3
\n.target sm_61
\n.address_size 64\n\n\t
// .globl\tdefault_function_kernel1\n
// _ZZ24default_function_kernel1E18kernel_pack_shared has been demoted\n
// _ZZ24default_function_kernel1E16data_pack_shared has been demoted\n\n.visible .entry default_function_kernel1
(\n\t.param .u64 default_function_kernel1_param_0,\n\t.param .u64 default_function_kernel1_param_1,\n\t.param .u64 default_function_kernel1_param_2\n)\n
{\n\t.reg .pred \t%p<54>;\n\t.reg .f32 \t%f<747>;\n\t.reg .b32 \t%r<248>;\n\t.reg .b64 \t%rd<17>;\n\t// demoted variable\n\t.shared .align 4 .b8 _ZZ24default_function_kernel1E18kernel_pack_shared[2048];\n\t// demoted variable\n\t.shared .align 4 .b8 _ZZ24default_function_kernel1E16data_pack_shared[26912];\n\n\tld.param.u64 \t%rd2, [default_function_kernel1_param_0];\n\tld.param.u64 \t%rd3, [default_function_kernel1_param_1];\n\tmov.u32 \t%r246, 0;\n\tmov.f32 \t%f515, 0f00000000;\n\tcvta.to.global.u64 \t%rd11, %rd3;\n\tmov.f32 \t%f516, %f515;\n\tmov.f32 \t%f517, %f515;\n\tmov.f32 \t%f518, %f515;\n\tmov.f32 \t%f519, %f515;\n\tmov.f32 \t%f520, %f515;\n\tmov.f32 \t%f521, %f515;\n\tmov.f32 \t%f522, %f515;\n\tmov.f32 \t%f523, %f515;\n\tmov.f32 \t%f524, %f515;\n\tmov.f32 \t%f525, %f515;\n\tmov.f32 \t%f526, %f515;\n\tmov.f32 \t%f527, %f515;\n\tmov.f32 \t%f528, %f515;\n\tmov.f32 \t%f529, %f515;\n\tmov.f32 \t%f530, %f515;\n\tmov.f32 \t%f531, %f515;\n\tmov.f32 \t%f532, %f515;\n\tmov.f32 \t%f533, %f515;\n\tmov.f32 \t%f534, %f515;\n\tmov.f32 \t%f535, %f515;\n\tmov.f32 \t%f536, %f515;\n\tmov.f32 \t%f537, %f515;\n\tmov.f32 \t%f538, %f515;\n\tmov.f32 \t%f539, %f515;\n\tmov.f32 \t%f540, %f515;\n\tmov.f32 \t%f541, %f515;\n\tmov.f32 \t%f542, %f515;\n\tmov.f32 \t%f543, %f515;\n\tmov.f32 \t%f544, %f515;\n\tmov.f32 \t%f545, %f515;\n\tmov.f32 \t%f546, %f515;\n\tmov.f32 \t%f547, %f515;\n\tmov.f32 \t%f548, %f515;\n\tmov.f32 \t%f549, %f515;\n\tmov.f32 \t%f550, %f515;\n\tmov.f32 \t%f551, %f515;\n\tmov.f32 \t%f552, %f515;\n\tmov.f32 \t%f553, %f515;\n\tmov.f32 \t%f554, %f515;\n\tmov.f32 \t%f555, %f515;\n\tmov.f32 \t%f556, %f515;\n\tmov.f32 \t%f557, %f515;\n\tmov.f32 \t%f558, %f515;\n\tmov.f32 \t%f559, %f515;\n\tmov.f32 \t%f560, %f515;\n\tmov.f32 \t%f561, %f515;\n\tmov.f32 \t%f562, %f515;\n\tmov.f32 \t%f563, %f515;\n\tmov.f32 \t%f564, %f515;\n\tmov.f32 \t%f565, %f515;\n\tmov.f32 \t%f566, %f515;\n\tmov.f32 \t%f567, %f515;\n\tmov.f32 \t%f568, %f515;\n\tmov.f32 \t%f569, %f515;\n\tmov.f32 \t%f570, 
%f515;\n\tmov.f32 \t%f571, %f515;\n\tmov.f32 \t%f572, %f515;\n\tmov.f32 \t%f573, %f515;\n\tmov.f32 \t%f574, %f515;\n\tmov.f32 \t%f575, %f515;\n\tmov.f32 \t%f576, %f515;\n\tmov.f32 \t%f577, %f515;\n\tmov.f32 \t%f578, %f515;\n\tmov.f32 \t%f579, %f515;\n\tmov.f32 \t%f580, %f515;\n\tmov.f32 \t%f581, %f515;\n\tmov.f32 \t%f582, %f515;\n\tmov.f32 \t%f583, %f515;\n\tmov.f32 \t%f584, %f515;\n\tmov.f32 \t%f585, %f515;\n\tmov.f32 \t%f586, %f515;\n\tmov.f32 \t%f587, %f515;\n\tmov.f32 \t%f588, %f515;\n\tmov.f32 \t%f589, %f515;\n\tmov.f32 \t%f590, %f515;\n\tmov.f32 \t%f591, %f515;\n\tmov.f32 \t%f592, %f515;\n\tmov.f32 \t%f593, %f515;\n\tmov.f32 \t%f594, %f515;\n\tmov.f32 \t%f595, %f515;\n\tmov.f32 \t%f596, %f515;\n\tmov.f32 \t%f597, %f515;\n\tmov.f32 \t%f598, %f515;\n\tmov.f32 \t%f599, %f515;\n\tmov.f32 \t%f600, %f515;\n\tmov.f32 \t%f601, %f515;\n\tmov.f32 \t%f602, %f515;\n\tmov.f32 \t%f603, %f515;\n\tmov.f32 \t%f604, %f515;\n\tmov.f32 \t%f605, %f515;\n\tmov.f32 \t%f606, %f515;\n\tmov.f32 \t%f607, %f515;\n\tmov.f32 \t%f608, %f515;\n\tmov.f32 \t%f609, %f515;\n\tmov.f32 \t%f610, %f515;\n\tmov.f32 \t%f611, %f515;\n\tmov.f32 \t%f612, %f515;\n\tmov.f32 \t%f613, %f515;\n\tmov.f32 \t%f614, %f515;\n\tmov.f32 \t%f615, %f515;\n\tmov.f32 \t%f616, %f515;\n\tmov.f32 \t%f617, %f515;\n\tmov.f32 \t%f618, %f515;\n\tmov.f32 n\tret;\n}\n\n\n\n',), error_no=7, all_cost=4, timestamp=1589350800.3609674)]

Process Process-1:
Traceback (most recent call last):
  File "/root/code/git/tvm/python/tvm/rpc/base.py", line 164, in connect_with_retry
    sock.connect(addr)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/root/code/git/tvm/python/tvm/rpc/server.py", line 196, in _listen_loop
    raise exc
  File "/root/code/git/tvm/python/tvm/rpc/server.py", line 176, in _listen_loop
    tracker_conn = base.connect_with_retry(tracker_addr)
  File "/root/code/git/tvm/python/tvm/rpc/base.py", line 172, in connect_with_retry
    "Failed to connect to server %s" % str(addr))
RuntimeError: Failed to connect to server ('0.0.0.0', 9001)

Could anyone help me out? Thanks a lot!

huochaitiantang · May 13, 2020, 9:43am

Hi, I encountered the same problem. This may be a bug introduced by PR-5417; I have submitted PR-5586 to fix it.

llunncai · May 13, 2020, 10:18am

Thanks, @huochaitiantang. You are a life saver!