Hi everyone, I’m trying to tune a simple TIR module with meta_schedule.tune_tir for Vulkan on both Linux and Windows. Both TVM installations are built from source at version 0.20.0. The Linux build works fine and tunes successfully, but the Windows build never seems to generate legal GPU code.
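For context, here is a minimal sketch of the kind of setup I am running. The TVMScript workload below is reconstructed from the design-space print later in this post, and the work_dir and trial budget are illustrative values rather than my exact script:

import tvm
from tvm import meta_schedule as ms
from tvm.script import ir as I
from tvm.script import tir as T

# Y[1, 1024] = X[1, 8192] @ W[8192, 1024] + B[1, 1024], accumulated in int16.
@I.ir_module
class Module:
    @T.prim_func
    def main(
        X: T.Buffer((1, 8192), "int8"),
        W: T.Buffer((8192, 1024), "int8"),
        B: T.Buffer((1, 1024), "int16"),
        Y: T.Buffer((1, 1024), "int16"),
    ):
        T.func_attr({"tir.noalias": T.bool(True)})
        S = T.alloc_buffer((1, 1024), "int16")
        # int8 x int8 matmul accumulated into int16
        for i, j, k in T.grid(1, 1024, 8192):
            with T.block("B_update"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    S[vi, vj] = T.int16(0)
                S[vi, vj] = S[vi, vj] + T.Cast("int16", X[vi, vk]) * T.Cast("int16", W[vk, vj])
        # bias add
        for i, j in T.grid(1, 1024):
            with T.block("Y_update"):
                vi, vj = T.axis.remap("SS", [i, j])
                Y[vi, vj] = S[vi, vj] + B[vi, vj]

target = tvm.target.Target("vulkan")
database = ms.tune_tir(
    mod=Module,
    target=target,
    work_dir="./tune_logs",   # illustrative
    max_trials_global=1000,   # illustrative
)

On Windows, the tuning logs then contain errors like this: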
2025-09-05 14:24:16 [INFO] [task_scheduler.cc:164] Total 1 design space(s) generated
2025-09-05 14:24:16 [INFO] [task_scheduler.cc:170] Design space #0:
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(X: T.Buffer((1, 8192), "int8"), W: T.Buffer((8192, 1024), "int8"), B: T.Buffer((1, 1024), "int16"), Y: T.Buffer((1, 1024), "int16")):
        T.func_attr({"tir.const_params": ["B", "W"], "tir.noalias": T.bool(True)})
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.unroll_explicit": 0})
            sum_local = T.alloc_buffer((1, 1024), "int16", scope="local")
            X_shared = T.alloc_buffer((1, 8192), "int8", scope="shared")
            W_shared = T.alloc_buffer((8192, 1024), "int8", scope="shared")
            for i_0_j_0_fused in T.thread_binding(16, thread="blockIdx.x"):
                for i_1_j_1_fused in T.thread_binding(8, thread="vthread.x"):
                    for i_2_j_2_fused in T.thread_binding(1, thread="threadIdx.x"):
                        for k_0 in range(128):
                            for ax0_ax1_fused in range(64):
                                with T.block("X_shared"):
                                    v0 = T.axis.spatial(1, 0)
                                    v1 = T.axis.spatial(8192, k_0 * 64 + ax0_ax1_fused)
                                    T.reads(X[v0, v1])
                                    T.writes(X_shared[v0, v1])
                                    T.block_attr({"meta_schedule.cooperative_fetch": 1})
                                    X_shared[v0, v1] = X[v0, v1]
                            for ax0_ax1_fused in range(4096):
                                with T.block("W_shared"):
                                    v0 = T.axis.spatial(8192, k_0 * 64 + ax0_ax1_fused // 64)
                                    v1 = T.axis.spatial(1024, i_0_j_0_fused * 64 + ax0_ax1_fused % 64)
                                    T.reads(W[v0, v1])
                                    T.writes(W_shared[v0, v1])
                                    T.block_attr({"meta_schedule.cooperative_fetch": 2})
                                    W_shared[v0, v1] = W[v0, v1]
                            for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(8, 1, 8, 8, 1, 1):
                                with T.block("B_update"):
                                    vi = T.axis.spatial(1, i_3 + i_4)
                                    vj = T.axis.spatial(1024, i_0_j_0_fused * 64 + i_1_j_1_fused * 8 + j_3 + j_4)
                                    vk = T.axis.reduce(8192, k_0 * 64 + k_1 * 8 + k_2)
                                    T.reads(X_shared[vi, vk], W_shared[vk, vj])
                                    T.writes(sum_local[vi, vj])
                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 1, "meta_schedule.tiling_structure": "SSSRRSRS"})
                                    with T.init():
                                        sum_local[vi, vj] = T.int16(0)
                                    sum_local[vi, vj] = sum_local[vi, vj] + T.Cast("int16", X_shared[vi, vk]) * T.Cast("int16", W_shared[vk, vj])
                        for ax0, ax1 in T.grid(1, 8):
                            with T.block("sum_local"):
                                v0 = T.axis.spatial(1, ax0)
                                v1 = T.axis.spatial(1024, i_0_j_0_fused * 64 + i_1_j_1_fused * 8 + ax1)
                                T.reads(sum_local[v0, v1], B[v0, v1])
                                T.writes(Y[v0, v1])
                                Y[v0, v1] = sum_local[v0, v1] + B[v0, v1]
b0 = sch.get_block(name="B_update", func_name="main")
b1 = sch.get_block(name="Y_update", func_name="main")
b2 = sch.get_block(name="root", func_name="main")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
l3, l4, l5 = sch.get_loops(block=b0)
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1])
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10], preserve_unit_iters=True, disable_predication=False)
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[16, 8, 1, 8, 1])
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20], preserve_unit_iters=True, disable_predication=False)
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[128, 8, 8])
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28], preserve_unit_iters=True, disable_predication=False)
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25)
l32 = sch.fuse(l11, l21, preserve_unit_iters=True)
sch.bind(loop=l32, thread_axis="blockIdx.x")
l33 = sch.fuse(l12, l22, preserve_unit_iters=True)
sch.bind(loop=l33, thread_axis="vthread.x")
l34 = sch.fuse(l13, l23, preserve_unit_iters=True)
sch.bind(loop=l34, thread_axis="threadIdx.x")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=1)
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True, index=-1)
b36 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b0])
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True, index=-1)
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36)
l43 = sch.fuse(l41, l42, preserve_unit_iters=True)
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4, 8, 16], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666], decision=0)
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44)
b45 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b0])
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True, index=-1)
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45)
l52 = sch.fuse(l50, l51, preserve_unit_iters=True)
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4, 8, 16], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666], decision=1)
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53)
sch.reverse_compute_inline(block=b1)
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0)
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54)
2025-09-05 14:24:16 [INFO] [evolutionary_search.cc:713] Generating candidates......
2025-09-05 14:24:16 [INFO] [evolutionary_search.cc:715] Picked top 0 candidate(s) from database
2025-09-05 14:24:19 [INFO] [evolutionary_search.cc:533] Sample-Init-Population summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 325 failure(s)
2025-09-05 14:24:19 [INFO] [evolutionary_search.cc:723] Sampled 187 candidate(s)
2025-09-05 14:24:22 [INFO] [evolutionary_search.cc:621] Evolve iter #0 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 32 failure(s)
2025-09-05 14:24:24 [INFO] [evolutionary_search.cc:621] Evolve iter #1 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 38 failure(s)
2025-09-05 14:24:27 [INFO] [evolutionary_search.cc:621] Evolve iter #2 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 36 failure(s)
2025-09-05 14:24:30 [INFO] [evolutionary_search.cc:621] Evolve iter #3 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 35 failure(s)
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:649] Scores of the best 64 candidates:
[1 : 16]: 1.0000 0.9999 0.9995 0.9986 0.9972 0.9956 0.9953 0.9949 0.9945 0.9941 0.9927 0.9916 0.9910 0.9902 0.9899 0.9897
[17 : 32]: 0.9892 0.9892 0.9883 0.9881 0.9873 0.9857 0.9848 0.9846 0.9834 0.9820 0.9800 0.9798 0.9798 0.9770 0.9737 0.9729
[33 : 48]: 0.9704 0.9698 0.9681 0.9673 0.9665 0.9665 0.9663 0.9656 0.9644 0.9642 0.9636 0.9633 0.9629 0.9629 0.9621 0.9618
[49 : 64]: 0.9617 0.9611 0.9602 0.9587 0.9576 0.9574 0.9572 0.9565 0.9552 0.9541 0.9535 0.9531 0.9526 0.9519 0.9515 0.9510
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:727] Got 64 candidate(s) with evolutionary search
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:730] Sending 64 candidates(s) for measurement
2025-09-05 14:25:05 [INFO] [task_scheduler.cc:121] [Task #0: main] Trial #1: Error in building:
LocalBuilder: An exception occurred
Traceback (most recent call last):
  File "C:\Users\myuname\work\tvm\python\tvm\exec\popen_worker.py", line 87, in main
    result = fn(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 165, in <lambda>
    lambda x: _worker_func(*x),
              ^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 231, in _worker_func
    rt_mod: Module = f_build(mod, target, _deserialize_params(params))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "tvm/_ffi/_cython/packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/packed_func.pxi", line 270, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/packed_func.pxi", line 259, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "C:\Users\myuname\work\tvm\python\tvm\_ffi\base.py", line 468, in raise_last_ffi_error
    raise py_err
  File "tvm/_ffi/_cython/packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 261, in default_build
    return tvm_build(mod, target=target)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\driver\build_module.py", line 59, in build
    return tvm.tir.build(mod, target, pipeline)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 186, in build
    return tir_to_runtime(host_mod, device_mod_dict, target_host)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 96, in tir_to_runtime
    device_modules.append(codegen_build(device_mod, target))
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 80, in codegen_build
    return bf(mod, target)
           ^^^^^^^^^^^^^^^
  File "tvm/_ffi/_cython/packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/packed_func.pxi", line 270, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/packed_func.pxi", line 259, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "C:\Users\myuname\work\tvm\python\tvm\_ffi\base.py", line 468, in raise_last_ffi_error
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
%v16char = OpTypeVector %char 16
  File "C:\Users\myuname\work\tvm\src\target\spirv\spirv_utils.cc", line 102
InternalError: Check failed: res == SPV_SUCCESS (-14 vs. 0) : index=102 error:Having 16 components for TypeVector requires the Vector16 capability
The tuner fails to produce legal GPU code every time. In this run the build trips the SPIR-V Vector16 capability check shown above; in other runs it fails with "tvm.error.InternalError: Check failed: (me->coeff == 0 || info.factor() % me->coeff == 0) is false:".
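In case it helps with reproduction: as far as I understand, the trace printed after the design space is plain Python against a tir.Schedule, so replaying it and then building should hit the same class of error without re-running the search. A sketch follows (Module and target are from the setup at the top of the post; I have not verified whether the meta_schedule postprocessors also need to be applied on top of the trace):

import tvm
from tvm import tir

# Replay the design-space trace on a fresh schedule of the workload.
sch = tir.Schedule(Module)
# b0 = sch.get_block(name="B_update", func_name="main")
# b1 = sch.get_block(name="Y_update", func_name="main")
# ... paste the remaining trace statements exactly as printed above,
#     optionally substituting the sampled decisions of a failing candidate ...

# Build the scheduled module the same way the LocalBuilder does.
lib = tvm.build(sch.mod, target=target)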
My build options are as follows:
USE_CUDA OFF
USE_NCCL OFF
USE_MSCCL OFF
USE_OPENCL OFF
USE_OPENCL_ENABLE_HOST_PTR OFF
USE_OPENCL_GTEST /path/to/opencl/gtest
USE_VULKAN ON
USE_KHRONOS_SPIRV OFF
USE_SPIRV_KHR_INTEGER_DOT_PRODUCT OFF
USE_METAL OFF
USE_ROCM OFF
USE_RCCL OFF
ROCM_PATH /opt/rocm
USE_HEXAGON OFF
USE_HEXAGON_SDK /path/to/sdk
USE_HEXAGON_RPC OFF
USE_HEXAGON_GTEST /path/to/hexagon/gtest
USE_HEXAGON_EXTERNAL_LIBS OFF
USE_RPC ON
USE_THREADS ON
USE_LLVM llvm-config --ignore-libllvm --link-static
USE_MLIR OFF
USE_STACKVM_RUNTIME OFF
USE_OPENMP none
TVM_DEBUG_WITH_ABI_CHANGE OFF
TVM_LOG_BEFORE_THROW OFF
USE_RTTI ON
USE_MSVC_MT OFF
INSTALL_DEV OFF
HIDE_PRIVATE_SYMBOLS ON
USE_FALLBACK_STL_MAP OFF
INDEX_DEFAULT_I64 ON
USE_LIBBACKTRACE AUTO
BACKTRACE_ON_SEGFAULT OFF
BUILD_STATIC_RUNTIME OFF
BUILD_DUMMY_LIBTVM OFF
USE_PAPI OFF
USE_GTEST AUTO
USE_CUSTOM_LOGGING OFF
USE_ALTERNATIVE_LINKER AUTO
USE_CCACHE AUTO
DLPACK_PATH 3rdparty/dlpack/include
DMLC_PATH 3rdparty/dmlc-core/include
RANG_PATH 3rdparty/rang/include
COMPILER_RT_PATH 3rdparty/compiler-rt
PICOJSON_PATH 3rdparty/picojson
USE_BYODT_POSIT OFF
USE_BLAS none
USE_AMX OFF
USE_MKL OFF
USE_DNNL OFF
USE_CUDNN OFF
USE_CUBLAS OFF
USE_NVTX OFF
USE_CUTLASS OFF
USE_THRUST OFF
USE_CURAND OFF
USE_MIOPEN OFF
USE_ROCBLAS OFF
USE_HIPBLAS OFF
USE_SORT ON
USE_NNPACK OFF
USE_LIBTORCH OFF
USE_RANDOM ON
USE_CPP_RPC OFF
USE_IOS_RPC OFF
USE_TFLITE OFF
USE_TENSORFLOW_PATH none
USE_COREML OFF
USE_BNNS OFF
USE_ARM_COMPUTE_LIB OFF
USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF
USE_TENSORRT_CODEGEN OFF
USE_TENSORRT_RUNTIME OFF
USE_NNAPI_CODEGEN OFF
USE_NNAPI_RUNTIME OFF
USE_RUST_EXT OFF
SUMMARIZE OFF
USE_CLML OFF
USE_CLML_GRAPH_EXECUTOR OFF
USE_UMA OFF
USE_MSC OFF
USE_MRVL OFF
USE_NVSHMEM OFF
Some other information: I also tested on TVM v0.21.0, but the result is the same. My Vulkan SDK version is 1.4.321.1, and I also tried 1.3.243.0, with no luck either.
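One last question, in case it points somewhere useful: could this be the target on the Windows side lacking accurate device-capability information? Would something along these lines be the expected way to make the tuner and the SPIR-V codegen respect the device's actual limits? (I may be misunderstanding what -from_device covers.)

# Sketch: derive target attributes (max_num_threads, supported extensions,
# and so on) from the active Vulkan device instead of TVM's built-in defaults.
# Whether this would also rule out the 16-lane vectors that trip the
# Vector16 check above is exactly what I'm unsure about.
target = tvm.target.Target("vulkan -from_device=0")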