TVM not generating legal code for Vulkan on Windows

Hi everyone, I’m trying to tune a simple TIR module with `meta_schedule.tune_tir` for Vulkan on both Linux and Windows. Both TVM builds are compiled from source at version 0.20.0. The Linux build tunes successfully, but the Windows build doesn’t seem to generate legal GPU code.
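For reference, here is a minimal sketch of my setup. The TIR module is reconstructed from the design space printed further down (same buffer shapes, dtypes, and block names); `work_dir` and `max_trials_global` are placeholder values, not my exact settings:

```python
import tvm
from tvm import meta_schedule as ms
from tvm.script import ir as I
from tvm.script import tir as T


@I.ir_module
class Module:
    @T.prim_func
    def main(
        X: T.Buffer((1, 8192), "int8"),
        W: T.Buffer((8192, 1024), "int8"),
        B: T.Buffer((1, 1024), "int16"),
        Y: T.Buffer((1, 1024), "int16"),
    ):
        T.func_attr({"tir.const_params": ["B", "W"], "tir.noalias": T.bool(True)})
        Sum = T.alloc_buffer((1, 1024), "int16")
        # int8 x int8 matmul, accumulated as int16
        for i, j, k in T.grid(1, 1024, 8192):
            with T.block("B_update"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    Sum[vi, vj] = T.int16(0)
                Sum[vi, vj] = Sum[vi, vj] + T.Cast("int16", X[vi, vk]) * T.Cast("int16", W[vk, vj])
        # bias add
        for i, j in T.grid(1, 1024):
            with T.block("Y_update"):
                vi, vj = T.axis.remap("SS", [i, j])
                Y[vi, vj] = Sum[vi, vj] + B[vi, vj]


database = ms.tune_tir(
    mod=Module,
    target=tvm.target.Target("vulkan"),
    work_dir="./tune_logs",   # placeholder
    max_trials_global=64,     # placeholder
)
```

On Windows, the tuning log shows errors like this: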

```
2025-09-05 14:24:16 [INFO] [task_scheduler.cc:164] Total 1 design space(s) generated
2025-09-05 14:24:16 [INFO] [task_scheduler.cc:170] Design space #0:
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(X: T.Buffer((1, 8192), "int8"), W: T.Buffer((8192, 1024), "int8"), B: T.Buffer((1, 1024), "int16"), Y: T.Buffer((1, 1024), "int16")):
        T.func_attr({"tir.const_params": ["B", "W"], "tir.noalias": T.bool(True)})
        with T.block("root"):
            T.reads()
            T.writes()
            T.block_attr({"meta_schedule.unroll_explicit": 0})
            sum_local = T.alloc_buffer((1, 1024), "int16", scope="local")
            X_shared = T.alloc_buffer((1, 8192), "int8", scope="shared")
            W_shared = T.alloc_buffer((8192, 1024), "int8", scope="shared")
            for i_0_j_0_fused in T.thread_binding(16, thread="blockIdx.x"):
                for i_1_j_1_fused in T.thread_binding(8, thread="vthread.x"):
                    for i_2_j_2_fused in T.thread_binding(1, thread="threadIdx.x"):
                        for k_0 in range(128):
                            for ax0_ax1_fused in range(64):
                                with T.block("X_shared"):
                                    v0 = T.axis.spatial(1, 0)
                                    v1 = T.axis.spatial(8192, k_0 * 64 + ax0_ax1_fused)
                                    T.reads(X[v0, v1])
                                    T.writes(X_shared[v0, v1])
                                    T.block_attr({"meta_schedule.cooperative_fetch": 1})
                                    X_shared[v0, v1] = X[v0, v1]
                            for ax0_ax1_fused in range(4096):
                                with T.block("W_shared"):
                                    v0 = T.axis.spatial(8192, k_0 * 64 + ax0_ax1_fused // 64)
                                    v1 = T.axis.spatial(1024, i_0_j_0_fused * 64 + ax0_ax1_fused % 64)
                                    T.reads(W[v0, v1])
                                    T.writes(W_shared[v0, v1])
                                    T.block_attr({"meta_schedule.cooperative_fetch": 2})
                                    W_shared[v0, v1] = W[v0, v1]
                            for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(8, 1, 8, 8, 1, 1):
                                with T.block("B_update"):
                                    vi = T.axis.spatial(1, i_3 + i_4)
                                    vj = T.axis.spatial(1024, i_0_j_0_fused * 64 + i_1_j_1_fused * 8 + j_3 + j_4)
                                    vk = T.axis.reduce(8192, k_0 * 64 + k_1 * 8 + k_2)
                                    T.reads(X_shared[vi, vk], W_shared[vk, vj])
                                    T.writes(sum_local[vi, vj])
                                    T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 1, "meta_schedule.tiling_structure": "SSSRRSRS"})
                                    with T.init():
                                        sum_local[vi, vj] = T.int16(0)
                                    sum_local[vi, vj] = sum_local[vi, vj] + T.Cast("int16", X_shared[vi, vk]) * T.Cast("int16", W_shared[vk, vj])
                        for ax0, ax1 in T.grid(1, 8):
                            with T.block("sum_local"):
                                v0 = T.axis.spatial(1, ax0)
                                v1 = T.axis.spatial(1024, i_0_j_0_fused * 64 + i_1_j_1_fused * 8 + ax1)
                                T.reads(sum_local[v0, v1], B[v0, v1])
                                T.writes(Y[v0, v1])
                                Y[v0, v1] = sum_local[v0, v1] + B[v0, v1]
b0 = sch.get_block(name="B_update", func_name="main")
b1 = sch.get_block(name="Y_update", func_name="main")
b2 = sch.get_block(name="root", func_name="main")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
l3, l4, l5 = sch.get_loops(block=b0)
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1])
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10], preserve_unit_iters=True, disable_predication=False)
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[16, 8, 1, 8, 1])
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20], preserve_unit_iters=True, disable_predication=False)
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[128, 8, 8])
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28], preserve_unit_iters=True, disable_predication=False)
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25)
l32 = sch.fuse(l11, l21, preserve_unit_iters=True)
sch.bind(loop=l32, thread_axis="blockIdx.x")
l33 = sch.fuse(l12, l22, preserve_unit_iters=True)
sch.bind(loop=l33, thread_axis="vthread.x")
l34 = sch.fuse(l13, l23, preserve_unit_iters=True)
sch.bind(loop=l34, thread_axis="threadIdx.x")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=1)
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True, index=-1)
b36 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared", consumer_blocks=[b0])
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True, index=-1)
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36)
l43 = sch.fuse(l41, l42, preserve_unit_iters=True)
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4, 8, 16], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666], decision=0)
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44)
b45 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared", consumer_blocks=[b0])
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True, index=-1)
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45)
l52 = sch.fuse(l50, l51, preserve_unit_iters=True)
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4, 8, 16], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666], decision=1)
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53)
sch.reverse_compute_inline(block=b1)
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0)
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54)
2025-09-05 14:24:16 [INFO] [evolutionary_search.cc:713] Generating candidates......
2025-09-05 14:24:16 [INFO] [evolutionary_search.cc:715] Picked top 0 candidate(s) from database
2025-09-05 14:24:19 [INFO] [evolutionary_search.cc:533] Sample-Init-Population summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 325 failure(s)
2025-09-05 14:24:19 [INFO] [evolutionary_search.cc:723] Sampled 187 candidate(s)
2025-09-05 14:24:22 [INFO] [evolutionary_search.cc:621] Evolve iter #0 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 32 failure(s)
2025-09-05 14:24:24 [INFO] [evolutionary_search.cc:621] Evolve iter #1 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 38 failure(s)
2025-09-05 14:24:27 [INFO] [evolutionary_search.cc:621] Evolve iter #2 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 36 failure(s)
2025-09-05 14:24:30 [INFO] [evolutionary_search.cc:621] Evolve iter #3 done. Summary:
Postproc #0 [meta_schedule.DisallowDynamicLoop(0000029E6F0C8BF8)]: 0 failure(s)
Postproc #1 [meta_schedule.RewriteCooperativeFetch(0000029E6F847358)]: 0 failure(s)
Postproc #2 [meta_schedule.RewriteUnboundBlock(0000029E6F847448)]: 0 failure(s)
Postproc #3 [meta_schedule.RewriteParallelVectorizeUnroll(0000029E6F0C8EB8)]: 0 failure(s)
Postproc #4 [meta_schedule.RewriteReductionBlock(0000029E6F0C8998)]: 0 failure(s)
Postproc #5 [meta_schedule.VerifyGPUCode(0000029E6F8C18F8)]: 35 failure(s)
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:649] Scores of the best 64 candidates:
[1 : 16]:	1.0000  0.9999  0.9995  0.9986  0.9972  0.9956  0.9953  0.9949  0.9945  0.9941  0.9927  0.9916  0.9910  0.9902  0.9899  0.9897
[17 : 32]:	0.9892  0.9892  0.9883  0.9881  0.9873  0.9857  0.9848  0.9846  0.9834  0.9820  0.9800  0.9798  0.9798  0.9770  0.9737  0.9729
[33 : 48]:	0.9704  0.9698  0.9681  0.9673  0.9665  0.9665  0.9663  0.9656  0.9644  0.9642  0.9636  0.9633  0.9629  0.9629  0.9621  0.9618
[49 : 64]:	0.9617  0.9611  0.9602  0.9587  0.9576  0.9574  0.9572  0.9565  0.9552  0.9541  0.9535  0.9531  0.9526  0.9519  0.9515  0.9510
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:727] Got 64 candidate(s) with evolutionary search
2025-09-05 14:24:31 [INFO] [evolutionary_search.cc:730] Sending 64 candidates(s) for measurement
2025-09-05 14:25:05 [INFO] [task_scheduler.cc:121] [Task #0: main] Trial #1: Error in building:
LocalBuilder: An exception occurred
Traceback (most recent call last):
  File "C:\Users\myuname\work\tvm\python\tvm\exec\popen_worker.py", line 87, in main
    result = fn(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 165, in <lambda>
    lambda x: _worker_func(*x),
              ^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 231, in _worker_func
    rt_mod: Module = f_build(mod, target, _deserialize_params(params))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "tvm/_ffi/_cython/packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/packed_func.pxi", line 270, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/packed_func.pxi", line 259, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "C:\Users\myuname\work\tvm\python\tvm\_ffi\base.py", line 468, in raise_last_ffi_error
    raise py_err
  File "tvm/_ffi/_cython/packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "C:\Users\myuname\work\tvm\python\tvm\meta_schedule\builder\local_builder.py", line 261, in default_build
    return tvm_build(mod, target=target)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\driver\build_module.py", line 59, in build
    return tvm.tir.build(mod, target, pipeline)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 186, in build
    return tir_to_runtime(host_mod, device_mod_dict, target_host)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 96, in tir_to_runtime
    device_modules.append(codegen_build(device_mod, target))
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuname\work\tvm\python\tvm\tir\build.py", line 80, in codegen_build
    return bf(mod, target)
           ^^^^^^^^^^^^^^^
  File "tvm/_ffi/_cython/packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/packed_func.pxi", line 270, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/packed_func.pxi", line 259, in tvm._ffi._cy3.core.FuncCall3
  File "tvm/_ffi/_cython/base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "C:\Users\myuname\work\tvm\python\tvm\_ffi\base.py", line 468, in raise_last_ffi_error
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  %v16char = OpTypeVector %char 16
  File "C:\Users\myuname\work\tvm\src\target\spirv\spirv_utils.cc", line 102
InternalError: Check failed: res == SPV_SUCCESS (-14 vs. 0) :  index=102 error:Having 16 components for TypeVector requires the Vector16 capability
```

The tuner fails GPU-code verification on most candidates (`VerifyGPUCode` reports hundreds of failures in the initial population), and the candidates that do get sent for measurement then fail in the builder. In the trial above, SPIR-V validation rejects the module because it creates a 16-component vector without declaring the `Vector16` capability. Other trials fail with a different check: `tvm.error.InternalError: Check failed: (me->coeff == 0 || info.factor() % me->coeff == 0) is false:`
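For the `Vector16` error specifically: as far as I understand, plain SPIR-V only allows vectors of 2, 3, or 4 components, and 8- or 16-component vectors require the `Vector16` capability, so the codegen seems to be vectorizing wider than the declared capabilities allow. One thing I'm considering is letting TVM fill in the target attributes from the actual device instead of using a bare `"vulkan"` target; a sketch of what I mean (I haven't confirmed this changes anything):

```python
import tvm

# Ask TVM to populate the Vulkan target attributes (max_num_threads,
# supports_int8, etc.) from the device that is actually present (device 0),
# instead of relying on the generic "vulkan" defaults.
target = tvm.target.Target("vulkan -from_device=0")
print(target)
```

If the default target advertises capabilities my Windows driver doesn't have (or vice versa), that might explain why `VerifyGPUCode` and the SPIR-V validator disagree, but I'm not sure.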

My build options are as follows:

```
USE_CUDA OFF 
USE_NCCL OFF 
USE_MSCCL OFF 
USE_OPENCL OFF 
USE_OPENCL_ENABLE_HOST_PTR OFF 
USE_OPENCL_GTEST /path/to/opencl/gtest 
USE_VULKAN ON 
USE_KHRONOS_SPIRV OFF 
USE_SPIRV_KHR_INTEGER_DOT_PRODUCT OFF 
USE_METAL OFF 
USE_ROCM OFF 
USE_RCCL OFF 
ROCM_PATH /opt/rocm 
USE_HEXAGON OFF 
USE_HEXAGON_SDK /path/to/sdk 
USE_HEXAGON_RPC OFF 
USE_HEXAGON_GTEST /path/to/hexagon/gtest 
USE_HEXAGON_EXTERNAL_LIBS OFF 
USE_RPC ON 
USE_THREADS ON 
USE_LLVM llvm-config --ignore-libllvm --link-static 
USE_MLIR OFF 
USE_STACKVM_RUNTIME OFF 
USE_OPENMP none 
TVM_DEBUG_WITH_ABI_CHANGE OFF 
TVM_LOG_BEFORE_THROW OFF 
USE_RTTI ON 
USE_MSVC_MT OFF 
INSTALL_DEV OFF 
HIDE_PRIVATE_SYMBOLS ON 
USE_FALLBACK_STL_MAP OFF 
INDEX_DEFAULT_I64 ON 
USE_LIBBACKTRACE AUTO 
BACKTRACE_ON_SEGFAULT OFF 
BUILD_STATIC_RUNTIME OFF 
BUILD_DUMMY_LIBTVM OFF 
USE_PAPI OFF 
USE_GTEST AUTO 
USE_CUSTOM_LOGGING OFF 
USE_ALTERNATIVE_LINKER AUTO 
USE_CCACHE AUTO 
DLPACK_PATH 3rdparty/dlpack/include 
DMLC_PATH 3rdparty/dmlc-core/include 
RANG_PATH 3rdparty/rang/include 
COMPILER_RT_PATH 3rdparty/compiler-rt 
PICOJSON_PATH 3rdparty/picojson 
USE_BYODT_POSIT OFF 
USE_BLAS none 
USE_AMX OFF 
USE_MKL OFF 
USE_DNNL OFF 
USE_CUDNN OFF 
USE_CUBLAS OFF 
USE_NVTX OFF 
USE_CUTLASS OFF 
USE_THRUST OFF 
USE_CURAND OFF 
USE_MIOPEN OFF 
USE_ROCBLAS OFF 
USE_HIPBLAS OFF 
USE_SORT ON 
USE_NNPACK OFF 
USE_LIBTORCH OFF 
USE_RANDOM ON 
USE_CPP_RPC OFF 
USE_IOS_RPC OFF 
USE_TFLITE OFF 
USE_TENSORFLOW_PATH none 
USE_COREML OFF 
USE_BNNS OFF 
USE_ARM_COMPUTE_LIB OFF 
USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF 
USE_TENSORRT_CODEGEN OFF 
USE_TENSORRT_RUNTIME OFF 
USE_NNAPI_CODEGEN OFF 
USE_NNAPI_RUNTIME OFF 
USE_RUST_EXT OFF 
SUMMARIZE OFF 
USE_CLML OFF 
USE_CLML_GRAPH_EXECUTOR OFF 
USE_UMA OFF 
USE_MSC OFF 
USE_MRVL OFF 
USE_NVSHMEM OFF
```

Some other information: I tested with TVM v0.21.0 as well, but the result is the same. My Vulkan SDK version is 1.4.321.1; I also tried 1.3.243.0 with no luck either.
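For completeness, this is roughly how I inspect the device through the TVM runtime (a small sketch, assuming device 0):

```python
import tvm

dev = tvm.vulkan(0)
print(dev.exist)        # True if TVM sees a Vulkan device
print(dev.device_name)  # GPU name as reported by the driver
print(dev.api_version)  # Vulkan API version the device exposes
```

Any pointers on what else to check would be appreciated.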