This is a minimal test case that reproduces the issue:
import tvm
def ir(in_buf, out_buf):
    """Build a TIR statement that copies ``in_buf`` into ``out_buf``
    element-wise on the GPU, then emits a global (grid-wide) barrier.

    Parameters
    ----------
    in_buf, out_buf : tvm.Buffer
        Input and output buffers of equal 1-D shape. Assumes
        ``in_buf.shape[0]`` is an exact multiple of 256 -- TODO confirm
        with callers (the repro below uses 1024).

    Returns
    -------
    The statement produced by ``ib.get()``.
    """
    ib = tvm.ir_builder.create()
    # Typed element pointers into the two buffers.
    p_in = ib.buffer_ptr(in_buf)
    p_out = ib.buffer_ptr(out_buf)
    nthreads = 256
    # One block per 256 elements; '//' assumes the shape divides evenly.
    nblocks = in_buf.shape[0] // nthreads
    bx = tvm.thread_axis("blockIdx.x")
    tx = tvm.thread_axis("threadIdx.x")
    # Bind launch extents. NOTE(review): tx is bound before bx here; the
    # order of these scope_attr calls shapes the lowered kernel -- confirm
    # it matches the intended launch configuration.
    ib.scope_attr(tx, "thread_extent", nthreads)
    ib.scope_attr(bx, "thread_extent", nblocks)
    # Initialize the global-barrier bookkeeping via the kinit intrinsic.
    ib.emit(tvm.make.Call(None, 'tvm_global_barrier_kinit', None, tvm.expr.Call.Intrinsic, None, 0))
    # Flat element index: each thread handles exactly one element.
    i = bx * nthreads + tx
    p_out[i] = p_in[i]
    # Request a 'global' storage sync, i.e. a barrier across all nblocks
    # blocks (this is the operation whose lowering the report questions).
    ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                          tvm.convert(['global', True, nblocks]),
                          tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
# Repro driver: wrap the IR above in an extern op over a 1024-element
# placeholder, build it for the 'cuda' target, and print the generated
# CUDA source of the first imported module.
a = tvm.placeholder((1024,))
b = tvm.extern([a.shape], [a], lambda ins, outs: ir(ins[0], outs[0]))
s = tvm.create_schedule(b.op)
print(tvm.build(s, [a, b], 'cuda').imported_modules[0].get_source())
which produces the following generated CUDA source:
// --- Compiler-generated CUDA quoted from tvm.build output (do not hand-edit) ---
// Global counter shared by all blocks, used to implement the grid-wide barrier.
extern "C" __device__ unsigned __tvm_global_barrier_state;
extern "C" __global__ void default_function_kernel0( float* __restrict__ extern1, float* __restrict__ placeholder) {
// Per-block expected value for the barrier spin-wait below.
__shared__ unsigned __barrier_expect;
if (threadIdx.x == 0) {
__barrier_expect = 0;
}
// Element-wise copy: extern1[i] = placeholder[i], i = blockIdx.x*256 + threadIdx.x.
extern1[(((int)threadIdx.x) + (((int)blockIdx.x) * 256))] = placeholder[(((int)threadIdx.x) + (((int)blockIdx.x) * 256))];
__threadfence_system();
// NOTE(review): the guard is the constant true, so EVERY thread executes
// the atomicAdd on the global counter, yet __barrier_expect only advances
// by 4 (= nblocks) per barrier -- these look inconsistent, presumably the
// behavior this report is about; verify against the intended lowering of
// tvm_storage_sync('global', ...).
if ((bool)1) {
atomicAdd(&__tvm_global_barrier_state, 1);
volatile unsigned* pf = &__tvm_global_barrier_state;
__barrier_expect += 4;
while (pf[0] < __barrier_expect);
}
__syncthreads();
}