Hi, I am working on a C++ project that involves overlapping data transfer and computation on GPUs. With the CUDA runtime API I can do it like below:
// Breadth-first issue order: enqueue every H2D copy first, then every kernel,
// then every D2H copy, so work queued in different streams can overlap.
for (int s = 0; s < nStreams; ++s) {
  const int chunkStart = s * streamSize;  // element offset of this stream's chunk
  checkCuda(cudaMemcpyAsync(&d_a[chunkStart], &a[chunkStart], streamBytes,
                            cudaMemcpyHostToDevice, stream[s]));
}
// One kernel launch per stream over its own chunk.
// NOTE(review): assumes streamSize is a multiple of blockSize — TODO confirm.
for (int s = 0; s < nStreams; ++s) {
  const int chunkStart = s * streamSize;
  kernel<<<streamSize / blockSize, blockSize, 0, stream[s]>>>(d_a, chunkStart);
}
// Copy each chunk's results back; stream ordering runs it after that
// stream's kernel completes.
for (int s = 0; s < nStreams; ++s) {
  const int chunkStart = s * streamSize;
  checkCuda(cudaMemcpyAsync(&a[chunkStart], &d_a[chunkStart], streamBytes,
                            cudaMemcpyDeviceToHost, stream[s]));
}
But I haven’t found how to set a `TVMStreamHandle`
for these multiple stream instances yet. Currently, all of them run on the default CUDA stream. Is it possible to set multiple streams to achieve asynchronous and parallel inference in TVM?
I would appreciate any explanation of this, or any suggestions about this issue. Thanks in advance!