I think this is a useful trick that is worth a tutorial.
The example below shows how to transpose the layout of an intermediate buffer:
import tvm
# Problem size: every tensor below is an n x m (10 x 20) matrix.
n = 10
m = 20
shape = (n, m)

# Build a chain of element-wise copies A -> B -> C, where B is the
# intermediate buffer whose layout we want to transpose.
A = tvm.placeholder(shape, name='A')
B = tvm.compute(shape, lambda i, j: A[i][j], name='B')
C = tvm.compute(shape, lambda i, j: B[i][j], name='C')

# Baseline: lower the default schedule to see B in its original layout.
s = tvm.create_schedule([C.op])
baseline_ir = tvm.lower(s, [A, C], simple_mode=True)
print(baseline_ir)

separator = "======================================\n"
print(separator)

# Swap B's two loop axes first. As the second lowered listing shows, the
# cache stage created next picks up this (j, i) order, which is what
# transposes the buffer.
row_axis, col_axis = s[B].op.axis
s[B].reorder(col_axis, row_axis)  # transpose

# Route B through a 'global'-scope cache stage, then inline the original B
# stage so that only the cache stage remains between A and C.
BB = s.cache_write(B, 'global')
s[B].compute_inline()

transposed_ir = tvm.lower(s, [A, C], simple_mode=True)
print(transposed_ir)
Output:
// attr [B] storage_scope = "global"
allocate B[float32 * 10 * 20]
produce B {
for (i, 0, 10) {
for (j, 0, 20) {
B[((i*20) + j)] = A[((i*20) + j)]
}
}
}
produce C {
for (i, 0, 10) {
for (j, 0, 20) {
C[((i*20) + j)] = B[((i*20) + j)]
}
}
}
======================================
// attr [B.global] storage_scope = "global"
allocate B.global[float32 * 20 * 10]
produce B.global { // B.global is transposed
for (j.c, 0, 20) {
for (i.c, 0, 10) {
B.global[((j.c*10) + i.c)] = A[(j.c + (i.c*20))]
}
}
}
produce C {
for (i, 0, 10) {
for (j, 0, 20) {
C[((i*20) + j)] = B.global[(i + (j*10))]
}
}
}