How to control data layout of intermediate values

I think this is a useful trick that is worth a tutorial.

The example below shows how to transpose the layout of an intermediate buffer:

import tvm

# Problem size: every tensor in this example is an n x m matrix.
n, m = 10, 20


def show_lowered(schedule, args):
    """Print the lowered form of ``schedule`` for the given argument tensors."""
    print(tvm.lower(schedule, args, simple_mode=True))


# A -> B -> C is a chain of element-wise copies; B is the intermediate
# value whose buffer layout we want to control.
A = tvm.placeholder((n, m), name="A")
B = tvm.compute((n, m), lambda i, j: A[i][j], name="B")
C = tvm.compute((n, m), lambda i, j: B[i][j], name="C")

# Baseline: default schedule, nothing customised yet.
s = tvm.create_schedule([C.op])
show_lowered(s, [A, C])

print("======================================\n")

# The trick, in three steps (the order matters):
#   1. swap B's two axes so that j becomes the outer one (the transpose),
#   2. add a cache-write stage for B in 'global' scope,
#   3. inline the original B stage.
row_axis, col_axis = s[B].op.axis
s[B].reorder(col_axis, row_axis)
BB = s.cache_write(B, "global")  # result of cache_write; not used further here
s[B].compute_inline()

show_lowered(s, [A, C])

Output (lowered IR before and after applying the schedule changes):

// attr [B] storage_scope = "global"
allocate B[float32 * 10 * 20]
produce B {
  for (i, 0, 10) {
    for (j, 0, 20) {
      B[((i*20) + j)] = A[((i*20) + j)]
    }
  }
}
produce C {
  for (i, 0, 10) {
    for (j, 0, 20) {
      C[((i*20) + j)] = B[((i*20) + j)]
    }
  }
}

======================================

// attr [B.global] storage_scope = "global"
allocate B.global[float32 * 20 * 10]
produce B.global {                     // B.global is transposed
  for (j.c, 0, 20) {
    for (i.c, 0, 10) {
      B.global[((j.c*10) + i.c)] = A[(j.c + (i.c*20))]
    }
  }
}
produce C {
  for (i, 0, 10) {
    for (j, 0, 20) {
      C[((i*20) + j)] = B.global[(i + (j*10))]
    }
  }
}