Hey, when generating code for an OpenCL target, is there any way to set the `local_work_size` argument (which is passed to the `clEnqueueNDRangeKernel` function) from the TVM expressions?
When I tried executing this code, it only used one core:
"""Element-wise add of two (n, 2) tensors, scheduled with 2D block/thread
bindings and executed on the "rsim" device.

NOTE(review): the surrounding question asks about OpenCL, but the build
target here is "rsim" -- confirm this custom target is intended.
"""
import tvm
from tvm import te
import numpy as np

# Symbolic first dimension; the second dimension is fixed at 2.
n = te.var("n")
A = te.placeholder((n, 2), name="A")
B = te.placeholder((n, 2), name="B")
C = te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")

s = te.create_schedule(C.op)
x, y = s[C].op.axis

# Split each axis twice: outermost pieces are bound to block indices,
# the middle pieces to thread indices, and the innermost stay sequential.
xo, x1 = s[C].split(x, factor=2)
x1, x2 = s[C].split(x1, factor=4)
yo, y1 = s[C].split(y, factor=2)
y1, y2 = s[C].split(y1, factor=4)

s[C].reorder(xo, yo, x1, y1, x2, y2)
s[C].bind(xo, te.thread_axis("blockIdx.x"))
s[C].bind(yo, te.thread_axis("blockIdx.y"))
s[C].bind(x1, te.thread_axis("threadIdx.x"))
s[C].bind(y1, te.thread_axis("threadIdx.y"))

# NOTE(review): the y axis has extent 2, so split(y, factor=2) leaves
# blockIdx.y with extent 1 and threadIdx.y with extent at most 2 --
# this tiny launch configuration is presumably why only one core is
# used; verify against the lowered IR printed below.
fadd_cl = tvm.build(s, [A, B, C], target="rsim", name="myadd")
print(tvm.lower(s, [A, B, C], simple_mode=True))

ctx = tvm.context("rsim")

# Concrete size for the runtime arrays (rebinds `n`, shadowing the
# symbolic te.var defined above).
n = 16
a = tvm.nd.array(np.random.uniform(size=(n, 2)).astype(np.float32), ctx)
b = tvm.nd.array(np.random.uniform(size=(n, 2)).astype(np.float32), ctx)
c = tvm.nd.array(np.zeros((n, 2), dtype=C.dtype), ctx)
fadd_cl(a, b, c)

# Check the device result against a NumPy reference.
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
print(c)
print(tvm.lower(s, [A, B, C], simple_mode=True))
But when I modified it to add two vectors using only a single blockIdx and threadIdx, it used more cores.