How to use compute_at with opencl

Trying to implement a correct (i.e. unlimited filter size) bilinear filter:

rows = tvm.var("rows")
cols = tvm.var("cols")
chans = tvm.var("chans")
input_vec = tvm.placeholder((rows,cols,chans), dtype="float32")
kernel = tvm.compute((cols,chans)
                     , lambda c, cc: 1.0 * c * cc
                     , name="kern_vec")
result = tvm.compute((rows,cols,chans)
                     , lambda y, x, c: input_vec[y,x,c] * kernel[x,c]
                     , name="answer")

sched = tvm.create_schedule(result.op)
result_stage = sched[result]
kernel_stage = sched[kernel]
kernel_stage.compute_at(result_stage, result.op.axis[1])
result_stage.bind(result.op.axis[0], tvm.thread_axis("blockIdx.x"))
result_stage.bind(result.op.axis[1], tvm.thread_axis("threadIdx.x"))
fun = tvm.build(sched, [input_vec], "opencl", name="test_compute_at")

I get:

Traceback (most recent call last):
  File "compute_at_gpu.py", line 21, in <module>
    fun = tvm.build(sched, [input_vec], "opencl", name="test_compute_at")
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/build_module.py", line 519, in build
    mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/codegen.py", line 20, in build_module
    return _Build(lowered_func, target)
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 185, in __call__
    ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/_ffi/base.py", line 66, in check_call
    raise TVMError(py_str(_LIB.TVMGetLastError()))
tvm._ffi.base.TVMError: [12:35:35] /home/chrisn/dev/tvm-clj/tvm/src/codegen/codegen_c.cc:725: Check failed: constant_size > 0 (0 vs. 0) Can only handle constant size stack allocation for now

How can I refactor the code to allow the kernel to be calculated specifically at that point?

Not sure, but I think these are fishy.

Variables corresponding to shape dimensions should have concrete integer values, rather than opaque tvm.var handles.

rows = tvm.var("rows")
cols = tvm.var("cols")
chans = tvm.var("chans")

And the line

fun = tvm.build(sched, [input_vec], "opencl", name="test_compute_at")

should be

fun = tvm.build(sched, [input_vec, result], "opencl", name="test_compute_at")

Nothing is fishy :-)! TVM will infer those variables because they are used in the dimensions of the input; they aren’t the problem.

I am looking for this transformation:

(before compute_at)

// attr [kern_vec] storage_scope = "global"
allocate kern_vec[float32 * cols * chans]
produce kern_vec {
  for (c, 0, cols) {
    for (cc, 0, chans) {
      kern_vec[((c*chans) + cc)] = (float32(c)*float32(cc))
    }
  }
}
produce answer {
  for (y, 0, rows) {
    for (x, 0, cols) {
      for (c, 0, chans) {
        answer[((((y*cols) + x)*chans) + c)] = (placeholder[((((y*cols) + x)*chans) + c)]*kern_vec[((x*chans) + c)])
      }
    }
  }
}

(after compute_at)

// attr [kern_vec] storage_scope = "global"
allocate kern_vec[float32 * 1 * chans]
produce answer {
  for (y, 0, rows) {
    for (x, 0, cols) {
      produce kern_vec {
        for (cc, 0, chans) {
          kern_vec[cc] = (float32(x)*float32(cc))
        }
      }
      for (c, 0, chans) {
        answer[((((y*cols) + x)*chans) + c)] = (placeholder[((((y*cols) + x)*chans) + c)]*kern_vec[c])
      }
    }
  }
}

Note that this means you are launching fewer device kernels.

Here is a fixed example:

import tvm


def print_schedule(sched, arglist):
    print(tvm.lower(sched, arglist, simple_mode=True))


rows = tvm.var("rows")
cols = tvm.var("cols")
chans = tvm.var("chans")

input_vec = tvm.placeholder((rows,cols,chans), dtype="float32")
kernel = tvm.compute((cols,chans)
                     , lambda c, cc: 1.0 * c * cc
                     , name="kern_vec")

result = tvm.compute((rows,cols,chans)
                     , lambda y, x, c: input_vec[y,x,c] * kernel[x,c]
                     , name="answer")

sched = tvm.create_schedule(result.op)
result_stage = sched[result]
kernel_stage = sched[kernel]

arglist=[input_vec,result]

print_schedule(sched, arglist)

kernel_stage.compute_at(result_stage, result.op.axis[1])

print_schedule(sched, arglist)

result_stage.bind(result.op.axis[0], tvm.thread_axis("blockIdx.x"))
result_stage.bind(result.op.axis[1], tvm.thread_axis("threadIdx.x"))

fun = tvm.build(sched, arglist, "opencl", name="test_compute_at")

For the real use case, please see here:

It works and produces correct output (I have Clojure bindings for TVM).

So again, I can’t use compute_at with opencl in either Clojure or Python, specifically due to:

tvm._ffi.base.TVMError: [18:04:40] /home/chrisn/dev/tvm-clj/tvm/src/codegen/codegen_c.cc:725: Check failed: constant_size > 0 (0 vs. 0) Can only handle constant size stack allocation for now

This works fine on cpu and fails for opencl. I haven’t tested cuda.

Can you try replacing rows, cols, and chans with integers?

The error message you got is saying it cannot allocate a non-constant-size local array in the opencl kernel corresponding to “result”. If you do compute_at on kern_vec, kern_vec needs to be statically allocated inside the “answer” kernel. So for TVM to do correct bounds inference, you need to specify concrete shape dimensions during codegen.

For cpu there is no such restriction. I think that is the reason your script works on cpu.
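
For example (just to illustrate the suggestion; 480, 640 and 3 are placeholder values I picked), replacing

rows = tvm.var("rows")
cols = tvm.var("cols")
chans = tvm.var("chans")

with

rows = 480
cols = 640
chans = 3

and leaving the rest of the script unchanged gives kern_vec a constant-size allocation, so the constant_size check in codegen_c.cc should no longer fire.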

Hmm. If that is the case then I cannot use this type of code for images with arbitrary sizes. The bounds of the temporaries are constant throughout the kernel launches; I assumed that if the size was small enough it would work.

Hopping on a flight, will try a few things.

Yes, your kernels need to be compiled for each input shape. This is a limitation of TVM; Halide doesn’t have such a limitation.
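
If compiling per shape is acceptable in practice, one way to live with this (just a sketch of a caching pattern, nothing TVM-specific; module_for is a made-up helper name) is to build and memoize one module per concrete shape:

import tvm

# Sketch: compile one module per concrete (rows, cols, chans) and cache it.
_modules = {}

def module_for(rows, cols, chans):
    key = (rows, cols, chans)
    if key not in _modules:
        inp = tvm.placeholder(key, dtype="float32", name="input_vec")
        kern = tvm.compute((cols, chans), lambda c, cc: 1.0 * c * cc, name="kern_vec")
        res = tvm.compute(key,
                          lambda y, x, c: inp[y, x, c] * kern[x, c],
                          name="answer")
        s = tvm.create_schedule(res.op)
        s[kern].compute_at(s[res], res.op.axis[1])
        s[res].bind(res.op.axis[0], tvm.thread_axis("blockIdx.x"))
        s[res].bind(res.op.axis[1], tvm.thread_axis("threadIdx.x"))
        # With concrete shapes the kern_vec allocation has a constant size,
        # so the opencl codegen restriction does not trigger.
        _modules[key] = tvm.build(s, [inp, res], "opencl", name="test_compute_at")
    return _modules[key]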

This appears to affect all forms of device shared memory; I got a similar error trying to set up cache_write.

A possible workaround would be to provide an upper bound:

import tvm


def print_schedule(sched, arglist):
    print(tvm.lower(sched, arglist, simple_mode=True))

    
rows = tvm.var("rows")
cols = tvm.var("cols")
max_chans = tvm.const(5)
chans = tvm.var("chans")

input_vec = tvm.placeholder((rows,cols,chans), dtype="float32")
kernel = tvm.compute((cols,chans)
                     , lambda c, cc: 1.0 * c * cc
                     , name="kern_vec")

result = tvm.compute((rows,cols,chans)
                     , lambda y, x, c: input_vec[y,x,c] * kernel[x, tvm.min(max_chans, tvm.max(0, c))]
                     , name="answer")

sched = tvm.create_schedule(result.op)
result_stage = sched[result]
kernel_stage = sched[kernel]

arglist=[input_vec,result]

kernel_stage.compute_at(result_stage, result.op.axis[0])

print_schedule(sched, arglist)

result_stage.bind(result.op.axis[0], tvm.thread_axis("blockIdx.x"))
result_stage.bind(result.op.axis[1], tvm.thread_axis("threadIdx.x"))

fun = tvm.build(sched, arglist, "opencl", name="test_compute_at")

// attr [kern_vec] storage_scope = "global"
allocate kern_vec[float32 * cols * (min(max((chans + -1), 0), 5) + 1)]
produce answer {
  for (y, 0, rows) {
    produce kern_vec {
      for (c, 0, cols) {
        for (cc, 0, (min(max((chans + -1), 0), 5) + 1)) {
          if (likely((cc < chans))) {
            kern_vec[((c*(min(max((chans + -1), 0), 5) + 1)) + cc)] = (float32(c)*float32(cc))
          }
        }
      }
    }
    for (x, 0, cols) {
      for (c, 0, chans) {
        answer[((((y*cols) + x)*chans) + c)] = (placeholder[((((y*cols) + x)*chans) + c)]*kern_vec[(min(max(c, 0), 5) + (x*(min(max((chans + -1), 0), 5) + 1)))])
      }
    }
  }
}

This produces the same error:

tvm._ffi.base.TVMError: [11:29:01] /home/chrisn/dev/tech/tvm-clj/tvm/src/codegen/codegen_c.cc:725: Check failed: constant_size > 0 (0 vs. 0) Can only handle constant size stack allocation for now
  1. Is there another way to communicate this bound? chans is bounded from 0-5, so the error in this case is clearly spurious.

  2. Why is this limitation there, out of curiosity?

Thanks for the pointer. I think TVM has far, far more promise in the long run, but I can look into building nice bindings for Halide. I would request that this restriction be lifted for the backends that support it, and I am willing to do some of the work, but I would need a few pointers to get it done. At the least, it would be ideal if the workaround were effective.

Thanks for your help so far, btw.

Determining shared memory allocation size at runtime is technically possible, but I don’t know if TVM supports this (probably not). For local memory, the array size needs to be constant (you can’t do malloc inside gpu kernels). This is not a TVM limitation but one of the gpu architecture / programming model.

  1. In

kernel[x, tvm.min(max_chans, tvm.max(0, c))]

you are still indexing kernel with x, whose bound TVM doesn’t know, so TVM cannot determine the maximum size for kernel. And I don’t think TVM can do bounds inference involving min and max expressions.

  2. I believe that is for performance reasons. If TVM knows the shapes of the input and output at compile time, it can generate better-optimized code: doing most of the indexing math at compile time, automatically determining minimum shared and local memory sizes, etc.

You may be able to use tvm.decl_buffer to achieve what you want (fixed local or shared memory size with arbitrary input shape). I’ve never used this API so I can’t help on this.
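
For reference, the invocation looks roughly like this (untested; whether binding explicit buffers actually avoids the constant-size check for the internal kern_vec allocation is something I don’t know):

# Hypothetical sketch: declare explicit buffers for the I/O tensors and
# pass them to tvm.build via binds. Whether this helps here is unverified.
input_buf = tvm.decl_buffer((rows, cols, chans), dtype="float32", name="input_buf")
result_buf = tvm.decl_buffer((rows, cols, chans), dtype="float32", name="result_buf")
fun = tvm.build(sched, arglist, "opencl", name="test_compute_at",
                binds={input_vec: input_buf, result: result_buf})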

Good pointer.

Bounds inference involving min,max would allow me to work around this. I can get rid of the x dimension with compute_at.

Thanks for the pointer to decl_buffer. I will look it up.

Honestly, it is still very fast. I doubt I can get full occupancy of the GPU, but on my desktop (with a 1070) I am getting a 5x speedup over the cpu version. Getting full occupancy would take a lot more work, and this is just for a series of blog posts.

It is unclear to me how to request additions or updates to tvm. Should I just attempt them and see what happens? What if I wanted to add min, max evaluation to bounds inference?

You can open a new thread on this forum to discuss your request. In the case of bounds inference with min or max, TVM may in fact already be capable of it, so you should try that first.

It is not:

import tvm


def print_schedule(sched, arglist):
    print(tvm.lower(sched, arglist, simple_mode=True))


rows = tvm.var("rows")
cols = tvm.var("cols")
max_chans = tvm.const(5)
chans = tvm.var("chans")

input_vec = tvm.placeholder((rows,cols,chans), dtype="float32")
kernel = tvm.compute((cols,chans)
                     , lambda c, cc: 1.0 * c * cc
                     , name="kern_vec")

result = tvm.compute((rows,cols,chans)
                     , lambda y, x, c: input_vec[y,x,c] * kernel[x, tvm.min(max_chans, tvm.max(0, c))]
                     , name="answer")

sched = tvm.create_schedule(result.op)
result_stage = sched[result]
kernel_stage = sched[kernel]

arglist=[input_vec,result]

kernel_stage.compute_at(result_stage, result.op.axis[1])

print_schedule(sched, arglist)

result_stage.bind(result.op.axis[0], tvm.thread_axis("blockIdx.x"))
result_stage.bind(result.op.axis[1], tvm.thread_axis("threadIdx.x"))


fun = tvm.build(sched, arglist, "opencl", name="test_compute_at")


chrisn@chrisn-dt:~/dev/tech/tvm-clj/python/questions$ python3 compute_at_gpu.py

// attr [kern_vec] storage_scope = "global"
allocate kern_vec[float32 * 1 * (min(max((chans + -1), 0), 5) + 1)]
produce answer {
  for (y, 0, rows) {
    for (x, 0, cols) {
      produce kern_vec {
        for (cc, 0, (min(max((chans + -1), 0), 5) + 1)) {
          if (likely((cc < chans))) {
            kern_vec[cc] = (float32(x)*float32(cc))
          }
        }
      }
      for (c, 0, chans) {
        answer[((((y*cols) + x)*chans) + c)] = (placeholder[((((y*cols) + x)*chans) + c)]*kern_vec[min(max(c, 0), 5)])
      }
    }
  }
}

[13:19:31] /home/chrisn/dev/tech/tvm-clj/tvm/src/pass/loop_partition.cc:351: Cannot prove: (((((0 + (1 + min(max((chans - 1), 0), 5))) - 1) - ((chans - 1) + 1)) + 1) >= 0), when generating the post doubt loop
Traceback (most recent call last):
  File "compute_at_gpu.py", line 37, in <module>
    fun = tvm.build(sched, arglist, "opencl", name="test_compute_at")
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/build_module.py", line 519, in build
    mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/codegen.py", line 20, in build_module
    return _Build(lowered_func, target)
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 185, in __call__
    ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
  File "/home/chrisn/.local/lib/python3.6/site-packages/tvm-0.5.dev0-py3.6-linux-x86_64.egg/tvm/_ffi/base.py", line 66, in check_call
    raise TVMError(py_str(_LIB.TVMGetLastError()))
tvm._ffi.base.TVMError: [13:19:31] /home/chrisn/dev/tech/tvm-clj/tvm/src/codegen/codegen_c.cc:725: Check failed: constant_size > 0 (0 vs. 0) Can only handle constant size stack allocation for now

The workaround would be to engineer the calling code to only produce fixed kernel sizes, which is doable in the actual use case.
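
Concretely, in terms of the simplified example in this thread (just a sketch; only chans becomes a plain Python integer while rows and cols stay symbolic):

rows = tvm.var("rows")
cols = tvm.var("cols")
chans = 3  # fixed at schedule-construction time; rows/cols stay runtime parameters

After compute_at, the kern_vec temporary then only needs chans (= 3) elements inside the “answer” kernel, which is a constant, so the opencl build should succeed while rows and cols remain dynamic. If it does, fun.imported_modules[0].get_source() should show the kern_vec loop inlined inside the single device kernel.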

I am satisfied with where this ended up. Thanks for your help and your patience!