How to use tensorize

The script above failed with

tvm._ffi.base.TVMError: [17:13:55] /Users/yizhiliu/Workspace/tvm/src/op/tensorize.cc:318: Check failed: Equal(lhs, rhs) Failed to match the compute with TensorIntrin Matmul's declaration  provided= reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[(placeholder(j, k)*placeholder(0, k))], axis=[iter_var(k, Range(min=0, extent=128))], where=(uint1)1, value_index=0), intrin=  reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[(placeholder(i, k)*placeholder(j, k))], axis=[iter_var(k, Range(min=0, extent=128))], where=(uint1)1, value_index=0)

Stack trace returned 10 entries:
[bt] (0) 0   libtvm.dylib                        0x0000000106598afb dmlc::StackTrace() + 299
[bt] (1) 1   libtvm.dylib                        0x000000010659888f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2   libtvm.dylib                        0x00000001067b3acc tvm::VerifyTensorizeBody(tvm::ComputeOpNode const*, tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, std::__1::unordered_map<tvm::Tensor, tvm::Array<tvm::Range, void>, std::__1::hash<tvm::Tensor>, std::__1::equal_to<tvm::Tensor>, std::__1::allocator<std::__1::pair<tvm::Tensor const, tvm::Array<tvm::Range, void> > > > const&, tvm::TensorIntrin const&) + 2396
[bt] (3) 3   libtvm.dylib                        0x00000001067b4d70 tvm::MakeTensorize(tvm::ComputeOpNode const*, tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, bool) + 544
[bt] (4) 4   libtvm.dylib                        0x0000000106795e5c tvm::ComputeOpNode::BuildProvide(tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, bool) const + 348
[bt] (5) 5   libtvm.dylib                        0x00000001067f8a5e tvm::schedule::MakePipeline(tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, HalideIR::Internal::Stmt, bool) + 62
[bt] (6) 6   libtvm.dylib                        0x00000001067f9e73 tvm::schedule::ScheduleOps(tvm::Schedule, tvm::Map<tvm::IterVar, tvm::Range, void, void>, bool) + 3027
[bt] (7) 7   libtvm.dylib                        0x00000001065d613c std::__1::__function::__func<tvm::schedule::$_2, std::__1::allocator<tvm::schedule::$_2>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 204
[bt] (8) 8   libtvm.dylib                        0x0000000106972eb8 TVMFuncCall + 72
[bt] (9) 9   libffi.6.dylib                      0x00000001056d9884 ffi_call_unix64 + 76

If I change https://gist.github.com/yzhliu/7a694a99be7f11ebfbf2d634111b6175#file-conv2d_tensorize-py-L33-L34 to

out = tvm.compute((16,),
                  lambda i: tvm.sum(inp(i, k) * wgt(0, k), axis=[k]))

it fails with

tvm._ffi.base.TVMError: [17:21:16] /Users/yizhiliu/Workspace/tvm/src/pass/arg_binder.cc:93: Check failed: is_zero(value->elem_offset) Trying to bind a Buffer with offset into one without offset

Stack trace returned 10 entries:
[bt] (0) 0   libtvm.dylib                        0x00000001018acafb dmlc::StackTrace() + 299
[bt] (1) 1   libtvm.dylib                        0x00000001018ac88f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2   libtvm.dylib                        0x00000001019beaca tvm::ir::ArgBinder::BindBuffer(tvm::Buffer const&, tvm::Buffer const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool) + 970
[bt] (3) 3   libtvm.dylib                        0x0000000101a7e958 tvm::ir::StorageFlattener::HandleBufferBindScope(HalideIR::Internal::AttrStmt const*) + 4568
[bt] (4) 4   libtvm.dylib                        0x0000000101a774f4 tvm::ir::StorageFlattener::Mutate_(HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&) + 852
[bt] (5) 5   libtvm.dylib                        0x0000000101a06625 std::__1::__function::__func<tvm::ir::$_1, std::__1::allocator<tvm::ir::$_1>, HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(HalideIR::Internal::AttrStmt const*&&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*&&) + 21
[bt] (6) 6   libtvm.dylib                        0x0000000101a05c04 std::__1::__function::__func<tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>& tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::AttrStmt>(std::__1::function<HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::'lambda'(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*), std::__1::allocator<tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>& tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::AttrStmt>(std::__1::function<HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::'lambda'(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>, HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*&&) + 52
[bt] (7) 7   libtvm.dylib                        0x00000001018f5ea9 tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const + 377
[bt] (8) 8   libtvm.dylib                        0x00000001019c5bb6 tvm::ir::IRMutator::Mutate(HalideIR::Internal::Stmt) + 102
[bt] (9) 9   libtvm.dylib                        0x00000001019f8c2b tvm::ir::IRMutator::Mutate_(HalideIR::Internal::For const*, HalideIR::Internal::Stmt const&) + 235

I am really confused about how to make the tensorize declaration match the original compute loop. Do we have any tutorials for tensorization?


There are two computes involved here: the original compute and its clone inside the intrinsic declaration. The workflow has four steps (see the sketch after this list):
1. Describe the compute logic; we can call this the original compute.
2. Apply schedule primitives (e.g. split) to expose the axis you want to tensorize, and mark it with the tensorize API.
3. Describe the intrinsic pattern, which includes two parts: one is a clone of the original compute, the other is the intrinsic implementation you want to use.
4. After that, in ScheduleOps, tensorize will try to pattern-match the original compute against the clone compute; if it succeeds, it replaces the marked axis with the intrinsic.
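
Putting the four steps together, here is a minimal sketch against the old TVM 0.x API, loosely following the official tensorize tutorial. The gemv_update packed function is hypothetical and would need a real implementation to actually run; tvm.lower alone does not call it.

import tvm

# Step 1: the original compute, a 16x16 matmul with B accessed as B[j, k].
N, L = 16, 128
A = tvm.placeholder((N, L), name='A')
B = tvm.placeholder((N, L), name='B')
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, N), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')

# Step 3: the intrinsic declaration, i.e. a clone of one row of the compute
# above plus the lowered implementation that replaces it.
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    kk = tvm.reduce_axis((0, l), name='kk')
    c = tvm.compute((m,), lambda i: tvm.sum(a[kk] * b[i, kk], axis=kk), name='c')
    # offset_factor=1 lets these buffers carry a non-zero elem_offset.
    Ab = tvm.decl_buffer(a.shape, a.dtype, name='Ab', offset_factor=1, strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype, name='Bb', offset_factor=1,
                         strides=[tvm.var('s1'), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype, name='Cb', offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        # gemv_update is a hypothetical packed function registered elsewhere.
        ib.emit(tvm.call_packed('gemv_update', cc.access_ptr('w'),
                                aa.access_ptr('r'), bb.access_ptr('r'),
                                m, l, bb.strides[0]))
        return ib.get()

    return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})

# Step 2: mark the axis to replace; here the intrinsic covers all of j and k.
s = tvm.create_schedule(C.op)
i, j = C.op.axis
s[C].tensorize(j, intrin_gemv(N, L))

# Step 4: ScheduleOps pattern-matches during lowering; a mismatch between the
# clone and the original compute raises the "Failed to match ..." error above.
print(tvm.lower(s, [A, B, C], simple_mode=True))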

Thanks @xqdan
In my example, I think I missed wrapping the intrinsic declaration in with tvm.build_config(offset_factor=1): before https://gist.github.com/yzhliu/7a694a99be7f11ebfbf2d634111b6175#file-conv2d_tensorize-py-L51, which is required since the tensorized buffer offset is not zero. A sketch of the fix is below.
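
To illustrate, a minimal sketch of that fix, assuming the old TVM 0.x API; the row_copy packed function and the trivial compute are stand-ins, not the gist's actual code:

import tvm

a = tvm.placeholder((16,), name='a')
c = tvm.compute((16,), lambda i: a[i], name='c')

def intrin_func(ins, outs):
    ib = tvm.ir_builder.create()
    # row_copy is a hypothetical packed function, for illustration only.
    ib.emit(tvm.call_packed('row_copy', outs[0].access_ptr('w'),
                            ins[0].access_ptr('r')))
    return ib.get()

# Inside this scope, decl_tensor_intrin creates its implicit buffers with
# offset_factor=1, i.e. a symbolic elem_offset, so they can later be bound
# to an intermediate tensor whose offset is not zero.
with tvm.build_config(offset_factor=1):
    intrin = tvm.decl_tensor_intrin(c.op, intrin_func)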

Hi,

I don’t understand what this offset_factor=1 for build_config means… Why is it 1 and not 0?

By default decl_tensor_intrin sets offset_factor=0, which means the elem_offset of the buffer is fixed to zero and effectively ignored. Then once you try to bind it to an intermediate compute tensor, whose elem_offset is not zero, it fails.
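
To make the difference concrete, a small sketch (again assuming the old 0.x decl_buffer API) showing how offset_factor changes the declared buffer's elem_offset:

import tvm

a = tvm.placeholder((16, 128), name='a')

# offset_factor=0 (the default): elem_offset is the constant 0, so binding
# must satisfy the is_zero(value->elem_offset) check in arg_binder.cc.
buf0 = tvm.decl_buffer(a.shape, a.dtype, name='buf0', offset_factor=0)
print(buf0.elem_offset)  # 0

# offset_factor=1: elem_offset becomes a symbolic var, so binding to a
# sub-region that starts at a non-zero offset is allowed.
buf1 = tvm.decl_buffer(a.shape, a.dtype, name='buf1', offset_factor=1)
print(buf1.elem_offset)  # a symbolic var, e.g. buf1_elem_offset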

But honestly I don't know in what case we would need to ignore the offset. @ziheng @tqchen

Ref:

The main reason is that we want the user to give us as much information as possible. No elem offset indicates that the data starts directly at the data pointer, which by default is aligned to 256 bytes according to the DLPack standard. This enables optimizations such as aligned loads. Alternatively, specifying an offset factor would also help with that, but it also requires the underlying code generator to be aware that elem_offset is a multiple of offset_factor, and some parts of the code are not yet aware of that.
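
As an illustration of the two knobs (a sketch against the old 0.x decl_buffer API; the values are examples, not requirements):

import tvm

a = tvm.placeholder((1024,), name='a')

# data_alignment: the data pointer itself is promised to be aligned to this
# many bytes (DLPack defaults to 256), which enables aligned loads.
# offset_factor: elem_offset is promised to be a multiple of this factor,
# preserving alignment even when the buffer starts partway into a tensor.
buf = tvm.decl_buffer(a.shape, a.dtype, name='a_buf',
                      data_alignment=64, offset_factor=16)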

Currently there is a discussion in the note block at https://docs.tvm.ai/api/python/tvm.html#tvm.decl_buffer; please also suggest if there is anything we can do to improve it.

I thought data_alignment is for alignment. So data_alignment is for loading, while offset_factor is for allocation? And I see that for now offset_factor is not "really" used; it only generates several checks that elem_offset % offset_factor == 0?