import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T
@T.prim_func
def main(A: T.Buffer[(128,), "float32"],
         B: T.Buffer[(128,), "float32"],
         C: T.Buffer[(128,), "float32"]):
    # Elementwise C = A + B over 128 lanes, written as a single spatial block.
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    for i in range(128):
        with T.block("C"):
            vi = T.axis.spatial(128, i)
            C[vi] = A[vi] + B[vi]
# Wrap the prim_func in an IRModule and print its TVMScript form.
mod = tvm.IRModule.from_expr(main)
print(mod["main"].script())
Can I provide an external definition of the kernel:
import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T
def foo(A, B, C):
    """Placeholder wrapper that forwards the three buffers to an external kernel.

    NOTE(review): `my_random_kernel` is not defined in this snippet — presumably
    it is the user's external implementation. The stray `__ ` token on the call
    line was mangled-paste residue and has been removed.
    """
    # Convert A,B,C to numpy.ndarray or something?
    my_random_kernel(A, B, C)
@T.prim_func
def main(A: T.Buffer[(128,), "float32"],
         B: T.Buffer[(128,), "float32"],
         C: T.Buffer[(128,), "float32"]):
    # Attempt to call a plain Python function from inside a prim_func.
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    foo(A, B, C)
import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T
import numpy as np
def foo(A):
    """Overwrite the first element of *A* with 2.0, mutating it in place."""
    A[0] = 2.0
@T.prim_func
def main(a: T.handle):
    # Bind the handle to a 5-element float32 buffer, then try to invoke
    # the Python function "foo" via call_packed.
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    A = T.match_buffer(a, (5,), "float32")
    tvm.tir.call_packed("foo", A)
if __name__ == "__main__":
    # Build the module for CPU and run it on a small input array.
    mod = IRModule.from_expr(main)
    f = tvm.build(mod, target="llvm")
    a = tvm.nd.array(np.array([1, 2, 3, 4, 5], dtype=np.float32), device=tvm.cpu())
    f(a)
    print(a.numpy())
But got the error:
error: Internal Error: function must be of type Intrin, ScopeHandler or SpecialStmt, but it is function
|
13 | tvm.tir.call_packed("foo", A)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Inlining foo manually instead of using tvm.tir.call_packed works.
Am I using call_packed incorrectly? Do I need to convert A to some other type before passing it?
We cannot support call_packed in this case because foo is a Python function; a packed function needs to be registered globally so that it can be invoked by its string name.
I was able to invoke an external function using Relax:
import tvm
from tvm.script import relax as R
from tvm.script import ir as I
import numpy as np
@tvm.register_func("foo", override=True)
def foo(A, out):
    # NOTE(review): this rebinds the local name `out` to a fresh array;
    # the NDArray passed in as `out` is never written to.
    out = A.numpy()
    print(out)
@I.ir_module
class Module:
    @R.function
    def main(a: R.Tensor(("n",), dtype="float32")):
        # Dispatch to the registered packed function "foo" in
        # destination-passing style; the VM allocates `out` for it.
        with R.dataflow():
            out = R.call_dps_packed("foo", (a), out_sinfo=R.Tensor((n,), dtype="float32"))
            R.output(out)
        return out
if __name__ == "__main__":
    # Compile the relax module, run it on the VM, and print the result.
    ex = tvm.relax.build(Module, target="llvm")
    vm = tvm.relax.VirtualMachine(ex, tvm.cpu())
    a = tvm.nd.array(np.array([1, 2, 3, 4, 5], dtype=np.float32), device=tvm.cpu())
    out = vm["main"](a)
    print(out)
However, the print() from inside foo prints [1, 2, 3, 4, 5] (as expected), whereas the print() from __main__ prints something along the lines of [8.9683102e-44 0.0000000e+00 6.7262326e-44 0.0000000e+00 4.8746746e-35].
It feels like I’m somehow not handling the datatype packing correctly.
Hi @reikdas, your foo doesn’t make any effective change to out: the line out = A.numpy() rebinds the local variable out to the object returned by A.numpy(), rather than mutating the input NDArray out in place.
There are two ways to make it work properly. The first is to use out.copyfrom(A), which performs a device memory copy from A into out:
The second approach is to use call_pure_packed, in which way your customized foo needs to handle memory allocation for output. (Or like the example below, foo just returns A instead of allocating new NDArray, where the return value shares the same memory as A).
import numpy as np
import tvm
from tvm.script import ir as I
from tvm.script import relax as R
from tvm.script import tir as T
@tvm.register_func("foo", override=True)
def foo(A):
    # Return the input unchanged; the returned value shares A's memory.
    return A
@I.ir_module
class Module:
    @R.function
    def main(a: R.Tensor(("n",), dtype="float32")):
        # call_pure_packed: the callee is responsible for producing the
        # output itself (here "foo" simply returns its input).
        with R.dataflow():
            n = T.int64()
            out = R.call_pure_packed("foo", a, sinfo_args=R.Tensor((n,), dtype="float32"))
            R.output(out)
        return out
if __name__ == "__main__":
    # Compile for CPU, execute on the relax VM, and show the output tensor.
    ex = tvm.relax.build(Module, target="llvm")
    vm = tvm.relax.VirtualMachine(ex, tvm.cpu())
    a = tvm.nd.array(np.array([1, 2, 3, 4, 5], dtype=np.float32), device=tvm.cpu())
    out = vm["main"](a)
    print(out)