Other Features¶
Author: Hongzheng Chen
This document will discuss other features that are not covered in the previous tutorials.
Dynamic Shapes¶
In some cases, the shape of the tensor is not known at compile time, so we can use [...]
to represent the dynamic shape.
From the generated MLIR module, we can see it has a "?"
in the shape of the tensor, which means the shape is not predefined,
but we can still run the LLVM module with arbitrary shapes of NumPy arrays.
import allo
from allo.ir.types import int32, float32
import numpy as np
# Copy the first `size` elements of A into B.
# `float32[...]` declares A and B with a dynamic shape, so their length is
# not fixed at compile time; the loop bound is passed in at run time instead.
def kernel(A: float32[...], B: float32[...], size: int32):
    for i in range(size):
        B[i] = A[i]
# Create a schedule for the dynamic-shape kernel.
s = allo.customize(kernel)
np_A = np.random.random((256,)).astype(np.float32)
allo_A = np.zeros((256,)).astype(np.float32)
# Build the executable module (the LLVM module mentioned in the text above).
mod = s.build()
# The array length is not part of the kernel's type, so it is passed
# explicitly as the third argument.
mod(np_A, allo_A, 256)
# The kernel copies A into B, so the output buffer must equal the input.
np.testing.assert_allclose(np_A, allo_A)
module {
func.func @kernel(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: i32) attributes {itypes = "__s", otypes = ""} {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %arg2 : i32 to index
%c1_i32 = arith.constant 1 : i32
%2 = arith.index_cast %c1_i32 : i32 to index
scf.for %arg3 = %0 to %1 step %2 {
%3 = memref.load %arg0[%arg3] {from = "A"} : memref<?xf32>
memref.store %3, %arg1[%arg3] {to = "B"} : memref<?xf32>
} {loop_name = "i", op_name = "S_i_0"}
We can also inspect the generated HLS code and confirm that the arguments are declared as pointers.
# Build the same schedule for the Vivado/Vitis HLS backend; the generated
# HLS C++ source is shown below.
code = s.build(target="vhls")
//===------------------------------------------------------------*- C++ -*-===//
// Automatically generated file for High-level Synthesis (HLS).
#include <algorithm>
#include <ap_axi_sdata.h>
#include <ap_fixed.h>
#include <ap_int.h>
#include <hls_math.h>
#include <hls_stream.h>
#include <math.h>
#include <stdint.h>
using namespace std;
void kernel(
float *v0,
float *v1,
int32_t v2
) { // L2
int v3 = v2; // L5
for (int v4 = 0; v4 < v3; v4 += 1) { // L8
float v5 = *v0[v4]; // L9
*v1[v4] = v5; // L10
Tuple Return¶
Another feature is tuple support. Just as a Python function can return multiple values, an Allo function can do the same when its return type is explicitly annotated as a tuple.
# Return the sum and the difference of two scalars as a tuple.
# The return annotation is written as an explicit tuple of types, which is
# how a multi-value return is declared.
def callee(a: float32, b: float32) -> (float32, float32):
    c: float32 = a + b
    d: float32 = a - b
    return c, d
# Apply `callee` element-wise to two length-10 vectors and return both
# result vectors (sums in C, differences in D) as a tuple.
def kernel(A: float32[10], B: float32[10]) -> (float32[10], float32[10]):
    # Both output buffers are zero-initialized before being filled.
    C: float32[10] = 0
    D: float32[10] = 0
    for i in range(10):
        # Unpack the two values returned by callee into the two outputs.
        C[i], D[i] = callee(A[i], B[i])
    return C, D
# Create a schedule for the tuple-returning kernel and build it.
s = allo.customize(kernel)
mod = s.build()
np_A = np.random.random((10,)).astype(np.float32)
np_B = np.random.random((10,)).astype(np.float32)
# The built module hands back both outputs, unpacked here as a pair.
np_C, np_D = mod(np_A, np_B)
# Compute reference results by calling `callee` directly as plain Python,
# one element at a time.
np_C_ref = np.zeros((10,), dtype=np.float32)
np_D_ref = np.zeros((10,), dtype=np.float32)
for i in range(10):
    np_C_ref[i], np_D_ref[i] = callee(np_A[i], np_B[i])
np.testing.assert_allclose(np_C, np_C_ref)
np.testing.assert_allclose(np_D, np_D_ref)
module {
func.func @callee(%arg0: f32, %arg1: f32) -> (f32, f32) attributes {itypes = "__", otypes = "__"} {
%0 = arith.addf %arg0, %arg1 : f32
%alloc = memref.alloc() {name = "c"} : memref<f32>
affine.store %0, %alloc[] {to = "c"} : memref<f32>
%1 = arith.subf %arg0, %arg1 : f32
%alloc_0 = memref.alloc() {name = "d"} : memref<f32>
affine.store %1, %alloc_0[] {to = "d"} : memref<f32>
%2 = affine.load %alloc[] {from = "c"} : memref<f32>
%3 = affine.load %alloc_0[] {from = "d"} : memref<f32>
return %2, %3 : f32, f32
func.func @kernel(%arg0: memref<10xf32>, %arg1: memref<10xf32>) -> (memref<10xf32>, memref<10xf32>) attributes {itypes = "__", otypes = "__"} {
%c0_i32 = arith.constant 0 : i32
%0 = arith.sitofp %c0_i32 : i32 to f32
%alloc = memref.alloc() {name = "C"} : memref<10xf32>
linalg.fill ins(%0 : f32) outs(%alloc : memref<10xf32>)
%c0_i32_0 = arith.constant 0 : i32
%1 = arith.sitofp %c0_i32_0 : i32 to f32
%alloc_1 = memref.alloc() {name = "D"} : memref<10xf32>
linalg.fill ins(%1 : f32) outs(%alloc_1 : memref<10xf32>)
affine.for %arg2 = 0 to 10 {
%2 = affine.load %arg0[%arg2] {from = "A"} : memref<10xf32>
%3 = affine.load %arg1[%arg2] {from = "B"} : memref<10xf32>
%4:2 = call @callee(%2, %3) : (f32, f32) -> (f32, f32)
affine.store %4#0, %alloc[%arg2] {to = "C"} : memref<10xf32>
affine.store %4#1, %alloc_1[%arg2] {to = "D"} : memref<10xf32>
} {loop_name = "i", op_name = "S_i_0"}
return %alloc, %alloc_1 : memref<10xf32>, memref<10xf32>
