Custom Module Definition

Author: Yi-Hsiang Lai (seanlatias@github.com)

In this tutorial, we introduce a new API, hcl.def_, which allows users to define a hardware module.

import heterocl as hcl
import numpy as np

Defining a Hardware Module

Defining a hardware module is important because reusing a defined module reduces the resource usage of the design. To define a module, we write a normal Python function and apply a decorator to it. Within the decorator, we specify the shapes of the arguments. Below we show an example of a hardware module that returns the maximum of two tensors at a given index.

Note that in this example, we have three input arguments: A, B, and x. The first two arguments are tensors with shape (10,), while the last argument is a variable. To represent the shape of a variable, we use an empty tuple ().

Also note that we use hcl.return_ for the return value. In principle a module may contain multiple return statements (one per branch, as in the commented-out version below), but due to a current MLIR limitation hcl.return_ must be the last statement in the module, so the working example accumulates the result in a scalar and returns it once.
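
As a quick skeleton before the full example in the next section, the pattern looks like the following minimal sketch. The tensor T and the module name inc_at are purely illustrative; the declaration lists one shape per argument, with () marking the scalar index.

hcl.init()
T = hcl.placeholder((10,), "T")


def sketch_top(T):
    # One shape per argument; () marks the scalar index argument.
    # "inc_at" is a hypothetical module name used only for illustration.
    @hcl.def_([T.shape, ()])
    def inc_at(T, x):
        ret = hcl.scalar(0, "ret")
        ret.v = T[x] + 1
        hcl.return_(ret.v)

    return hcl.compute(T.shape, lambda i: inc_at(T, i), "out")


s = hcl.create_schedule([T], sketch_top)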

Use the Defined Module

Using the module is just like making a normal Python call; there is nothing special about it. Below we show an example of finding the element-wise maximum of four tensors.

hcl.init()


def maximum(A, B, C, D):
    # Due to MLIR limitation, hcl.return_ must be the
    # last statement in the module (i.e. region terminator)
    # TODO(Niansong): add checker pass for this case
    # @hcl.def_([A.shape, B.shape, ()])
    # def find_max(A, B, x):
    #     with hcl.if_(A[x] > B[x]):
    #         hcl.return_(A[x])
    #     with hcl.else_():
    #         hcl.return_(B[x])

    @hcl.def_([A.shape, B.shape, ()])
    def find_max(A, B, x):
        ret = hcl.scalar(0, "ret")
        with hcl.if_(A[x] > B[x]):
            ret.v = A[x]
        with hcl.else_():
            ret.v = B[x]
        hcl.return_(ret.v)

    max_1 = hcl.compute(A.shape, lambda x: find_max(A, B, x), "max_1")
    max_2 = hcl.compute(A.shape, lambda x: find_max(C, D, x), "max_2")
    return hcl.compute(A.shape, lambda x: find_max(max_1, max_2, x), "max_o")

We can first inspect the generated IR. You can see that each of the three computations reuses the same find_max module to find the maximum.

A = hcl.placeholder((10,), "A")
B = hcl.placeholder((10,), "B")
C = hcl.placeholder((10,), "C")
D = hcl.placeholder((10,), "D")

s = hcl.create_schedule([A, B, C, D], maximum)
print(s.ast)
print(hcl.lower(s))
/__w/heterocl/heterocl/hcl-dialect/build/tools/hcl/python_packages/hcl_core/hcl_mlir/exceptions.py:75: DeprecationWarning:
[Deprecation] hcl.def_() is deprecated, please use .outline() instead.
  warnings.warn(self.message, category=self.category)
func top(['A', 'B', 'C', 'D']) {
  func find_max(['A', 'B', 'x']) {
      ret = compute((1,), Int(32)) (x_0) {
        ret[x_0] = 0
      }

      if (A[x] gt B[x]) {
        ret[0] = A[x]
      }
      else {
        ret[0] = B[x]
      }
    return ['getitem']
  }

  max_1 = compute((10,), Int(32)) (x) {
    max_1[x] = ['getitem'] = call find_max(['A', 'B', 'x'])
  }

  max_2 = compute((10,), Int(32)) (x_1) {
    max_2[x_1] = ['getitem'] = call find_max(['C', 'D', 'x_1'])
  }

  max_o = compute((10,), Int(32)) (x_2) {
    max_o[x_2] = ['getitem'] = call find_max(['max_1', 'max_2', 'x_2'])
  }

  return ['max_o']
}

module {
  func.func @find_max(%arg0: memref<10xi32>, %arg1: memref<10xi32>, %arg2: index) -> i32 attributes {itypes = "ssu", otypes = "s"} {
    %c0 = arith.constant 0 : index
    %0 = memref.alloc() {name = "ret"} : memref<1xi32>
    %c0_i32 = arith.constant 0 : i32
    affine.store %c0_i32, %0[%c0] {to = "ret"} : memref<1xi32>
    %1 = memref.load %arg0[%arg2] {from = "A"} : memref<10xi32>
    %2 = memref.load %arg1[%arg2] {from = "B"} : memref<10xi32>
    %3 = arith.cmpi sgt, %1, %2 : i32
    scf.if %3 {
      %5 = memref.load %arg0[%arg2] {from = "A"} : memref<10xi32>
      affine.store %5, %0[0] {to = "ret"} : memref<1xi32>
    } else {
      %5 = memref.load %arg1[%arg2] {from = "B"} : memref<10xi32>
      affine.store %5, %0[0] {to = "ret"} : memref<1xi32>
    }
    %4 = affine.load %0[0] {from = "ret"} : memref<1xi32>
    return %4 : i32
  }
  func.func @top(%arg0: memref<10xi32>, %arg1: memref<10xi32>, %arg2: memref<10xi32>, %arg3: memref<10xi32>) -> memref<10xi32> attributes {itypes = "ssss", otypes = "s"} {
    %0 = memref.alloc() {name = "max_1"} : memref<10xi32>
    affine.for %arg4 = 0 to 10 {
      %3 = func.call @find_max(%arg0, %arg1, %arg4) : (memref<10xi32>, memref<10xi32>, index) -> i32
      affine.store %3, %0[%arg4] {to = "max_1"} : memref<10xi32>
    } {loop_name = "x", op_name = "max_1"}
    %1 = memref.alloc() {name = "max_2"} : memref<10xi32>
    affine.for %arg4 = 0 to 10 {
      %3 = func.call @find_max(%arg2, %arg3, %arg4) : (memref<10xi32>, memref<10xi32>, index) -> i32
      affine.store %3, %1[%arg4] {to = "max_2"} : memref<10xi32>
    } {loop_name = "x_1", op_name = "max_2"}
    %2 = memref.alloc() {name = "max_o"} : memref<10xi32>
    affine.for %arg4 = 0 to 10 {
      %3 = func.call @find_max(%0, %1, %arg4) : (memref<10xi32>, memref<10xi32>, index) -> i32
      affine.store %3, %2[%arg4] {to = "max_o"} : memref<10xi32>
    } {loop_name = "x_2", op_name = "max_o"}
    return %2 : memref<10xi32>
  }
}

Finally, let’s run the algorithm and check the results.

f = hcl.build(s)

a = np.random.randint(100, size=(10,))
b = np.random.randint(100, size=(10,))
c = np.random.randint(100, size=(10,))
d = np.random.randint(100, size=(10,))
o = np.zeros(10)

hcl_A = hcl.asarray(a)
hcl_B = hcl.asarray(b)
hcl_C = hcl.asarray(c)
hcl_D = hcl.asarray(d)
hcl_O = hcl.asarray(o, dtype=hcl.Int())

f(hcl_A, hcl_B, hcl_C, hcl_D, hcl_O)

print("Input tensors:")
print(hcl_A)
print(hcl_B)
print(hcl_C)
print(hcl_D)
print("Output tensor:")
print(hcl_O)

# Test the correctness
m1 = np.maximum(a, b)
m2 = np.maximum(c, d)
m = np.maximum(m1, m2)
assert np.array_equal(hcl_O.asnumpy(), m)
Input tensors:
array([ 4, 22, 59, 10, 80, 83, 77, 53, 39, 49])
array([70,  5, 93, 12, 51, 64, 20, 21, 87, 62])
array([18, 13, 44, 52, 12, 30, 28, 63, 16, 43])
array([23, 98, 81, 48,  8, 34, 32, 76, 26, 20])
Output tensor:
array([70, 98, 93, 52, 80, 83, 77, 76, 87, 62])

Modules Without a Return Statement

HeteroCL also allows users to define modules without a return statement. The usage is exactly the same as what we just introduced. The only difference is that such a module can be called in a stand-alone way; namely, it does not need to be wrapped in any HeteroCL API. Let’s reuse the example of finding the maximum, but this time update the output tensor directly.

hcl.init()


def maximum2(A, B, C, D):
    # The second argument is updated in place to hold the element-wise maximum
    @hcl.def_([A.shape, B.shape])
    def find_max(A, B):
        with hcl.for_(0, A.shape[0]) as i:
            with hcl.if_(A[i] > B[i]):
                B[i] = A[i]

    find_max(A, B)
    find_max(C, D)
    find_max(B, D)


s = hcl.create_schedule([A, B, C, D], maximum2)
f = hcl.build(s)

In the above example, since the module has no return value, we can call it directly inside the top-level function. Let’s check the results; they should be the same as in our first example.

f(hcl_A, hcl_B, hcl_C, hcl_D)

print("Output tensor:")
print(hcl_D)

# Test the correctness
m1 = np.maximum(a, b)
m2 = np.maximum(c, d)
m = np.maximum(m1, m2)
assert np.array_equal(hcl_D.asnumpy(), m)
Output tensor:
array([70, 98, 93, 52, 80, 83, 77, 76, 87, 62])

Data Type Customization for Modules

We can also apply data type customization to our defined modules. There are two ways to do this: we can either specify the data types directly in the module decorator, or use the quantize and downsize APIs. Below we show how to downsize the first example.
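
For completeness, here is a minimal sketch of the first approach, i.e., carrying the data types in the decorator itself. The dtypes and ret_dtype keyword arguments shown here follow the hcl.def_ signature of earlier HeteroCL releases and are an assumption, so verify them against your installed version; the downsize flow below is the one actually exercised in this tutorial.

def maximum_u4(A, B, C, D):
    # Assumption: hcl.def_ accepts dtypes/ret_dtype keyword arguments
    # (as in earlier HeteroCL releases) to type each argument and the
    # return value. This sketch is not built further in this tutorial.
    @hcl.def_(
        [A.shape, B.shape, ()],
        dtypes=[hcl.UInt(4), hcl.UInt(4), hcl.Int(32)],
        ret_dtype=hcl.UInt(4),
    )
    def find_max(A, B, x):
        ret = hcl.scalar(0, "ret", dtype=hcl.UInt(4))
        with hcl.if_(A[x] > B[x]):
            ret.v = A[x]
        with hcl.else_():
            ret.v = B[x]
        hcl.return_(ret.v)

    max_1 = hcl.compute(A.shape, lambda x: find_max(A, B, x), "max_1", dtype=hcl.UInt(4))
    max_2 = hcl.compute(A.shape, lambda x: find_max(C, D, x), "max_2", dtype=hcl.UInt(4))
    return hcl.compute(A.shape, lambda x: find_max(max_1, max_2, x), "max_o", dtype=hcl.UInt(4))

We now walk through the second approach, using downsize.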

A = hcl.placeholder((10,), dtype=hcl.UInt(4))
B = hcl.placeholder((10,), dtype=hcl.UInt(4))
C = hcl.placeholder((10,), dtype=hcl.UInt(4))
D = hcl.placeholder((10,), dtype=hcl.UInt(4))

s = hcl.create_scheme([A, B, C, D], maximum)
# We also need to downsize the intermediate results
s.downsize([maximum.max_1, maximum.max_2], hcl.UInt(4))
s = hcl.create_schedule_from_scheme(s)
f = hcl.build(s)

Let’s run it.

hcl_A = hcl.asarray(a, hcl.UInt(4))
hcl_B = hcl.asarray(b, hcl.UInt(4))
hcl_C = hcl.asarray(c, hcl.UInt(4))
hcl_D = hcl.asarray(d, hcl.UInt(4))
hcl_O = hcl.asarray(o)

f(hcl_A, hcl_B, hcl_C, hcl_D, hcl_O)

print("Downsized output tensor:")
print(hcl_O)
Downsized output tensor:
array([ 7, 13, 13, 12, 12, 14, 13, 15, 10, 14])

We can see that the results are downsized to 4-bit numbers: since UInt(4) keeps only the lowest four bits, each input is effectively reduced modulo 16 before the comparison. We can double-check this.

# Test the correctness
m1 = np.maximum(a % 16, b % 16)
m2 = np.maximum(c % 16, d % 16)
m = np.maximum(m1, m2)
assert np.array_equal(hcl_O.asnumpy(), m)
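
As a per-element sanity check against the run shown above: the first inputs printed earlier are 4, 70, 18, and 23, and keeping only the lowest four bits gives 4, 6, 2, and 7, whose maximum is 7.

# First-element check using the inputs printed above (4, 70, 18, 23).
print(max(4 % 16, 70 % 16, 18 % 16, 23 % 16))  # prints 7, matching the first output value above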
