Standalone Sumatra Stream API Offload Demo

Here is a very simple example of using IntStream.range(...).forEach(...) in a way that can be offloaded to HSA.

First you need to build the Sumatra/Graal JDK as described in the Sumatra JDK build instructions. Note that at this time the Sumatra JDK is Linux-only, and all our testing has been done on Ubuntu 12.04 or later.

Here is our example code. It actually has two offloaded lambdas: one to initialize the inputs and one to do the sum:

package simple;

import java.util.stream.IntStream;

/**
 * Minimal Stream API demo: fills two int arrays, adds them element-wise
 * into a third array, and prints one line per element.
 */
public class Simple {

    public static void main(String[] args) {
        final int count = 8;
        int[] lhs = new int[count];
        int[] rhs = new int[count];
        int[] sum = new int[count];

        // Offloadable: fill both input arrays, one slot per stream element.
        IntStream.range(0, count).parallel().forEach(i -> {
            lhs[i] = 1;
            rhs[i] = 2;
        });

        // Offloadable: element-wise addition of the two inputs into sum[].
        IntStream.range(0, count).parallel().forEach(i -> sum[i] = lhs[i] + rhs[i]);

        // Not offloadable: printing ends up calling native code etc.,
        // so the results are reported from a plain sequential stream.
        IntStream.range(0, count).forEach(i -> {
            String row = sum[i] + ", " + lhs[i] + ", " + rhs[i];
            System.out.println(row);
        });
    }
}

Set JAVA_HOME to the Graal JDK you built:

$ export JAVA_HOME=/path/to/graal/jdk1.8.0-internal/product/

Build the Java code above. To run the demo, you need to use the proper version of OKRA, depending on whether you are using the HSA simulator or an HSA-capable APU. The simulator is explained here: https://wiki.openjdk.java.net/display/Sumatra/The+HSAIL+Simulator. The OKRA for APUs will be available at the HSA Foundation GitHub site, see https://github.com/HSAFoundation. When you have built or downloaded the proper OKRA, add the path to the OKRA binaries to your LD_LIBRARY_PATH and PATH:

$ export PATH=$PATH:/path/to/okra/dist/bin
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/okra/dist/bin

Now you can run the example with and without offloading enabled. Note that we are using -server for this work, so that the CPU code gets compiled with the regular server compiler and HSAIL is compiled with Graal. The option -Dcom.amd.sumatra.offload.immediate=true|false switches Stream API offload on and off. It is false by default; apart from this offload support, this is an otherwise normal JDK.

$ $JAVA_HOME/bin/java -server -esa -XX:+TraceGPUInteraction -Dcom.amd.sumatra.offload.immediate=true -G:Log=CodeGen simple.Simple
...
[HSAIL] library is libokra_x86_64.so
[GPU] registered initialization of Okra (total initialized: 2)
[CUDA] Ptx::get_execute_kernel_from_vm_address
[thread:1] scope:
[thread:1] scope: GraalCompiler
    [thread:1] scope: GraalCompiler.CodeGen
    Nothing to do here
    Nothing to do here
    Nothing to do here
    version 0:95: $full : $large;
// static method HotSpotMethod<Simple.lambda$main$0(int[], int[], int)>
kernel &run (
   align 8 kernarg_u64 %_arg0,
   align 8 kernarg_u64 %_arg1
   ) {
   ld_kernarg_u64 $d0, [%_arg0];
   ld_kernarg_u64 $d1, [%_arg1];
   workitemabsid_u32 $s0, 0;

@L0:
   cmp_eq_b1_u64 $c0, $d1, 0; // null test
   cbr $c0, @L1;
@L2:
   ld_global_s32 $s1, [$d1 + 12];
   cmp_ge_b1_u32 $c0, $s0, $s1;
   cbr $c0, @L8;
@L3:
   cmp_eq_b1_u64 $c0, $d0, 0; // null test
   cbr $c0, @L4;
@L5:
   ld_global_s32 $s1, [$d0 + 12];
   cmp_ge_b1_u32 $c0, $s0, $s1;
   cbr $c0, @L7;
@L6:
   cvt_s64_s32 $d2, $s0;
   mul_s64 $d2, $d2, 4;
   add_u64 $d0, $d0, $d2;
   st_global_s32 1, [$d0 + 16];
   cvt_s64_s32 $d0, $s0;
   mul_s64 $d0, $d0, 4;
   add_u64 $d1, $d1, $d0;
   st_global_s32 2, [$d1 + 16];
   ret;
@L1:
   mov_b32 $s0, -6155;
@L9:
   ret;
@L4:
   mov_b32 $s0, -4363;
   brn @L9;
@L8:
   mov_b32 $s0, -6683;
   brn @L9;
@L7:
   mov_b32 $s0, -4891;
   brn @L9;
};

[HSAIL] heap=0x00007f4c9801de38
[HSAIL] base=0x05a00000, capacity=209190912
External method:simple.Simple.lambda$main$0([I[II)V
installCode0: ExternalCompilationResult
[HSAIL] sig:([I[II)V args length=2, _parameter_count=3
[HSAIL] static method
[HSAIL] HSAILKernelArguments::do_array, _index=0, 0x82b20888, is a [I
[HSAIL] HSAILKernelArguments::do_array, _index=1, 0x82b208b8, is a [I
[HSAIL] HSAILKernelArguments::not pushing trailing int
[thread:1] scope: GraalCompiler
    [thread:1] scope: GraalCompiler.CodeGen
    Nothing to do here
    Nothing to do here
    Nothing to do here
    version 0:95: $full : $large;
// static method HotSpotMethod<Simple.lambda$main$1(int[], int[], int[], int)>
kernel &run (
   align 8 kernarg_u64 %_arg0,
   align 8 kernarg_u64 %_arg1,
   align 8 kernarg_u64 %_arg2
   ) {
   ld_kernarg_u64 $d0, [%_arg0];
   ld_kernarg_u64 $d1, [%_arg1];
   ld_kernarg_u64 $d2, [%_arg2];
   workitemabsid_u32 $s0, 0;

@L0:
   cmp_eq_b1_u64 $c0, $d0, 0; // null test
   cbr $c0, @L1;
@L2:
   ld_global_s32 $s1, [$d0 + 12];
   cmp_ge_b1_u32 $c0, $s0, $s1;
   cbr $c0, @L12;
@L3:
   cmp_eq_b1_u64 $c0, $d2, 0; // null test
   cbr $c0, @L4;
@L5:
   ld_global_s32 $s1, [$d2 + 12];
   cmp_ge_b1_u32 $c0, $s0, $s1;
   cbr $c0, @L11;
@L6:
   cmp_eq_b1_u64 $c0, $d1, 0; // null test
   cbr $c0, @L7;
@L8:
   ld_global_s32 $s1, [$d1 + 12];
   cmp_ge_b1_u32 $c0, $s0, $s1;
   cbr $c0, @L10;
@L9:
   cvt_s64_s32 $d3, $s0;
   mul_s64 $d3, $d3, 4;
   add_u64 $d1, $d1, $d3;
   ld_global_s32 $s1, [$d1 + 16];
   cvt_s64_s32 $d1, $s0;
   mul_s64 $d1, $d1, 4;
   add_u64 $d2, $d2, $d1;
   ld_global_s32 $s2, [$d2 + 16];
   add_s32 $s2, $s2, $s1;
   cvt_s64_s32 $d1, $s0;
   mul_s64 $d1, $d1, 4;
   add_u64 $d0, $d0, $d1;
   st_global_s32 $s2, [$d0 + 16];
   ret;
@L1:
   mov_b32 $s0, -7691;
@L13:
   ret;
@L4:
   mov_b32 $s0, -6411;
   brn @L13;
@L10:
   mov_b32 $s0, -5403;
   brn @L13;
@L7:
   mov_b32 $s0, -4875;
   brn @L13;
@L12:
   mov_b32 $s0, -8219;
   brn @L13;
@L11:
   mov_b32 $s0, -6939;
   brn @L13;
};

[HSAIL] heap=0x00007f4c9801de38
[HSAIL] base=0x05a00000, capacity=209190912
External method:simple.Simple.lambda$main$1([I[I[II)V
installCode0: ExternalCompilationResult
[HSAIL] sig:([I[I[II)V args length=3, _parameter_count=4
[HSAIL] static method
[HSAIL] HSAILKernelArguments::do_array, _index=0, 0x82b208f8, is a [I
[HSAIL] HSAILKernelArguments::do_array, _index=1, 0x82b20888, is a [I
[HSAIL] HSAILKernelArguments::do_array, _index=2, 0x82b208b8, is a [I
[HSAIL] HSAILKernelArguments::not pushing trailing int
3, 1, 2
3, 1, 2
3, 1, 2
3, 1, 2
3, 1, 2
3, 1, 2
3, 1, 2
3, 1, 2

The extra options -XX:+TraceGPUInteraction -G:Log=CodeGen let you see the progress and the HSAIL code produced for the offloaded lambdas.