Add Raja hello world and vector addition examples
authorJaemin Choi <jchoi157@illinois.edu>
Sun, 28 Apr 2019 22:45:18 +0000 (18:45 -0400)
committerJaemin Choi <jchoi157@illinois.edu>
Tue, 7 May 2019 21:00:51 +0000 (16:00 -0500)
Change-Id: I192b68ff73d605ab6cbbb658a751edec37237bdd

13 files changed:
examples/charm++/shared_runtimes/raja/Makefile.common [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/Makefile [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/README [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/hello.ci [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/hello.h [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/hello_charm.C [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/hello/hello_raja.cpp [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/Makefile [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/README [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/vecadd.ci [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/vecadd.h [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/vecadd_charm.C [new file with mode: 0644]
examples/charm++/shared_runtimes/raja/vecadd/vecadd_raja.cpp [new file with mode: 0644]

diff --git a/examples/charm++/shared_runtimes/raja/Makefile.common b/examples/charm++/shared_runtimes/raja/Makefile.common
new file mode 100644 (file)
index 0000000..b189c74
--- /dev/null
@@ -0,0 +1,10 @@
+# Absolute path to Raja install folder should be set here.
+RAJA_INSTALL_PATH ?= ~/raja/install
+
+RAJA_INC = $(RAJA_INSTALL_PATH)/include
+RAJA_LIB = $(RAJA_INSTALL_PATH)/lib
+
+# GPU compute capability should be used here, 3.5 and above recommended
+NVCC_OPTS = -restrict -arch sm_35 --expt-extended-lambda -O3 -Xcompiler=-fPIE -Xcompiler=-fopenmp -std=c++11 -x cu
+
+CHARMC = ../../../../../bin/charmc $(OPTS)
diff --git a/examples/charm++/shared_runtimes/raja/hello/Makefile b/examples/charm++/shared_runtimes/raja/hello/Makefile
new file mode 100644 (file)
index 0000000..4319b63
--- /dev/null
@@ -0,0 +1,24 @@
+-include ../../../../common.mk
+-include ../Makefile.common
+
+TARGET = hello
+
+all: $(TARGET)
+
+$(TARGET): $(TARGET)_raja.o $(TARGET)_charm.o
+       $(CHARMC) -language charm++ -fopenmp -L$(RAJA_LIB) -lRAJA -L$(CUDATOOLKIT_HOME)/lib64 -lcudart -o $@ $^
+
+$(TARGET)_raja.o: $(TARGET)_raja.cpp $(TARGET).h
+       nvcc $(NVCC_OPTS) -I$(RAJA_INC) -c $<
+
+$(TARGET).decl.h: $(TARGET).ci
+       $(CHARMC) $<
+
+$(TARGET)_charm.o: $(TARGET)_charm.C $(TARGET).decl.h $(TARGET).h
+       $(CHARMC) -c $< -I$(RAJA_INC) -I$(CUDATOOLKIT_HOME)/include
+
+clean:
+       rm -f $(TARGET) *.decl.h *.def.h *.o charmrun
+
+test: all
+       $(call run, ./$(TARGET) +p2)
diff --git a/examples/charm++/shared_runtimes/raja/hello/README b/examples/charm++/shared_runtimes/raja/hello/README
new file mode 100644 (file)
index 0000000..22f3ee0
--- /dev/null
@@ -0,0 +1,14 @@
+Hello World with Raja
+
+Demonstrates basic interoperability with Raja, using Charm++ nodegroup
+to spawn a separate instance of Raja on each process.
+
+Requires Raja to be built with GPU support.
+e.g. From Raja source folder,
+> mkdir build install
+> cd build
+> cmake -DENABLE_CUDA=On -DCMAKE_INSTALL_PREFIX=<path to RAJA install folder> ../
+> make -j
+> make install
+
+Path to Raja install folder and GPU compute capability should be set in Makefile.common.
diff --git a/examples/charm++/shared_runtimes/raja/hello/hello.ci b/examples/charm++/shared_runtimes/raja/hello/hello.ci
new file mode 100644 (file)
index 0000000..4bfd99b
--- /dev/null
@@ -0,0 +1,14 @@
+mainmodule hello {
+  readonly CProxy_Main mainProxy;
+  readonly CProxy_Hello helloProxy;
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry [reductiontarget] void done();
+  };
+
+  nodegroup Hello {
+    entry Hello();
+    entry void run();
+  };
+};
diff --git a/examples/charm++/shared_runtimes/raja/hello/hello.h b/examples/charm++/shared_runtimes/raja/hello/hello.h
new file mode 100644 (file)
index 0000000..e6c5b1c
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef HELLO_H_
+#define HELLO_H_
+#include <cstdint>
+
+void hello(const uint64_t n, int process);
+
+#endif // HELLO_H_
diff --git a/examples/charm++/shared_runtimes/raja/hello/hello_charm.C b/examples/charm++/shared_runtimes/raja/hello/hello_charm.C
new file mode 100644 (file)
index 0000000..e2f9186
--- /dev/null
@@ -0,0 +1,41 @@
+#include "hello.decl.h"
+#include "hello.h"                 // declares hello(n, process), implemented with RAJA in hello_raja.cpp
+#include "pup_stl.h"
+#include <stdio.h>
+#include <vector>
+#include <string>
+#include <typeinfo>
+
+/* readonly */ CProxy_Main mainProxy;
+/* readonly */ CProxy_Hello helloProxy;
+
+class Main : public CBase_Main {   // spawns the Hello nodegroup and waits for completion
+public:
+  Main(CkArgMsg* m) { mainProxy = thisProxy; // register proxy; the done() reduction callback targets it
+    // Create nodegroup and run
+    helloProxy = CProxy_Hello::ckNew();
+    helloProxy.run();
+  };
+
+  void done() {                    // reduction target: reached once every logical node has finished
+    CkPrintf("All done\n");
+
+    CkExit();
+  };
+};
+
+class Hello : public CBase_Hello { // one instance per logical node (nodegroup)
+public:
+  Hello() {}
+
+  void run() {
+    // Parallel execution with Raja
+    hello(16, CmiMyNode());        // 16 iterations per logical node
+
+    // Reduce to Main to end the program
+    CkCallback cb(CkReductionTarget(Main, done), mainProxy);
+    contribute(cb);
+  }
+};
+
+#include "hello.def.h"
diff --git a/examples/charm++/shared_runtimes/raja/hello/hello_raja.cpp b/examples/charm++/shared_runtimes/raja/hello/hello_raja.cpp
new file mode 100644 (file)
index 0000000..6986128
--- /dev/null
@@ -0,0 +1,8 @@
+#include "hello.h"
+#include "RAJA/RAJA.hpp" // NOTE(review): printf relies on <cstdio> arriving transitively via RAJA headers — confirm
+
+void hello(const uint64_t n, int process) { // prints one greeting per index i in [0, n), in parallel via OpenMP
+  RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0, n), [=] (int i) {
+    printf("[Process %d] Hello from i = %d\n", process, i);
+  });
+}
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/Makefile b/examples/charm++/shared_runtimes/raja/vecadd/Makefile
new file mode 100644 (file)
index 0000000..5f9ec84
--- /dev/null
@@ -0,0 +1,24 @@
+-include ../../../../common.mk
+-include ../Makefile.common
+
+TARGET = vecadd
+
+all: $(TARGET)
+
+$(TARGET): $(TARGET)_raja.o $(TARGET)_charm.o
+       $(CHARMC) -language charm++ -fopenmp -L$(RAJA_LIB) -lRAJA -L$(CUDATOOLKIT_HOME)/lib64 -lcudart -o $@ $^
+
+$(TARGET)_raja.o: $(TARGET)_raja.cpp $(TARGET).h
+       nvcc $(NVCC_OPTS) -I$(RAJA_INC) -c $<
+
+$(TARGET).decl.h: $(TARGET).ci
+       $(CHARMC) $<
+
+$(TARGET)_charm.o: $(TARGET)_charm.C $(TARGET).decl.h $(TARGET).h
+       $(CHARMC) -c $< -I$(RAJA_INC) -I$(CUDATOOLKIT_HOME)/include
+
+clean:
+       rm -f $(TARGET) *.decl.h *.def.h *.o charmrun
+
+test: all
+       $(call run, ./$(TARGET) +p2)
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/README b/examples/charm++/shared_runtimes/raja/vecadd/README
new file mode 100644 (file)
index 0000000..a5f2252
--- /dev/null
@@ -0,0 +1,18 @@
+Vector Addition with Raja
+
+Performs vector addition in parallel, utilizing Raja for within-node
+parallelism and Charm++ to run multiple processes (i.e. logical nodes) that
+can be executed in a distributed memory environment.
+
+Default Raja execution is OpenMP, but can be changed to use the available
+GPU devices instead (with CUDA) by providing '-g' as a command line argument.
+
+Requires Raja to be built with GPU support.
+e.g. From Raja source folder,
+> mkdir build install
+> cd build
+> cmake -DENABLE_CUDA=On -DCMAKE_INSTALL_PREFIX=<path to RAJA install folder> ../
+> make -j
+> make install
+
+Path to Raja install folder and GPU compute capability should be set in Makefile.common.
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/vecadd.ci b/examples/charm++/shared_runtimes/raja/vecadd/vecadd.ci
new file mode 100644 (file)
index 0000000..de6c16b
--- /dev/null
@@ -0,0 +1,17 @@
+mainmodule vecadd {
+  readonly CProxy_Main mainProxy;
+  readonly CProxy_Process processProxy;
+  readonly uint64_t n;
+  readonly bool use_gpu;
+  readonly int device_cnt;
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry [reductiontarget] void done();
+  };
+
+  nodegroup Process {
+    entry Process();
+    entry void run();
+  };
+};
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/vecadd.h b/examples/charm++/shared_runtimes/raja/vecadd/vecadd.h
new file mode 100644 (file)
index 0000000..8bbe5ea
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef VECADD_H_
+#define VECADD_H_
+#include <cstdint>
+
+void vecadd(const uint64_t n, int process, bool use_gpu);
+
+#endif // VECADD_H_
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/vecadd_charm.C b/examples/charm++/shared_runtimes/raja/vecadd/vecadd_charm.C
new file mode 100644 (file)
index 0000000..25ca738
--- /dev/null
@@ -0,0 +1,71 @@
+#include "vecadd.decl.h"
+#include "pup_stl.h"
+#include "vecadd.h"                // declares vecadd(n, process, use_gpu), implemented with RAJA
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+/* readonly */ CProxy_Main mainProxy;
+/* readonly */ CProxy_Process processProxy;
+/* readonly */ uint64_t n;
+/* readonly */ bool use_gpu;
+/* readonly */ int device_cnt;
+
+class Main : public CBase_Main {   // parses arguments, checks GPUs, launches the nodegroup
+public:
+  Main(CkArgMsg* m) { mainProxy = thisProxy; // register proxy; the done() reduction callback targets it
+    n = 128 * 1024 * 1024; // 128 M doubles by default
+    use_gpu = false;
+
+    // Command line parsing
+    int c;
+    while ((c = getopt(m->argc, m->argv, "n:g")) != -1) {
+      switch (c) {
+        case 'n':
+          n = strtoul(optarg, NULL, 10); // strtoul: atoi would truncate sizes above INT_MAX into the 64-bit n
+          break;
+        case 'g':
+          use_gpu = true;
+          break;
+        default:
+          CkExit();
+      }
+    }
+
+    CkPrintf("\n[Raja + Charm++ Vector Addition]\n");
+    CkPrintf("Vector size: %lu doubles\n", n);
+    CkPrintf("Use GPU: %s\n\n", use_gpu ? "Yes" : "No");
+
+    // Check for GPUs
+    cudaGetDeviceCount(&device_cnt); // reports 0 when no CUDA-capable device/driver is present
+    if (use_gpu && device_cnt <= 0) {
+      CkPrintf("CUDA capable devices not found, exiting...\n");
+      CkExit();
+    }
+
+    // Create nodegroup and run
+    processProxy = CProxy_Process::ckNew();
+    processProxy.run();
+  };
+
+  void done() {                    // reduction target: reached once every logical node has finished
+    CkPrintf("\nAll done\n");
+
+    CkExit();
+  };
+};
+
+class Process : public CBase_Process { // one instance per logical node (nodegroup)
+public:
+  Process() {}
+
+  void run() {
+    // Run vector addition
+    vecadd(n, CkMyNode(), use_gpu);
+
+    // Reduce to Main to end the program
+    CkCallback cb(CkReductionTarget(Main, done), mainProxy);
+    contribute(cb);
+  }
+};
+
+#include "vecadd.def.h"
diff --git a/examples/charm++/shared_runtimes/raja/vecadd/vecadd_raja.cpp b/examples/charm++/shared_runtimes/raja/vecadd/vecadd_raja.cpp
new file mode 100644 (file)
index 0000000..2abe197
--- /dev/null
@@ -0,0 +1,95 @@
+#include "vecadd.h"
+#include "RAJA/RAJA.hpp"
+#include <cstdio>
+#include <iostream>
+#include <cmath>                   // std::fabs for floating-point validation (replaces unused <typeinfo>)
+#include <chrono>
+
+#define CORRECT_VALUE 3.0
+
+void vecadd(const uint64_t n, int process, bool use_gpu) { // adds two n-element vectors and times each phase
+  double* h_a;
+  double* h_b;
+  double* d_a;
+  double* d_b;
+  if (use_gpu) {
+    // Vector addition using CUDA
+    cudaErrchk(cudaMallocHost((void**)&h_a, n * sizeof(double)));
+    cudaErrchk(cudaMalloc((void**)&d_a, n * sizeof(double)));
+    cudaErrchk(cudaMalloc((void**)&d_b, n * sizeof(double)));
+
+    auto start = std::chrono::steady_clock::now(); // steady_clock: monotonic, immune to wall-clock jumps
+    RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, n),
+      [=] RAJA_DEVICE (int i) {
+      d_a[i] = 1.0;
+      d_b[i] = 2.0;
+    });
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<double> elapsed = end - start;
+
+    std::cout << "[Process " << process << "] Vector initialization time on device (CUDA): " <<
+      elapsed.count() << std::endl;
+
+    start = std::chrono::steady_clock::now();
+    RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, n),
+      [=] RAJA_DEVICE (int i) {
+      d_a[i] += d_b[i];
+    });
+    end = std::chrono::steady_clock::now();
+    elapsed = end - start;
+    std::cout << "[Process " << process << "] Vector addition time on device (CUDA): " <<
+      elapsed.count() << std::endl;
+
+    start = std::chrono::steady_clock::now();
+    cudaErrchk(cudaMemcpy(h_a, d_a, n * sizeof(double), cudaMemcpyDeviceToHost));
+    end = std::chrono::steady_clock::now();
+    elapsed = end - start;
+    std::cout << "[Process " << process << "] Time for device -> host data movement: " <<
+      elapsed.count() << std::endl;
+  }
+  else {
+    // Vector addition using OpenMP
+    h_a = (double*)malloc(n * sizeof(double));
+    h_b = (double*)malloc(n * sizeof(double));
+
+    auto start = std::chrono::steady_clock::now();
+    RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0, n), [=] (int i) {
+      h_a[i] = 1.0;
+      h_b[i] = 2.0;
+    });
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<double> elapsed = end - start;
+    std::cout << "[Process " << process << "] Vector initialization time on host (OpenMP): " <<
+      elapsed.count() << std::endl;
+
+    start = std::chrono::steady_clock::now();
+    RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0, n), [=] (int i) {
+      h_a[i] += h_b[i];
+    });
+    end = std::chrono::steady_clock::now();
+    elapsed = end - start;
+    std::cout << "[Process " << process << "] Time on host (OpenMP): " <<
+      elapsed.count() << std::endl;
+  }
+
+  // Validate last element of the vector
+  double last_elem = h_a[n-1];
+  if (std::fabs(last_elem - CORRECT_VALUE) < 0.000001) { // std::fabs: plain abs() picks the int overload and truncates
+    std::cout << "[Process " << process << "] Last element validated" << std::endl;
+  }
+  else {
+    std::cout << "[Process " << process << "] Last element NOT validated: it is " <<
+      last_elem << ", but should be " << CORRECT_VALUE << std::endl;
+  }
+
+  // Free allocated memory
+  if (use_gpu) {
+    cudaErrchk(cudaFreeHost(h_a));
+    cudaErrchk(cudaFree(d_a));
+    cudaErrchk(cudaFree(d_b));
+  }
+  else {
+    free(h_a);
+    free(h_b);
+  }
+}