Add Kokkos vector addition example and move examples to new directory 75/4975/9
authorJaemin Choi <jchoi157@illinois.edu>
Mon, 25 Feb 2019 06:59:12 +0000 (01:59 -0500)
committerRonak Buch <rabuch2@illinois.edu>
Thu, 21 Mar 2019 20:27:35 +0000 (15:27 -0500)
Change-Id: I22b83d5b8a4cbc78a3186a9916afa622abd0758c

17 files changed:
examples/charm++/kokkos/hello-dist/Makefile [deleted file]
examples/charm++/kokkos/hello-dist/README [deleted file]
examples/charm++/kokkos/hello/Makefile [deleted file]
examples/charm++/kokkos/hello/README [deleted file]
examples/charm++/kokkos/hello/hello.C [deleted file]
examples/charm++/kokkos/hello/hello.ci [deleted file]
examples/charm++/shared_runtimes/kokkos/Makefile.common [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/hello/Makefile [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/hello/README [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/hello/hello.C [moved from examples/charm++/kokkos/hello-dist/hello.C with 97% similarity]
examples/charm++/shared_runtimes/kokkos/hello/hello.ci [moved from examples/charm++/kokkos/hello-dist/hello.ci with 100% similarity]
examples/charm++/shared_runtimes/kokkos/vecadd/Makefile [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/vecadd/README [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.ci [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.h [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_charm.C [new file with mode: 0644]
examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_kokkos.cpp [new file with mode: 0644]

diff --git a/examples/charm++/kokkos/hello-dist/Makefile b/examples/charm++/kokkos/hello-dist/Makefile
deleted file mode 100644 (file)
index 2d9e7c5..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
--include ../../../common.mk
-CHARMC = ../../../../bin/charmc $(OPTS)
-
-# Paths need to be set for include and lib folders of Kokkos
-# KOKKOS_INC and KOKKOS_LIB respectively
-KOKKOS_INC ?= /home/jchoi157/kokkos/build/include
-KOKKOS_LIB ?= /home/jchoi157/kokkos/build/lib
-
-OBJS = hello.o
-
-all: hello
-
-hello: $(OBJS)
-       $(CHARMC) -language charm++ -L$(KOKKOS_LIB) -lkokkos -lpthread -o hello $(OBJS)
-
-hello.decl.h: hello.ci
-       $(CHARMC) hello.ci
-
-clean:
-       rm -f *.decl.h *.def.h *.o hello charmrun
-
-hello.o: hello.C hello.decl.h
-       $(CHARMC) -I$(KOKKOS_INC) -c hello.C
-
-test: all
-       $(call run, ./hello +p2 ++local)
diff --git a/examples/charm++/kokkos/hello-dist/README b/examples/charm++/kokkos/hello-dist/README
deleted file mode 100644 (file)
index c5b287f..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-Hello World with Kokkos (Distributed)
-
-Demonstrates basic interoperability with Kokkos.
-Paths to include and lib folders of Kokkos (KOKKOS_INC and KOKKOS_LIB, respecitvely)
-need to be set as environment variables or provided to the Makefile.
-Uses nodegroup to allow a separate instance of Kokkos to run on each process.
-The tricky part is getting argc and argv fed correctly to Kokkos::initialize().
diff --git a/examples/charm++/kokkos/hello/Makefile b/examples/charm++/kokkos/hello/Makefile
deleted file mode 100644 (file)
index 12eb2e7..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
--include ../../../common.mk
-CHARMC = ../../../../bin/charmc $(OPTS)
-
-# Paths need to be set for include and lib folders of Kokkos
-# KOKKOS_INC and KOKKOS_LIB respectively
-KOKKOS_INC ?= /home/jchoi157/kokkos/build/include
-KOKKOS_LIB ?= /home/jchoi157/kokkos/build/lib
-
-OBJS = hello.o
-
-all: hello
-
-hello: $(OBJS)
-       $(CHARMC) -language charm++ -L$(KOKKOS_LIB) -lkokkos -lpthread -o hello $(OBJS)
-
-hello.decl.h: hello.ci
-       $(CHARMC) hello.ci
-
-clean:
-       rm -f *.decl.h *.def.h *.o hello charmrun
-
-hello.o: hello.C hello.decl.h
-       $(CHARMC) -I$(KOKKOS_INC) -c hello.C
-
-test: all
-       $(call run, ./hello)
diff --git a/examples/charm++/kokkos/hello/README b/examples/charm++/kokkos/hello/README
deleted file mode 100644 (file)
index c3334dd..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-Hello World with Kokkos
-
-Demonstrates basic interoperability with Kokkos.
-Paths to include and lib folders of Kokkos (KOKKOS_INC and KOKKOS_LIB, respecitvely)
-need to be set as environment variables or provided to the Makefile.
diff --git a/examples/charm++/kokkos/hello/hello.C b/examples/charm++/kokkos/hello/hello.C
deleted file mode 100644 (file)
index 6f18eef..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <stdio.h>
-#include "hello.decl.h"
-#include <Kokkos_Core.hpp>
-#include <typeinfo>
-
-/* readonly */ CProxy_Main mainProxy;
-
-class Main : public CBase_Main {
-public:
-  Main(CkArgMsg* m) {
-    // Initialize Kokkos
-    Kokkos::initialize(m->argc, m->argv);
-
-    CkPrintf("Hello World on Kokkos execution space %s\n",
-             typeid(Kokkos::DefaultExecutionSpace).name());
-
-    // Parallel execution with Kokkos
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-    Kokkos::parallel_for(16, KOKKOS_LAMBDA (const int i) {
-      printf("Hello from i = %i\n", i);
-    });
-#endif
-
-    done();
-  };
-
-  void done()
-  {
-    CkPrintf("All done\n");
-
-    // Finalize Kokkos and exit
-    Kokkos::finalize();
-    CkExit();
-  };
-};
-
-#include "hello.def.h"
diff --git a/examples/charm++/kokkos/hello/hello.ci b/examples/charm++/kokkos/hello/hello.ci
deleted file mode 100644 (file)
index ec57df1..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-mainmodule hello {
-  readonly CProxy_Main mainProxy;
-
-  mainchare Main {
-    entry Main(CkArgMsg *m);
-    entry void done();
-  };
-};
diff --git a/examples/charm++/shared_runtimes/kokkos/Makefile.common b/examples/charm++/shared_runtimes/kokkos/Makefile.common
new file mode 100644 (file)
index 0000000..7c85000
--- /dev/null
@@ -0,0 +1,13 @@
+# Absolute paths to Kokkos build folders should be set here.
+KOKKOS_OMP_BUILD_PATH ?= /home/kokkos/build-omp
+KOKKOS_CUDA_BUILD_PATH ?= /home/kokkos/build-cuda
+
+KOKKOS_OMP_INC = $(KOKKOS_OMP_BUILD_PATH)/include
+KOKKOS_OMP_LIB = $(KOKKOS_OMP_BUILD_PATH)/lib
+KOKKOS_CUDA_INC = $(KOKKOS_CUDA_BUILD_PATH)/include
+KOKKOS_CUDA_LIB = $(KOKKOS_CUDA_BUILD_PATH)/lib
+
+NVCC_WRAPPER = $(KOKKOS_CUDA_BUILD_PATH)/bin/nvcc_wrapper
+NVCC_OPTS = -O3 -fopenmp -Wfatal-errors
+
+CHARMC = ../../../../../bin/charmc $(OPTS)
diff --git a/examples/charm++/shared_runtimes/kokkos/hello/Makefile b/examples/charm++/shared_runtimes/kokkos/hello/Makefile
new file mode 100644 (file)
index 0000000..d37cc4b
--- /dev/null
@@ -0,0 +1,21 @@
+-include ../../../../common.mk
+-include ../Makefile.common
+
+TARGET = hello
+
+all: $(TARGET)
+
+$(TARGET): $(TARGET).o
+       $(CHARMC) -language charm++ -fopenmp -L$(KOKKOS_OMP_LIB) -lkokkos -o $@ $^
+
+$(TARGET).o: $(TARGET).C $(TARGET).decl.h
+       $(CHARMC) -fopenmp -I$(KOKKOS_OMP_INC) -c $<
+
+$(TARGET).decl.h: $(TARGET).ci
+       $(CHARMC) $<
+
+clean:
+       rm -f $(TARGET) *.decl.h *.def.h *.o charmrun
+
+test: all
+       $(call run, ./$(TARGET) +p2)
diff --git a/examples/charm++/shared_runtimes/kokkos/hello/README b/examples/charm++/shared_runtimes/kokkos/hello/README
new file mode 100644 (file)
index 0000000..e45e051
--- /dev/null
@@ -0,0 +1,16 @@
+Hello World with Kokkos
+
+Demonstrates basic interoperability with Kokkos, using Charm++ nodegroup
+to spawn a separate instance of Kokkos on each process. It also shows how
+command line arguments can be packed to initialize Kokkos on all the processes.
+
+Requires OpenMP build of Kokkos for execution.
+e.g. From Kokkos source folder,
+> mkdir build-omp
+> cd build-omp
+> ../generate_makefile.bash --prefix=<absolute path to build-omp> --with-openmp
+                            --arch=BDW
+> make -j kokkoslib
+> make install
+
+Path to OpenMP build of Kokkos should be set in Makefile.common.
similarity index 97%
rename from examples/charm++/kokkos/hello-dist/hello.C
rename to examples/charm++/shared_runtimes/kokkos/hello/hello.C
index b28910f1a4c57b7cf4a2d6925788a6b4d2073de9..6fbc1e8781c55130bf47725a9e87e9dfc3a997b1 100644 (file)
@@ -13,7 +13,7 @@
 class Main : public CBase_Main {
 public:
   Main(CkArgMsg* m) {
-    // Pack arguments
+    // Pack arguments (optional)
     std::vector<std::string> args;
     for (int i = 0; i < m->argc; i++) {
       args.push_back(std::string(m->argv[i]));
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/Makefile b/examples/charm++/shared_runtimes/kokkos/vecadd/Makefile
new file mode 100644 (file)
index 0000000..100abe7
--- /dev/null
@@ -0,0 +1,24 @@
+-include ../../../../common.mk
+-include ../Makefile.common
+
+TARGET = vecadd
+
+all: $(TARGET)
+
+$(TARGET): $(TARGET)_kokkos.o $(TARGET)_charm.o
+       $(CHARMC) -language charm++ -fopenmp -L$(KOKKOS_CUDA_LIB) -lkokkos -L$(CUDATOOLKIT_HOME)/lib64 -lcudart -o $@ $^
+
+$(TARGET)_kokkos.o: $(TARGET)_kokkos.cpp $(TARGET).h
+       $(NVCC_WRAPPER) $(NVCC_OPTS) -c -I$(KOKKOS_CUDA_INC) $<
+
+$(TARGET).decl.h: $(TARGET).ci
+       $(CHARMC) $<
+
+$(TARGET)_charm.o: $(TARGET)_charm.C $(TARGET).decl.h $(TARGET).h
+       $(CHARMC) -c $< -I$(KOKKOS_CUDA_INC) -I$(CUDATOOLKIT_HOME)/include
+
+clean:
+       rm -f $(TARGET) *.decl.h *.def.h *.o charmrun
+
+test: all
+       $(call run, ./$(TARGET) +p2)
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/README b/examples/charm++/shared_runtimes/kokkos/vecadd/README
new file mode 100644 (file)
index 0000000..bbd0332
--- /dev/null
@@ -0,0 +1,22 @@
+Vector Addition with Kokkos
+
+Performs vector addition in parallel, utilizing Kokkos for within-node and
+Charm++ to run multiple processes (i.e. logical nodes) that can be executed
+in a distributed memory environment.
+
+Default Kokkos execution is OpenMP, but can be changed to use the available
+GPU devices instead (with CUDA) by providing '-g' as a command line argument.
+
+Requires Kokkos to be built for both OpenMP and CUDA.
+e.g. From Kokkos source folder,
+> mkdir build-cuda
+> cd build-cuda
+> ../generate_makefile.bash --prefix=<absolute path to build-cuda>
+                            --with-cuda=<path to CUDA toolkit>
+                            --with-cuda-options=enable_lambda
+                            --with-openmp --arch=BDW,Pascal60
+                            --compiler=<path to included NVCC wrapper>
+> make -j kokkoslib
+> make install
+
+Path to OpenMP + CUDA build of Kokkos should be set in Makefile.common.
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.ci b/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.ci
new file mode 100644 (file)
index 0000000..de6c16b
--- /dev/null
@@ -0,0 +1,17 @@
+mainmodule vecadd {
+  readonly CProxy_Main mainProxy; // target of the reduction callback built in Process::run()
+  readonly CProxy_Process processProxy; // proxy to the Process nodegroup created by Main
+  readonly uint64_t n; // vector length in doubles ('-n' option, 128 M by default)
+  readonly bool use_gpu; // true when '-g' was given on the command line
+  readonly int device_cnt; // filled by cudaGetDeviceCount() in Main
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry [reductiontarget] void done(); // prints "All done" and calls CkExit()
+  };
+
+  nodegroup Process {
+    entry Process(); // initializes Kokkos
+    entry void run(); // runs vecadd, finalizes Kokkos, then contributes to the reduction
+  };
+};
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.h b/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd.h
new file mode 100644 (file)
index 0000000..689c557
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef VECADD_H_
+#define VECADD_H_
+#include <cstdint>
+
+void kokkosInit(); // Calls Kokkos::initialize() with no arguments
+void kokkosInit(int device_id); // Calls Kokkos::initialize() with InitArguments::device_id set to device_id
+void kokkosFinalize(); // Calls Kokkos::finalize()
+void vecadd(const uint64_t n, int process, bool use_gpu); // n: vector length; process: id shown in output; use_gpu: CUDA path if true, OpenMP otherwise
+
+#endif // VECADD_H_
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_charm.C b/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_charm.C
new file mode 100644 (file)
index 0000000..7ba25ce
--- /dev/null
@@ -0,0 +1,87 @@
+#include "vecadd.decl.h"
+#include "pup_stl.h"
+#include "vecadd.h"
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+/* readonly */ CProxy_Main mainProxy;
+/* readonly */ CProxy_Process processProxy;
+/* readonly */ uint64_t n;
+/* readonly */ bool use_gpu;
+/* readonly */ int device_cnt;
+
+class Main : public CBase_Main { // Mainchare: parses the command line, then creates and starts the Process nodegroup
+public:
+  Main(CkArgMsg* m) {
+    mainProxy = thisProxy; // Process::run() targets Main::done() through this readonly proxy
+    n = 128 * 1024 * 1024; // 128 M doubles by default
+    use_gpu = false;
+
+    // Command line parsing: -n <vector size>, -g (use GPUs)
+    int c;
+    while ((c = getopt(m->argc, m->argv, "n:g")) != -1) {
+      switch (c) {
+        case 'n':
+          n = strtoull(optarg, NULL, 10); // n is 64-bit; atoi() would truncate sizes beyond INT_MAX
+          break;
+        case 'g':
+          use_gpu = true;
+          break;
+        default:
+          CkExit();
+      }
+    }
+
+    CkPrintf("\n[Kokkos + Charm++ Vector Addition]\n");
+    CkPrintf("Vector size: %llu doubles\n", static_cast<unsigned long long>(n)); // %lu is not 64-bit everywhere
+    CkPrintf("Use GPU: %s\n\n", use_gpu ? "Yes" : "No");
+
+    // Check for GPUs; a failed query is treated the same as having no devices
+    if (cudaGetDeviceCount(&device_cnt) != cudaSuccess) device_cnt = 0;
+    if (use_gpu && device_cnt <= 0) {
+      CkPrintf("CUDA capable devices not found, exiting...\n");
+      CkExit();
+    }
+
+    // Create nodegroup and run
+    processProxy = CProxy_Process::ckNew();
+    processProxy.run();
+  };
+  // Reduction target named by the callback in Process::run(); ends the program
+  void done() {
+    CkPrintf("\nAll done\n");
+    CkExit();
+  };
+};
+
+class Process : public CBase_Process {
+public:
+  // Initializes Kokkos. Needs to be done on every process
+  Process() {
+    if (!use_gpu) {
+      kokkosInit();
+      return;
+    }
+
+    // Map the processes that share a physical node onto its GPUs in round-robin.
+    const int procs_per_host = CkNumNodes() / CmiNumPhysicalNodes();
+    const int rank_on_host = CkMyNode() % procs_per_host;
+    const int gpu_id = rank_on_host % device_cnt;
+    kokkosInit(gpu_id);
+  }
+
+  // Runs the vector addition, shuts Kokkos down and reports completion to Main.
+  void run() {
+    // Run vector addition
+    vecadd(n, CkMyNode(), use_gpu);
+
+    // Finalize Kokkos. Needs to be done on every process
+    kokkosFinalize();
+
+    // Reduce to Main to end the program
+    CkCallback done_cb(CkReductionTarget(Main, done), mainProxy);
+    contribute(done_cb);
+  }
+};
+
+#include "vecadd.def.h"
diff --git a/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_kokkos.cpp b/examples/charm++/shared_runtimes/kokkos/vecadd/vecadd_kokkos.cpp
new file mode 100644 (file)
index 0000000..22d5bde
--- /dev/null
@@ -0,0 +1,115 @@
+#include "vecadd.h"
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <iostream>
+#include <typeinfo>
+#include <impl/Kokkos_Timer.hpp>
+
+#define CORRECT_VALUE 3.0
+
+// Column-major layout on GPU for coalesced accesses
+typedef Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> CudaView;
+typedef Kokkos::View<double*,Kokkos::LayoutRight,Kokkos::CudaHostPinnedSpace> HostView;
+
+// Functors
+template <typename ViewType>
+struct Fill {
+  double value; // Constant written to every element
+  ViewType a; // View being filled
+  // Captures the fill constant and the destination view.
+  Fill(const double& fill_value, const ViewType& target) : value(fill_value), a(target) {}
+  // Called by Kokkos::parallel_for with each index of the range policy.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& idx) const {
+    a(idx) = value;
+  }
+};
+
+template <typename ViewType>
+struct Compute {
+  ViewType a, b; // a is an operand and also receives the sum; b is only read
+  // Captures the two operand views.
+  Compute(const ViewType& lhs, const ViewType& rhs) : a(lhs), b(rhs) {}
+  // Called by Kokkos::parallel_for with each index: adds b(idx) into a(idx).
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& idx) const {
+    a(idx) = a(idx) + b(idx);
+  }
+};
+
+void kokkosInit() { // Initializes Kokkos without passing any arguments
+  Kokkos::initialize();
+}
+
+void kokkosInit(int device_id) { // Initializes Kokkos, passing device_id through InitArguments
+  Kokkos::InitArguments args; // default-constructed; only device_id is overridden below
+  args.device_id = device_id;
+  Kokkos::initialize(args);
+}
+
+void kokkosFinalize() { // Shuts Kokkos down; counterpart of the kokkosInit() overloads
+  Kokkos::finalize();
+}
+
+void vecadd(const uint64_t n, int process, bool use_gpu) { // n: vector length; process: id shown in log lines; use_gpu: CUDA path if true, OpenMP otherwise
+#ifdef DEBUG
+  std::cout << "[Process " << process << "] " << "Default execution space: " <<
+    typeid(Kokkos::DefaultExecutionSpace).name() << std::endl;
+  std::cout << "[Process " << process << "] " << "Default host execution space: " <<
+    typeid(Kokkos::DefaultHostExecutionSpace).name() << std::endl;
+#endif
+  // Fills a with 1.0 and b with 2.0, computes a += b (timing each phase), then checks the last element of a.
+  HostView h_a("Host A", n); // Used for validation with CUDA
+  if (use_gpu) {
+    // Vector addition using CUDA
+    CudaView d_a("Device A", n);
+    CudaView d_b("Device B", n);
+
+    Kokkos::Timer timer;
+
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::Cuda>(0, n), Fill<CudaView>(1.0, d_a));
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::Cuda>(0, n), Fill<CudaView>(2.0, d_b));
+    Kokkos::fence();
+    std::cout << "[Process " << process << "] Vector initialization time on device (CUDA): " <<
+      timer.seconds() << std::endl;
+
+    timer.reset();
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::Cuda>(0, n), Compute<CudaView>(d_a, d_b));
+    Kokkos::fence();
+    std::cout << "[Process " << process << "] Vector addition time on device (CUDA): " <<
+      timer.seconds() << std::endl;
+
+    timer.reset();
+    Kokkos::deep_copy(h_a, d_a);
+    std::cout << "[Process " << process << "] Time for device -> host data movement: " <<
+      timer.seconds() << std::endl;
+  }
+  else {
+    // Vector addition using OpenMP
+    HostView h_b("Host B", n);
+
+    Kokkos::Timer timer;
+
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::OpenMP>(0, n), Fill<HostView>(1.0, h_a));
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::OpenMP>(0, n), Fill<HostView>(2.0, h_b));
+    Kokkos::fence();
+    std::cout << "[Process " << process << "] Vector initialization time on host (OpenMP): " <<
+      timer.seconds() << std::endl;
+
+    timer.reset();
+    Kokkos::parallel_for (Kokkos::RangePolicy<Kokkos::OpenMP>(0, n), Compute<HostView>(h_a, h_b));
+    Kokkos::fence();
+    std::cout << "[Process " << process << "] Time on host (OpenMP): " <<
+      timer.seconds() << std::endl;
+  }
+  if (n == 0) return; // Empty vector: there is no last element, and h_a(n-1) would index out of range
+  // Validate last element of the vector. It is kept as a double: an integer copy would truncate it.
+  const double last_elem = h_a(n-1);
+  if (last_elem > CORRECT_VALUE - 0.000001 && last_elem < CORRECT_VALUE + 0.000001) {
+    std::cout << "[Process " << process << "] Last element validated" << std::endl;
+  }
+  else {
+    std::cout << "[Process " << process << "] Last element NOT validated: it is " <<
+      last_elem << ", but should be " << CORRECT_VALUE << std::endl;
+  }
+}