Added examples for parallel execution of matrix multiplication kernels
using (1) GPU Manager and (2) CUDA streams.

author    Lukasz Wesolowski <wesolwsk@talent.cs.uiuc.edu>
          Tue, 23 Feb 2010 22:17:07 +0000 (16:17 -0600)
committer Lukasz Wesolowski <wesolwsk@talent.cs.uiuc.edu>
          Tue, 23 Feb 2010 22:17:07 +0000 (16:17 -0600)
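
Both examples multiply randomly initialized square matrices across a 1D
chare array. As parsed in Main::Main, the command line arguments are the
number of chares followed by the matrix size (a multiple of BLOCK_SIZE),
so a typical invocation would be, for example:

    ./charmrun +p2 ./overlapTest 8 64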

12 files changed:
examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile [new file with mode: 0644]
examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C [new file with mode: 0644]
examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.ci [new file with mode: 0644]
examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu [new file with mode: 0644]
examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.h [new file with mode: 0644]
examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/Makefile [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/overlapTest.C [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/overlapTest.ci [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/overlapTest.cu [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/overlapTest.h [new file with mode: 0644]
examples/charm++/cuda/overlapTestStream/overlapTestConsts.h [new file with mode: 0644]

diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile
new file mode 100644 (file)
index 0000000..23c28b9
--- /dev/null
@@ -0,0 +1,30 @@
+CHARMC=../../../../../net-linux-amd64-cuda/bin/charmc $(OPTS)
+
+OBJS = overlapTest.o overlapTestCU.o
+
+NVCC = /usr/local/cuda/bin/nvcc 
+NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
+NVCC_INC = -I/usr/local/cuda/include -I/usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc -I../../../../../src/arch/cuda/hybridAPI
+
+#LD_LIBS += -lcuda -lcudart -lGL -lGLU -L../../../../NVIDIA_CUDA_SDK/lib -lcutil
+
+LD_LIBS += -L/usr/local/NVIDIA_GPU_Computing_SDK/C/lib -lcutil_x86_64
+
+export LD_RUN_PATH = /usr/local/cuda/lib64
+
+all: overlapTest
+
+overlapTest: $(OBJS)
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+
+overlapTest.decl.h: overlapTest.ci
+       $(CHARMC)  overlapTest.ci
+
+clean:
+       rm -f *.decl.h *.def.h conv-host *.o overlapTest charmrun
+
+overlapTest.o: overlapTest.C overlapTest.decl.h
+       $(CHARMC) -O3 -c overlapTest.C
+
+overlapTestCU.o: overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C
new file mode 100644 (file)
index 0000000..c94c46c
--- /dev/null
@@ -0,0 +1,125 @@
+#include "overlapTest.decl.h"
+#include "overlapTest.h"
+
+// #define DEBUG
+
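+// GPU routines implemented in overlapTest.cu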
+extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C, int myIndex, void *cb); 
+extern void hostMemorySetup(int matrixSize, ElementType **h_A, ElementType **h_B, ElementType **h_C, void *cb); 
+extern void hostMemoryCleanup(ElementType *h_A, ElementType *h_B, ElementType *h_C);
+
+CProxy_Main mainProxy; 
+int matrixSize;
+
+Main::Main(CkArgMsg *m) {
+  mainProxy = thisProxy;
+
+  // defaults when no command line arguments are given; matrixSize must
+  // be a multiple of BLOCK_SIZE
+  numChares = 4;
+  matrixSize = BLOCK_SIZE;
+
+  if (m->argc >= 2) {
+    numChares = atoi(m->argv[1]);
+  }
+  if (m->argc == 3) {
+    matrixSize = atoi(m->argv[2]);
+  }
+  delete m;
+
+  workers = CProxy_Workers::ckNew(numChares); 
+
+  startTime = CmiWallTimer(); 
+    
+  workers.setupBuffers(); 
+}
+
+void Main::finishWork(CkReductionMsg *m) {
+  delete m;
+  CkPrintf("Elapsed time: %f s\n", CmiWallTimer() - startTime);  
+  CkExit(); 
+}
+
+Workers::Workers() {
+  int size = matrixSize * matrixSize; 
+  A = new ElementType[size];
+  B = new ElementType[size];
+  C = new ElementType[size]; 
+  
+  randomInit(A, size); 
+  randomInit(B, size); 
+}
+
+Workers::~Workers() {
+  delete [] A; 
+  delete [] B; 
+  delete [] C; 
+  hostMemoryCleanup(h_A, h_B, h_C);
+}
+
+Workers::Workers(CkMigrateMessage *msg) { } 
+
+void Workers::setupBuffers() {
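+  // Request pinned host buffers from the GPU manager; the callback
+  // delivers control to beginWork() once the allocation has completed.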
+  CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex);
+  CkCallback *cb = new CkCallback(CkIndex_Workers::beginWork(), myIndex, thisArrayID);
+  hostMemorySetup(matrixSize, &h_A, &h_B, &h_C, (void *) cb); 
+}
+
+void Workers::beginWork() {
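+  // Copy the inputs into the pinned buffers and hand the multiplication
+  // to the GPU manager; complete() is invoked via the callback when the
+  // kernel and transfers have finished.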
+  CkCallback *cb;
+  CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex);
+  cb = new CkCallback(CkIndex_Workers::complete(), myIndex, thisArrayID); 
+  int size = matrixSize * matrixSize * sizeof(ElementType);
+  memcpy(h_A, A, size);
+  memcpy(h_B, B, size); 
+  cudaMatMul(matrixSize, h_A, h_B, h_C, thisIndex, (void *) cb);
+}
+
+void Workers::complete() {
+  int size = matrixSize * matrixSize * sizeof(ElementType); 
+  memcpy(C, h_C, size); 
+
+#ifdef DEBUG
+  CkPrintf("[%d] A\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", A[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] B\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", B[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] C\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", C[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] C-gold\n", thisIndex);
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      C[i*matrixSize + j] = 0; 
+      for (int k=0; k<matrixSize; k++) {
+        C[i*matrixSize + j] += A[i*matrixSize + k] * B[k*matrixSize + j];
+      }
+      CkPrintf("%.2f ", C[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+
+#endif
+
+  contribute(CkCallback(CkIndex_Main::finishWork(NULL), mainProxy));
+}
+
+void randomInit(ElementType* data, int size) {
+  for (int i = 0; i < size; ++i) {
+    data[i] = rand() / (ElementType)RAND_MAX;
+  }
+}
+
+#include "overlapTest.def.h"
diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.ci b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.ci
new file mode 100644 (file)
index 0000000..206dc7e
--- /dev/null
@@ -0,0 +1,18 @@
+mainmodule overlapTest {
+  readonly int matrixSize; 
+  readonly CProxy_Main mainProxy; 
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry void finishWork(CkReductionMsg *m); 
+  };
+
+  array [1D] Workers {
+    entry Workers(); 
+    entry void setupBuffers();
+    entry void beginWork(); 
+    entry void complete();
+  };
+
+};
+
diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu
new file mode 100644 (file)
index 0000000..ef83ac3
--- /dev/null
@@ -0,0 +1,216 @@
+#include "overlapTestConsts.h"
+#include "wr.h"
+#include <stdio.h>
+
+#if CHECK_BANK_CONFLICTS
+#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
+#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
+#else
+#define AS(i, j) As[i][j]
+#define BS(i, j) Bs[i][j]
+#endif
+
+// matrix multiplication code taken from the CUDA SDK
+
+__global__ void
+matrixMul(float* C, float* A, float* B, int wA, int wB)
+{
+    // Block index
+    int bx = blockIdx.x;
+    int by = blockIdx.y;
+
+    // Thread index
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+
+    // Index of the first sub-matrix of A processed by the block
+    int aBegin = wA * BLOCK_SIZE * by;
+
+    // Index of the last sub-matrix of A processed by the block
+    int aEnd   = aBegin + wA - 1;
+
+    // Step size used to iterate through the sub-matrices of A
+    int aStep  = BLOCK_SIZE;
+
+    // Index of the first sub-matrix of B processed by the block
+    int bBegin = BLOCK_SIZE * bx;
+
+    // Step size used to iterate through the sub-matrices of B
+    int bStep  = BLOCK_SIZE * wB;
+
+    // Csub is used to store the element of the block sub-matrix
+    // that is computed by the thread
+    float Csub = 0;
+
+    // Loop over all the sub-matrices of A and B
+    // required to compute the block sub-matrix
+    for (int a = aBegin, b = bBegin;
+             a <= aEnd;
+             a += aStep, b += bStep) {
+
+        // Declaration of the shared memory array As used to
+        // store the sub-matrix of A
+        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+
+        // Declaration of the shared memory array Bs used to
+        // store the sub-matrix of B
+        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+
+        // Load the matrices from device memory
+        // to shared memory; each thread loads
+        // one element of each matrix
+        AS(ty, tx) = A[a + wA * ty + tx];
+        BS(ty, tx) = B[b + wB * ty + tx];
+
+        // Synchronize to make sure the matrices are loaded
+        __syncthreads();
+
+        // Multiply the two matrices together;
+        // each thread computes one element
+        // of the block sub-matrix
+        for (int k = 0; k < BLOCK_SIZE; ++k)
+            Csub += AS(ty, k) * BS(k, tx);
+
+        // Synchronize to make sure that the preceding
+        // computation is done before loading two new
+        // sub-matrices of A and B in the next iteration
+        __syncthreads();
+    }
+
+    // Write the block sub-matrix to device memory;
+    // each thread writes one element
+    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    C[c + wB * ty + tx] = Csub;
+}
+
+void hostMemorySetup(int matrixSize, ElementType **h_A_ptr, 
+                    ElementType **h_B_ptr, ElementType **h_C_ptr, void *cb) {
+  pinnedMemReq reqs; 
+
+  int nBuffers = 3; 
+  int size = matrixSize * matrixSize * sizeof(ElementType); 
+
+  size_t *sizes = (size_t *) malloc(nBuffers * sizeof(size_t));
+  void ***hostPtrs = (void ***) malloc(nBuffers * sizeof(void **)); 
+  hostPtrs[0] = (void **) h_A_ptr; 
+  hostPtrs[1] = (void **) h_B_ptr; 
+  hostPtrs[2] = (void **) h_C_ptr; 
+  sizes[0] = size; 
+  sizes[1] = size;
+  sizes[2] = size; 
+
+  reqs.nBuffers = nBuffers; 
+  reqs.sizes = sizes; 
+  reqs.hostPtrs = hostPtrs; 
+  reqs.callbackFn = cb; 
+
+  pinnedMallocHost(&reqs); 
+}
+
+void hostMemoryCleanup(ElementType *h_A, ElementType *h_B, ElementType *h_C) {
+  cudaFreeHost(h_A); 
+  cudaFreeHost(h_B); 
+  cudaFreeHost(h_C);
+}
+
+void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B, 
+               ElementType *h_C, int myIndex, void *cb) {
+  int size = matrixSize * matrixSize * sizeof(ElementType);
+  dataInfo *AInfo, *BInfo, *CInfo; 
+
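+  // Describe the kernel launch (grid and block dimensions, which buffers
+  // move in which direction) as a work request; the GPU manager performs
+  // the transfers and launch asynchronously and fires cb when done.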
+  workRequest matmul; 
+  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
+  matmul.dimGrid = dim3(matrixSize / threads.x, matrixSize / threads.y);
+  matmul.dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE); 
+  matmul.smemSize = 0; 
+  matmul.nBuffers = 3; 
+  matmul.bufferInfo = (dataInfo *) malloc(matmul.nBuffers * sizeof(dataInfo));
+
+  AInfo = &(matmul.bufferInfo[0]); 
+  AInfo->bufferID = BUFFERS_PER_CHARE * myIndex + A_INDEX;
+  AInfo->transferToDevice = YES; 
+  AInfo->transferFromDevice = NO;
+  AInfo->freeBuffer = YES;
+  AInfo->hostBuffer = h_A; 
+  AInfo->size = size; 
+
+  BInfo = &(matmul.bufferInfo[1]); 
+  BInfo->bufferID = BUFFERS_PER_CHARE * myIndex + B_INDEX;
+  BInfo->transferToDevice = YES; 
+  BInfo->transferFromDevice = NO;
+  BInfo->freeBuffer = YES;
+  BInfo->hostBuffer = h_B; 
+  BInfo->size = size; 
+
+  CInfo = &(matmul.bufferInfo[2]); 
+  CInfo->bufferID = BUFFERS_PER_CHARE * myIndex + C_INDEX;
+  CInfo->transferToDevice = NO; 
+  CInfo->transferFromDevice = YES;
+  CInfo->freeBuffer = YES;
+  CInfo->hostBuffer = h_C; 
+  CInfo->size = size; 
+
+  matmul.callbackFn = cb;
+  matmul.id = MATMUL_KERNEL;
+
+  matmul.userData = malloc(sizeof(int)); 
+  memcpy(matmul.userData, &matrixSize, sizeof(int)); 
+
+  enqueue(wrQueue, &matmul); 
+}
+
+void kernelSelect(workRequest *wr) {
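+  // Invoked by the GPU manager to launch an enqueued work request once
+  // its buffers are resident on the device; dispatch on the id that
+  // cudaMatMul() assigned.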
+
+  switch (wr->id) {
+  case MATMUL_KERNEL: 
+    matrixMul<<< wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream >>>
+      ((ElementType *) devBuffers[wr->bufferInfo[C_INDEX].bufferID], 
+       (ElementType *) devBuffers[wr->bufferInfo[A_INDEX].bufferID], 
+       (ElementType *) devBuffers[wr->bufferInfo[B_INDEX].bufferID],
+       *((int *) wr->userData), *((int *) wr->userData)); 
+    break;    
+  }
+}
diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.h b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.h
new file mode 100644 (file)
index 0000000..a4576a0
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __OVERLAP_TEST_H
+#define __OVERLAP_TEST_H
+
+#include "overlapTestConsts.h"
+
+class Main : public CBase_Main {
+ private:
+  CProxy_Workers workers; 
+  int numChares; 
+  double startTime; 
+  
+ public:
+  Main(CkArgMsg *m);
+  void finishWork(CkReductionMsg *m); 
+};
+
+
+class Workers: public CBase_Workers {
+ private:
+  ElementType *A;
+  ElementType *B; 
+  ElementType *C; 
+  
+  ElementType *h_A; 
+  ElementType *h_B; 
+  ElementType *h_C; 
+  
+ public:
+  Workers(); 
+  ~Workers();
+  Workers(CkMigrateMessage *msg);
+  void setupBuffers(); 
+  void beginWork(); 
+  void complete();
+};
+
+void randomInit(ElementType *data, int size);
+
+#endif
diff --git a/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h b/examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h
new file mode 100644 (file)
index 0000000..b304a88
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef __OVERLAP_TEST_CONSTS_H
+#define __OVERLAP_TEST_CONSTS_H
+
+typedef float ElementType; 
+#define BLOCK_SIZE 8
+#define CHECK_BANK_CONFLICTS 0
+#define MATMUL_KERNEL 1000
+
+#define BUFFERS_PER_CHARE 3
+
+#define A_INDEX 0
+#define B_INDEX 1
+#define C_INDEX 2
+
+#define NO 0
+#define YES 1 
+
+#endif
diff --git a/examples/charm++/cuda/overlapTestStream/Makefile b/examples/charm++/cuda/overlapTestStream/Makefile
new file mode 100644 (file)
index 0000000..235a701
--- /dev/null
@@ -0,0 +1,28 @@
+CHARMC=../../../../net-linux-amd64/bin/charmc $(OPTS)
+
+OBJS = overlapTest.o overlapTestCU.o
+
+NVCC = /usr/local/cuda/bin/nvcc 
+NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
+NVCC_INC = -I/usr/local/cuda/include -I../../../../NVIDIA_CUDA_SDK/common/inc -I../../charm/src/arch/cuda/hybridAPI/
+
+LD_LIBS += -lcuda -lcudart -lGL -lGLU -L../../../../NVIDIA_CUDA_SDK/lib
+
+#export LD_RUN_PATH = /usr/local/cuda/lib
+
+all: overlapTest
+
+overlapTest: $(OBJS)
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+
+overlapTest.decl.h: overlapTest.ci
+       $(CHARMC)  overlapTest.ci
+
+clean:
+       rm -f *.decl.h *.def.h conv-host *.o overlapTest charmrun
+
+overlapTest.o: overlapTest.C overlapTest.decl.h
+       $(CHARMC) -O3 -c overlapTest.C
+
+overlapTestCU.o: overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
diff --git a/examples/charm++/cuda/overlapTestStream/overlapTest.C b/examples/charm++/cuda/overlapTestStream/overlapTest.C
new file mode 100644 (file)
index 0000000..b68368c
--- /dev/null
@@ -0,0 +1,103 @@
+#include "overlapTest.decl.h"
+#include "overlapTest.h"
+
+// #define DEBUG
+
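+// GPU routine implemented in overlapTest.cu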
+extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C); 
+CProxy_Main mainProxy; 
+int matrixSize;
+
+Main::Main(CkArgMsg *m) {
+  mainProxy = thisProxy;
+
+  // defaults when no command line arguments are given; matrixSize must
+  // be a multiple of BLOCK_SIZE
+  numChares = 4;
+  matrixSize = BLOCK_SIZE;
+
+  if (m->argc >= 2) {
+    numChares = atoi(m->argv[1]);
+  }
+  if (m->argc == 3) {
+    matrixSize = atoi(m->argv[2]);
+  }
+  delete m;
+
+  workers = CProxy_Workers::ckNew(numChares); 
+
+  startTime = CmiWallTimer(); 
+    
+  workers.beginWork(); 
+}
+
+void Main::finishWork(CkReductionMsg *m) {
+  delete m;
+  CkPrintf("Elapsed time: %f s\n", CmiWallTimer() - startTime);  
+  CkExit(); 
+}
+
+Workers::Workers() {
+  int size = matrixSize * matrixSize; 
+  A = new ElementType[size];
+  B = new ElementType[size];
+  C = new ElementType[size]; 
+  
+  randomInit(A, size); 
+  randomInit(B, size); 
+}
+
+Workers::~Workers() {
+  delete [] A; 
+  delete [] B; 
+  delete [] C; 
+}
+
+Workers::Workers(CkMigrateMessage *msg) { } 
+
+void Workers::beginWork() {
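+  // cudaMatMul() is synchronous: it returns only after the result matrix
+  // has been copied back into C.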
+  cudaMatMul(matrixSize, A, B, C);  
+#ifdef DEBUG
+  CkPrintf("[%d] A\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", A[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] B\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", B[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] C\n", thisIndex); 
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      CkPrintf("%.2f ", C[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+  CkPrintf("[%d] C-gold\n", thisIndex);
+  for (int i=0; i<matrixSize; i++) {
+    CkPrintf("[%d] ", thisIndex);
+    for (int j=0; j<matrixSize; j++) {
+      C[i*matrixSize + j] = 0; 
+      for (int k=0; k<matrixSize; k++) {
+        C[i*matrixSize + j] += A[i*matrixSize + k] * B[k*matrixSize + j];
+      }
+      CkPrintf("%.2f ", C[i*matrixSize+j]); 
+    }
+    CkPrintf("\n");
+  }
+
+#endif
+
+  contribute(CkCallback(CkIndex_Main::finishWork(NULL), mainProxy));
+}
+
+void randomInit(ElementType* data, int size) {
+  for (int i = 0; i < size; ++i) {
+    data[i] = rand() / (ElementType)RAND_MAX;
+  }
+}
+
+#include "overlapTest.def.h"
diff --git a/examples/charm++/cuda/overlapTestStream/overlapTest.ci b/examples/charm++/cuda/overlapTestStream/overlapTest.ci
new file mode 100644 (file)
index 0000000..c625e7c
--- /dev/null
@@ -0,0 +1,16 @@
+mainmodule overlapTest {
+  readonly int matrixSize; 
+  readonly CProxy_Main mainProxy; 
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry void finishWork(CkReductionMsg *m); 
+  };
+
+  array [1D] Workers {
+    entry Workers(); 
+    entry void beginWork(); 
+  };
+
+};
+
diff --git a/examples/charm++/cuda/overlapTestStream/overlapTest.cu b/examples/charm++/cuda/overlapTestStream/overlapTest.cu
new file mode 100644 (file)
index 0000000..58ad273
--- /dev/null
@@ -0,0 +1,126 @@
+#include "overlapTestConsts.h"
+
+#if CHECK_BANK_CONFLICTS
+#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
+#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
+#else
+#define AS(i, j) As[i][j]
+#define BS(i, j) Bs[i][j]
+#endif
+
+// matrix multiplication code taken from the CUDA SDK
+
+__global__ void
+matrixMul(float* C, float* A, float* B, int wA, int wB)
+{
+    // Block index
+    int bx = blockIdx.x;
+    int by = blockIdx.y;
+
+    // Thread index
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+
+    // Index of the first sub-matrix of A processed by the block
+    int aBegin = wA * BLOCK_SIZE * by;
+
+    // Index of the last sub-matrix of A processed by the block
+    int aEnd   = aBegin + wA - 1;
+
+    // Step size used to iterate through the sub-matrices of A
+    int aStep  = BLOCK_SIZE;
+
+    // Index of the first sub-matrix of B processed by the block
+    int bBegin = BLOCK_SIZE * bx;
+
+    // Step size used to iterate through the sub-matrices of B
+    int bStep  = BLOCK_SIZE * wB;
+
+    // Csub is used to store the element of the block sub-matrix
+    // that is computed by the thread
+    float Csub = 0;
+
+    // Loop over all the sub-matrices of A and B
+    // required to compute the block sub-matrix
+    for (int a = aBegin, b = bBegin;
+             a <= aEnd;
+             a += aStep, b += bStep) {
+
+        // Declaration of the shared memory array As used to
+        // store the sub-matrix of A
+        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+
+        // Declaration of the shared memory array Bs used to
+        // store the sub-matrix of B
+        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+
+        // Load the matrices from device memory
+        // to shared memory; each thread loads
+        // one element of each matrix
+        AS(ty, tx) = A[a + wA * ty + tx];
+        BS(ty, tx) = B[b + wB * ty + tx];
+
+        // Synchronize to make sure the matrices are loaded
+        __syncthreads();
+
+        // Multiply the two matrices together;
+        // each thread computes one element
+        // of the block sub-matrix
+        for (int k = 0; k < BLOCK_SIZE; ++k)
+            Csub += AS(ty, k) * BS(k, tx);
+
+        // Synchronize to make sure that the preceding
+        // computation is done before loading two new
+        // sub-matrices of A and B in the next iteration
+        __syncthreads();
+    }
+
+    // Write the block sub-matrix to device memory;
+    // each thread writes one element
+    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    C[c + wB * ty + tx] = Csub;
+}
+
+void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C) {
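+  // Stage the inputs in pinned host memory and run the multiplication in
+  // a dedicated stream; the call blocks in cudaStreamSynchronize() until
+  // the result is back on the host.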
+  cudaStream_t stream; 
+  cudaStreamCreate(&stream); 
+  ElementType *h_A, *h_B, *h_C; 
+  ElementType *d_A, *d_B, *d_C;
+  int size = matrixSize * matrixSize * sizeof(ElementType);
+
+  cudaMallocHost((void **) &h_A, size); 
+  cudaMallocHost((void **) &h_B, size); 
+  cudaMallocHost((void **) &h_C, size);  
+
+  cudaMalloc((void **) &d_A, size);
+  cudaMalloc((void **) &d_B, size);
+  cudaMalloc((void **) &d_C, size);
+
+  memcpy(h_A, A, size);
+  memcpy(h_B, B, size); 
+
+  cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream); 
+  cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream); 
+
+  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
+  dim3 grid(matrixSize / threads.x, matrixSize / threads.y);
+  
+  // execute the kernel
+  matrixMul<<< grid, threads, 0, stream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
+
+  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream); 
+
+  cudaStreamSynchronize(stream); 
+
+  memcpy(C, h_C, size);
+
+  cudaFreeHost(h_A);
+  cudaFreeHost(h_B);
+  cudaFreeHost(h_C);
+  
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+
+  cudaStreamDestroy(stream); 
+}
diff --git a/examples/charm++/cuda/overlapTestStream/overlapTest.h b/examples/charm++/cuda/overlapTestStream/overlapTest.h
new file mode 100644 (file)
index 0000000..be2b8c7
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef __OVERLAP_TEST_H
+#define __OVERLAP_TEST_H
+
+#include "overlapTestConsts.h"
+
+class Main : public CBase_Main {
+ private:
+  CProxy_Workers workers; 
+  int numChares; 
+  double startTime; 
+  
+ public:
+  Main(CkArgMsg *m);
+  void finishWork(CkReductionMsg *m); 
+};
+
+
+class Workers: public CBase_Workers {
+ private:
+  ElementType *A;
+  ElementType *B; 
+  ElementType *C; 
+  
+ public:
+  Workers(); 
+  ~Workers();
+  Workers(CkMigrateMessage *msg);
+  void beginWork(); 
+};
+
+void randomInit(ElementType *data, int size);
+
+#endif
diff --git a/examples/charm++/cuda/overlapTestStream/overlapTestConsts.h b/examples/charm++/cuda/overlapTestStream/overlapTestConsts.h
new file mode 100644 (file)
index 0000000..375dea4
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef __OVERLAP_TEST_CONSTS_H
+#define __OVERLAP_TEST_CONSTS_H
+
+typedef float ElementType; 
+#define BLOCK_SIZE 16
+#define CHECK_BANK_CONFLICTS 0
+
+#endif