Cleaning up charm++ CUDA examples.
author: Lukasz Wesolowski <wesolwsk@illinois.edu>
Fri, 6 Jan 2012 04:29:12 +0000 (22:29 -0600)
committer: Lukasz Wesolowski <wesolwsk@illinois.edu>
Fri, 6 Jan 2012 04:31:22 +0000 (22:31 -0600)
examples/charm++/cuda/overlapTestGPUManager/Makefile [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile with 65% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.C [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C with 92% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.ci [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.ci with 100% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.cu [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu with 77% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.h [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.h with 100% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTestConsts.h [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h with 89% similarity]
examples/charm++/cuda/overlapTestStream/Makefile
examples/charm++/cuda/overlapTestStream/overlapTest.C
examples/charm++/cuda/overlapTestStream/overlapTest.cu
examples/charm++/cuda/overlapTestStream/overlapTestConsts.h

similarity index 65%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile
rename to examples/charm++/cuda/overlapTestGPUManager/Makefile
index fa397a1a6d6ba19177108020311aacaa7a1f2c37..50f0e2c1b897286c3a04ad242f04064763f0f4b4 100644 (file)
@@ -1,17 +1,16 @@
-CHARMC=../../../../../net-linux-amd64-cuda/bin/charmc $(OPTS)
+CHARMC=../../../../bin/charmc $(OPTS)
 
 OBJS = overlapTest.o overlapTestCU.o
 
 NVCC = /usr/local/cuda/bin/nvcc 
 NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
 NVCC_INC = -I/usr/local/cuda/include  -I../../../../../src/arch/cuda/hybridAPI
-
-export LD_RUN_PATH = /usr/local/cuda/lib64
+CHARMINC = -I../../../../include
 
 all: overlapTest
 
 overlapTest: $(OBJS)
-       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS)
 
 overlapTest.decl.h: overlapTest.ci
        $(CHARMC)  overlapTest.ci
@@ -23,4 +22,7 @@ overlapTest.o: overlapTest.C overlapTest.decl.h
        $(CHARMC) -O3 -c overlapTest.C
 
 overlapTestCU.o: overlapTest.cu
-       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) $(CHARMINC) -o overlapTestCU.o overlapTest.cu
+
+test: all
+       ./charmrun overlapTest +p2 2 8
\ No newline at end of file
similarity index 92%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTest.C
index 68830a1b3b19cb201cb7e022dbf8802dcbcebe43..7bdbcd57922ea4a457226833982a8bf6562eb942 100644 (file)
@@ -1,7 +1,7 @@
 #include "overlapTest.decl.h"
 #include "overlapTest.h"
 
-// #define DEBUG
+#define DEBUG
 
 extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C, int myIndex, void *cb); 
 extern void hostMemorySetup(int matrixSize, ElementType **h_A, ElementType **h_B, ElementType **h_C, void *cb); 
@@ -72,16 +72,6 @@ void Workers::beginWork() {
 void Workers::complete() {
   int size = matrixSize * matrixSize * sizeof(ElementType); 
   memcpy(C, h_C, size); 
-
-  for (int i=0; i<matrixSize; i++) {
-    for (int j=0; j<matrixSize; j++) {
-      C[i*matrixSize + j] = 0; 
-      for (int k=0; k<matrixSize; k++) {
-       C[i*matrixSize + j] += A[i*matrixSize +k] * B[k * matrixSize + j];
-      }
-    }
-  }
-
 #ifdef DEBUG
   CkPrintf("[%d] A\n", thisIndex); 
   for (int i=0; i<matrixSize; i++) {
similarity index 77%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTest.cu
index 4a00f57559aaf7db19f3812dc90409a6f700cec6..746369cd6d96515fc02476e10163e65d3b27a163 100644 (file)
@@ -1,16 +1,9 @@
 #include "overlapTestConsts.h"
 #include "wr.h"
 #include <stdio.h>
+#include <math.h>
 
-#if CHECK_BANK_CONFLICTS
-#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
-#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
-#else
-#define AS(i, j) As[i][j]
-#define BS(i, j) Bs[i][j]
-#endif
-
-// matrix multiplication code taken from the CUDA SDK
+// matrix multiplication code adapted from the CUDA SDK
 
 __global__ void
 matrixMul(float* C, float* A, float* B, int wA, int wB)
@@ -59,8 +52,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // Load the matrices from device memory
         // to shared memory; each thread loads
         // one element of each matrix
-        AS(ty, tx) = A[a + wA * ty + tx];
-        BS(ty, tx) = B[b + wB * ty + tx];
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];
 
         // Synchronize to make sure the matrices are loaded
         __syncthreads();
@@ -69,7 +62,7 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // each thread computes one element
         // of the block sub-matrix
         for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
+            Csub += As[ty][k] * Bs[k][tx];
 
         // Synchronize to make sure that the preceding
         // computation is done before loading two new
@@ -113,11 +106,6 @@ void hostMemoryCleanup(ElementType *h_A, ElementType *h_B, ElementType *h_C) {
   delayedFree(h_B); 
   delayedFree(h_C);
 
-  /*
-  cudaFreeHost(h_A); 
-  cudaFreeHost(h_B); 
-  cudaFreeHost(h_C); 
-  */
 }
 
 void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B, 
@@ -127,7 +115,8 @@ void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B,
 
   workRequest matmul; 
   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
-  matmul.dimGrid = dim3(matrixSize / threads.x, matrixSize / threads.y);
+  matmul.dimGrid = dim3( ceil((float)matrixSize / threads.x), 
+                        ceil((float)matrixSize / threads.y) );
   matmul.dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE); 
   matmul.smemSize = 0; 
   matmul.nBuffers = 3; 
@@ -164,43 +153,6 @@ void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B,
   memcpy(matmul.userData, &matrixSize, sizeof(int)); 
 
   enqueue(wrQueue, &matmul); 
-
-  /*
-  cudaStream_t stream; 
-  cudaStreamCreate(&stream); 
-  ElementType *h_A, *h_B, *h_C; 
-  ElementType *d_A, *d_B, *d_C;
-
-
-  cudaMalloc((void **) &d_A, size);
-  cudaMalloc((void **) &d_B, size);
-  cudaMalloc((void **) &d_C, size);
-
-  cudaMemcpyAsync(d_A, A, size, cudaMemcpyHostToDevice, stream); 
-  cudaMemcpyAsync(d_B, B, size, cudaMemcpyHostToDevice, stream); 
-
-  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
-  dim3 grid(matrixSize / threads.x, matrixSize / threads.y);
-  
-  // execute the kernel
-  matrixMul<<< grid, threads, 0, stream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
-
-  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream); 
-
-  memcpy(C, h_C, size);
-
-  cudaStreamSynchronize(stream); 
-
-  cudaFreeHost(h_A);
-  cudaFreeHost(h_B);
-  cudaFreeHost(h_C);
-  
-  cudaFree(d_A);
-  cudaFree(d_B);
-  cudaFree(d_C);
-
-  cudaStreamDestroy(stream); 
-  */
 }
 
 void kernelSelect(workRequest *wr) {
similarity index 89%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTestConsts.h
index b304a88e265b43e3a644d2200de63ccb74d104c6..4e32906b305d024aef21f82c745252f813e1fe09 100644 (file)
@@ -3,7 +3,6 @@
 
 typedef float ElementType; 
 #define BLOCK_SIZE 8
-#define CHECK_BANK_CONFLICTS 0
 #define MATMUL_KERNEL 1000
 
 #define BUFFERS_PER_CHARE 3
index 235a7011b740010b3d80c4b8f7bcde87cb32340f..451dee09668d223b8b0a5efb71f6c2e56fb5cbf0 100644 (file)
@@ -1,19 +1,16 @@
-CHARMC=../../../../net-linux-amd64/bin/charmc $(OPTS)
+CHARMC=../../../../bin/charmc $(OPTS)
 
 OBJS = overlapTest.o overlapTestCU.o
 
 NVCC = /usr/local/cuda/bin/nvcc 
 NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
-NVCC_INC = -I/usr/local/cuda/include -I../../../../NVIDIA_CUDA_SDK/common/inc -I../../charm/src/arch/cuda/hybridAPI/
-
-LD_LIBS += -lcuda -lcudart -lGL -lGLU -L../../../../NVIDIA_CUDA_SDK/lib
-
-#export LD_RUN_PATH = /usr/local/cuda/lib
-
+NVCC_INC = -I/usr/local/cuda/include 
+CHARMINC = -I../../../../include
+NVCC_LIBS = -L/usr/local/cuda/lib64 -lcuda -lcudart
 all: overlapTest
 
 overlapTest: $(OBJS)
-       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(NVCC_LIBS) 
 
 overlapTest.decl.h: overlapTest.ci
        $(CHARMC)  overlapTest.ci
@@ -25,4 +22,7 @@ overlapTest.o: overlapTest.C overlapTest.decl.h
        $(CHARMC) -O3 -c overlapTest.C
 
 overlapTestCU.o: overlapTest.cu
-       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) $(CHARMINC) -o overlapTestCU.o overlapTest.cu
+
+test: all
+       ./charmrun overlapTest +p2 2 8 
\ No newline at end of file
index b68368c1f3e0165cece41ff1c30c32336a51d069..a8a4d8ea058c193196a7fdb69f62e31165552dbf 100644 (file)
@@ -1,7 +1,7 @@
 #include "overlapTest.decl.h"
 #include "overlapTest.h"
 
-// #define DEBUG
+#define DEBUG
 
 extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C); 
 CProxy_Main mainProxy; 
index 58ad273b9ba37bca70b7b4502f5bd12c0220b629..dfd5ec308947d71c5c58db6a622545ad31573c1a 100644 (file)
@@ -1,13 +1,5 @@
 #include "overlapTestConsts.h"
 
-#if CHECK_BANK_CONFLICTS
-#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
-#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
-#else
-#define AS(i, j) As[i][j]
-#define BS(i, j) Bs[i][j]
-#endif
-
 // matrix multiplication code taken from the CUDA SDK
 
 __global__ void
@@ -57,8 +49,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // Load the matrices from device memory
         // to shared memory; each thread loads
         // one element of each matrix
-        AS(ty, tx) = A[a + wA * ty + tx];
-        BS(ty, tx) = B[b + wB * ty + tx];
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];
 
         // Synchronize to make sure the matrices are loaded
         __syncthreads();
@@ -67,7 +59,7 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // each thread computes one element
         // of the block sub-matrix
         for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
+            Csub += As[ty][k] * Bs[k][tx];
 
         // Synchronize to make sure that the preceding
         // computation is done before loading two new
@@ -82,8 +74,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
 }
 
 void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C) {
-  cudaStream_t stream; 
-  cudaStreamCreate(&stream); 
+  cudaStream_t matMulStream; 
+  cudaStreamCreate(&matMulStream); 
   ElementType *h_A, *h_B, *h_C; 
   ElementType *d_A, *d_B, *d_C;
   int size = matrixSize * matrixSize * sizeof(ElementType);
@@ -99,18 +91,18 @@ void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C)
   memcpy(h_A, A, size);
   memcpy(h_B, B, size); 
 
-  cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream); 
-  cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream); 
+  cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, matMulStream); 
+  cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, matMulStream); 
 
   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
   dim3 grid(matrixSize / threads.x, matrixSize / threads.y);
   
   // execute the kernel
-  matrixMul<<< grid, threads, 0, stream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
+  matrixMul<<< grid, threads, 0, matMulStream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
 
-  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream); 
+  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, matMulStream); 
 
-  cudaStreamSynchronize(stream); 
+  cudaStreamSynchronize(matMulStream); 
 
   memcpy(C, h_C, size);
 
@@ -122,5 +114,5 @@ void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C)
   cudaFree(d_B);
   cudaFree(d_C);
 
-  cudaStreamDestroy(stream); 
+  cudaStreamDestroy(matMulStream); 
 }
index 375dea4dfe3ce37a1d30c392731d78935092d4c5..76d4f2afc8f8100c9c9aea6e1bec782cb2b52ac4 100644 (file)
@@ -2,7 +2,6 @@
 #define __OVERLAP_TEST_CONSTS_H
 
 typedef float ElementType; 
-#define BLOCK_SIZE 16
-#define CHECK_BANK_CONFLICTS 0
+#define BLOCK_SIZE 8
 
 #endif