Renamed NodeHelper library to CkLoop library, and changed examples correspondingly
authorChao Mei <chaomei2@illinois.edu>
Tue, 7 Aug 2012 04:52:16 +0000 (23:52 -0500)
committerChao Mei <chaomei2@illinois.edu>
Tue, 7 Aug 2012 04:52:16 +0000 (23:52 -0500)
20 files changed:
examples/charm++/ckloop/fft-trans/Makefile [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/README.cklooplib [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fft1d.C [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fft1d.ci [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fft_bench.cpp [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fft_ref.cpp [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fftmacro.h [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/fileio.h [new file with mode: 0644]
examples/charm++/ckloop/fft-trans/test [new file with mode: 0755]
examples/charm++/ckloop/fft-trans/test.sh [new file with mode: 0755]
examples/charm++/ckloop/simpleLoopBench/Makefile [new file with mode: 0644]
examples/charm++/ckloop/simpleLoopBench/hello.C [new file with mode: 0644]
examples/charm++/ckloop/simpleLoopBench/hello.ci [new file with mode: 0644]
examples/charm++/ckloop/simpleLoopBench/hello.h [new file with mode: 0644]
src/libs/ck-libs/ckloop/CkLoop.C [new file with mode: 0644]
src/libs/ck-libs/ckloop/CkLoop.ci [new file with mode: 0644]
src/libs/ck-libs/ckloop/CkLoop.h [new file with mode: 0644]
src/libs/ck-libs/ckloop/CkLoopAPI.h [new file with mode: 0644]
src/libs/ck-libs/ckloop/Make.depends [new file with mode: 0644]
src/libs/ck-libs/ckloop/Makefile [new file with mode: 0644]

diff --git a/examples/charm++/ckloop/fft-trans/Makefile b/examples/charm++/ckloop/fft-trans/Makefile
new file mode 100644 (file)
index 0000000..4d3e5dc
--- /dev/null
@@ -0,0 +1,60 @@
+USEROPTS       = -O3
+CHARMDIR=../../../..
+
+CHARMINC = $(CHARMDIR)/include
+CHARMLIB = $(CHARMDIR)/lib
+OPTS = -I$(CHARMINC) $(USEROPTS)
+CHARMC = $(CHARMDIR)/bin/charmc $(OPTS)
+
+CC=mpicxx
+LIBS = -lfftw3 -lm
+SLIBS = -lfftw3f -lm
+
+OBJS = fft1d.o
+
+#all: fft1d fft_ref fft1d.prj fft_bench
+all: fft1d fft1d.prj fft_bench
+
+#for single precision add -DSINGLE_PRECISION to OPTS on the command line
+singlePrecision: fft1d_s
+
+
+fft_bench: fft_bench.o
+       ${CC} fft_bench.o -o fft_bench $(LIBS)
+
+fft_bench.o: fft_bench.cpp
+       ${CC} -c fft_bench.cpp $(INC)
+
+fft1d: $(OBJS)
+       $(CHARMC) -language charm++ -o fft1d $(OBJS) $(LIBS) -module CkLoop
+
+fft1d_s: $(OBJS)
+       $(CHARMC) -language charm++ -o fft1d_s $(OBJS) $(SLIBS)  -module CkLoop
+
+projections: fft1d.prj
+fft1d.prj: $(OBJS)
+       $(CHARMC) -language charm++ -tracemode projections $(LIBS) -lz -o fft1d.prj $(OBJS)  -module CkLoop
+
+summary: $(OBJS)
+       $(CHARMC) -language charm++ -tracemode summary $(LIBS) -o fft1d.sum $(OBJS)
+
+fft1d.decl.h: fft1d.ci
+       $(CHARMC)  fft1d.ci
+
+fft_ref: fft_ref.o
+       ${CC} fft_ref.o -o fft_ref -lfftw3 -lm
+#      ${CC} fft_ref.o -o fft_ref -L/expand/home/arya/fftw-3.3/lib -lfftw3_mpi -lfftw3 -lm
+
+fft_ref.o: fft_ref.cpp
+       ${CC} -c fft_ref.cpp 
+#      ${CC} -c fft_ref.cpp -I/expand/home/arya/fftw-3.3/include
+
+cleanproj:
+       rm -f *.log *.sts *.projrc
+
+clean:
+       rm -f *.decl.h *.def.h conv-host *.o fft1d fft1d.prj fft_bench charmrun fft_ref *~
+
+fft1d.o: fft1d.C fft1d.decl.h
+       $(CHARMC) -c fft1d.C
+
diff --git a/examples/charm++/ckloop/fft-trans/README.cklooplib b/examples/charm++/ckloop/fft-trans/README.cklooplib
new file mode 100644 (file)
index 0000000..322d46a
--- /dev/null
@@ -0,0 +1,21 @@
+Simple test application to try node level parallelization of FFT using
+threads in the CkLoop library compared to a standard charm level
+decomposition.
+
+The main distinction here is node level shared memory awareness.  The
+CkLoop library will exploit that whereas the charm decomposition
+will not.  
+
+We are also interested in getting a sense of at what level of
+granularity it is worth trying to do node level parallelism.  It is
+expected that a lower bound on practically parallelizable grainsize
+will exist as determined by cache locality and function call overhead.
+Below that threshold it won't be worth trying to explicitely exploit
+the available data parallelism.  Hopefully this number is larger than
+a few fft lines with tens to hundreds of points.
+
+For simplicity we do a single FFT for complex to complex using single
+precision as this is typical for NAMD.  A more advanced version would
+do real to complex, but that is more pain than necessary to
+demonstrate utility.
+
diff --git a/examples/charm++/ckloop/fft-trans/fft1d.C b/examples/charm++/ckloop/fft-trans/fft1d.C
new file mode 100644 (file)
index 0000000..a5c7192
--- /dev/null
@@ -0,0 +1,289 @@
+#include "fft1d.decl.h"
+#include <fftw3.h>
+#include <limits>
+#include "fileio.h"
+#include "CkLoopAPI.h"
+
+#define TWOPI 6.283185307179586
+
+/*readonly*/ CProxy_Main mainProxy;
+/*readonly*/ int numChunks;
+/*readonly*/ int numTasks;
+/*readonly*/ uint64_t N;
+static CmiNodeLock fft_plan_lock;
+#include "fftmacro.h"
+
+CProxy_FuncCkLoop ckLoopProxy;
+/** called by initnode once per node to support node level locking for
+    fftw plan create/destroy operations */
+#define CKLOOP_MODE CKLOOP_STATIC 
+
+extern "C" void doCalc(int first,int last, void *result, int paramNum, void * param)
+{
+  //result=first;
+  for(int i=first; i<=last; i++)
+    fft_execute(((fft_plan*)param)[i]);
+}
+
+void initplanlock ()
+
+{
+  fft_plan_lock=CmiCreateLock();
+}
+
+struct fftMsg : public CMessage_fftMsg {
+  int source;
+  fft_complex *data;
+};
+
+struct Main : public CBase_Main {
+  double start;
+  CProxy_fft fftProxy;
+
+  Main(CkArgMsg* m) {
+    numChunks = atoi(m->argv[1]); //#1D partitions
+    N = atol(m->argv[2]); //matrix size
+    if(m->argc>=4)
+      numTasks = atol(m->argv[3]); //the number of tasks that 1D partition is splitted into
+    else
+      numTasks = CmiMyNodeSize();  //default to 1/core
+    delete m;
+    
+    mainProxy = thisProxy;
+
+    /* how to nodify this computation? */
+    /* We make one block alloc per chare and divide the work evenly
+       across the number of threads.
+     * cache locality issues... 
+
+     *       The CkLoop scheme presents a problem in cache
+     *       ignorance.  We push tasks into the queue as the remote
+     *       message dependencies are met, however the execution of
+     *       dequeued tasks performance will have significant cache
+     *       limitations obfuscated to our scheduler.  Our helper
+     *       threads will block while fetching data into cache local
+     *       to the thread.  If we only have 1 thread per core, we
+     *       have no way to self overlap those operations.  This
+     *       implies that there are probably conditions under which
+     *       more than one ckloop thread per core will result in
+     *       better performance.  A natural sweet spot for these
+     *       should be explored in the SMT case where one thread per
+     *       SMT will allow for natural overlap of execution based on
+     *       cache availability, as controlled by the OS without
+     *       additional pthread context switching overhead.  A further
+     *       runtime based virtualized overthreading may provide
+     *       further benefits depending on thread overhead.
+     */
+    if (N % numChunks != 0)
+      CkAbort("numChunks not a factor of N\n");
+
+    // Construct an array of fft chares to do the calculation
+    fftProxy = CProxy_fft::ckNew(numChunks);
+
+    // Construct a ckloop to do the calculation
+    //ckLoopProxy = CkLoop_Init(CKLOOP_MODE, numTasks);
+    ckLoopProxy = CkLoop_Init();
+    
+    CkStartQD(CkIndex_Main::initDone((CkQdMsg *)0), &thishandle);
+  }
+
+  void initDone(CkQdMsg *msg){
+    delete msg;
+    startFFT();
+  }
+  
+  void startFFT() {
+    start = CkWallTimer();
+    // Broadcast the 'go' signal to the fft chare array
+    fftProxy.doFFT();
+  }
+
+  void doneFFT() {
+    double time = CkWallTimer() - start;
+    double gflops = 5 * (double)N*N * log2((double)N*N) / (time * 1000000000);
+    CkPrintf("chares: %d\ncores: %d\nTasks: %d\nsize: %ld\ntime: %f sec\nrate: %f GFlop/s\n",
+             numChunks, CkNumPes(), numTasks, N*N, time, gflops);
+
+    fftProxy.initValidation();
+  }
+
+  void printResidual(realType r) {
+    CkPrintf("residual = %g\n", r);
+    CkExit();
+  }
+
+};
+
+struct fft : public CBase_fft {
+  fft_SDAG_CODE
+
+  int iteration, count;
+  uint64_t n;
+  fft_plan *plan;
+  fft_plan p1;
+  fftMsg **msgs;
+  fft_complex *in, *out;
+  bool validating;
+  int nPerThread;
+  fft() {
+    __sdag_init();
+
+    validating = false;
+
+    n = N*N/numChunks;
+
+    in = (fft_complex*) fft_malloc(sizeof(fft_complex) * n);
+    out = (fft_complex*) fft_malloc(sizeof(fft_complex) * n);
+    nPerThread= n/numTasks;
+    
+    int length[] = {N};
+    
+    /** Basically, we want to parallelize the following fftw function call
+     * which is to do fft on each row of a 2D array with #rows=N/numChunks, #cols=N;
+     * 1. create a plan: singlePlan = fft_plan_many_dft(1, len, N/numChunks, out, len,
+     *                                                  1, N, out, len, 1,
+     *                                                  N, FFTW_FORWARD, FFTW_ESTIMATE)
+     * where len is defined as int len[]={N}
+     * 2. execute the plan: fft_execute(singlePlan).
+     * 
+     * It's not a loop, we transformed it into a loop with N/numTasks plans so that 
+     * each task execute one plan. Each plan has N/numChunks/numTasks rows for fftw
+     * processing.
+     */
+    
+    CmiLock(fft_plan_lock);
+    size_t offset=0;
+    plan= new fft_plan[numTasks];
+    for(int i=0; i < numTasks; i++,offset+=nPerThread)
+      {
+       /* ??? should the dist be nPerThread as the fft is performed as 1d of length nPerThread?? */
+       //plan[i] = fft_plan_many_dft(1, length, N/numChunks/numTasks, out+offset, length, 1, N/numTasks,
+    //                        out+offset, length, 1, N/numTasks, FFTW_FORWARD, FFTW_ESTIMATE);
+    
+    plan[i] = fft_plan_many_dft(1, length, N/numChunks/numTasks, out+offset, length, 1, N,
+                            out+offset, length, 1, N, FFTW_FORWARD, FFTW_ESTIMATE);                        
+      }
+    CmiUnlock(fft_plan_lock);
+    
+    srand48(thisIndex);
+    for(int i = 0; i < n; i++) {
+      in[i][0] = drand48();
+      in[i][1] = drand48();
+    }
+
+    msgs = new fftMsg*[numChunks];
+    for(int i = 0; i < numChunks; i++) {
+      msgs[i] = new (n/numChunks) fftMsg;
+      msgs[i]->source = thisIndex;
+    }
+
+    // Reduction to the mainchare to signal that initialization is complete
+    //contribute(CkCallback(CkIndex_Main::startFFT(), mainProxy));
+  }
+
+  void sendTranspose(fft_complex *src_buf) {
+    // All-to-all transpose by constructing and sending
+    // point-to-point messages to each chare in the array.
+    for(int i = thisIndex; i < thisIndex+numChunks; i++) {
+      //  Stagger communication order to avoid hotspots and the
+      //  associated contention.
+      int k = i % numChunks;
+      for(int j = 0, l = 0; j < N/numChunks; j++)
+        memcpy(msgs[k]->data[(l++)*N/numChunks], src_buf[k*N/numChunks+j*N], sizeof(fft_complex)*N/numChunks);
+
+      // Tag each message with the iteration in which it was
+      // generated, to prevent mis-matched messages from chares that
+      // got all of their input quickly and moved to the next step.
+      CkSetRefNum(msgs[k], iteration);
+      thisProxy[k].getTranspose(msgs[k]);
+      // Runtime system takes ownership of messages once they're sent
+      msgs[k] = NULL;
+    }
+  }
+
+  void applyTranspose(fftMsg *m) {
+    int k = m->source;
+    for(int j = 0, l = 0; j < N/numChunks; j++)
+      for(int i = 0; i < N/numChunks; i++) {
+        out[k*N/numChunks+(i*N+j)][0] = m->data[l][0];
+        out[k*N/numChunks+(i*N+j)][1] = m->data[l++][1];
+      }
+
+    // Save just-received messages to reuse for later sends, to
+    // avoid reallocation
+    delete msgs[k];
+    msgs[k] = m;
+    msgs[k]->source = thisIndex;
+  }
+
+  void twiddle(realType sign) {
+    realType a, c, s, re, im;
+
+    int k = thisIndex;
+    for(int i = 0; i < N/numChunks; i++)
+      for(int j = 0; j < N; j++) {
+        a = sign * (TWOPI*(i+k*N/numChunks)*j)/(N*N);
+        c = cos(a);
+        s = sin(a);
+
+        int idx = i*N+j;
+
+        re = c*out[idx][0] - s*out[idx][1];
+        im = s*out[idx][0] + c*out[idx][1];
+        out[idx][0] = re;
+        out[idx][1] = im;
+      }
+  }
+  void fftHelperLaunch()
+  {
+    //kick off thread computation
+    //FuncCkLoop *nth = ckLoopProxy[CkMyNode()].ckLocalBranch();
+    //nth->parallelizeFunc(doCalc, numTasks, numTasks, thisIndex, numTasks, 1, 1, plan, 0, NULL);
+    double ffttime = CkWallTimer();
+    CkLoop_Parallelize(doCalc, 1, plan, numTasks, 0, numTasks-1);    
+    CkPrintf("FFT time: %.3f (ms)\n", (CkWallTimer()-ffttime)*1e3);
+  }
+
+  void initValidation() {
+    memcpy(in, out, sizeof(fft_complex) * n);
+
+    validating = true;
+    int length[] = {N};
+    CmiLock(fft_plan_lock);
+    size_t offset=0;
+    plan= new fft_plan[numTasks];
+    for(int i=0; i < numTasks; i++,offset+=nPerThread)
+      {
+       //      fft_destroy_plan(plan[i]);
+       //plan[i] = fft_plan_many_dft(1, length, N/numChunks/numTasks, out+offset, length, 1, N/numTasks,
+    //                        out+offset, length, 1, N/numTasks, FFTW_BACKWARD, FFTW_ESTIMATE);
+    plan[i] = fft_plan_many_dft(1, length, N/numChunks/numTasks, out+offset, length, 1, N,
+                            out+offset, length, 1, N, FFTW_BACKWARD, FFTW_ESTIMATE);
+      }
+    CmiUnlock(fft_plan_lock);
+    contribute(CkCallback(CkIndex_Main::startFFT(), mainProxy));
+  }
+
+  void calcResidual() {
+    double infNorm = 0.0;
+
+    srand48(thisIndex);
+    for(int i = 0; i < n; i++) {
+      out[i][0] = out[i][0]/(N*N) - drand48();
+      out[i][1] = out[i][1]/(N*N) - drand48();
+
+      double mag = sqrt(pow(out[i][0], 2) + pow(out[i][1], 2));
+      if(mag > infNorm) infNorm = mag;
+    }
+
+    double r = infNorm / (std::numeric_limits<double>::epsilon() * log((double)N * N));
+
+    CkCallback cb(CkReductionTarget(Main, printResidual), mainProxy);
+    contribute(sizeof(double), &r, CkReduction::max_double, cb);
+  }
+
+  fft(CkMigrateMessage* m) {}
+  ~fft() {}
+};
+
+#include "fft1d.def.h"
diff --git a/examples/charm++/ckloop/fft-trans/fft1d.ci b/examples/charm++/ckloop/fft-trans/fft1d.ci
new file mode 100644 (file)
index 0000000..60ba606
--- /dev/null
@@ -0,0 +1,56 @@
+mainmodule fft1d {
+  include "fftmacro.h";
+  readonly CProxy_Main mainProxy;
+  readonly CProxy_FuncCkLoop ckLoopProxy;
+  readonly int numChunks;
+  readonly int numTasks;
+  readonly uint64_t N;
+  initnode void initplanlock();
+  message fftMsg {
+    fft_complex data[];
+  };
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry void initDone(CkQdMsg *m);
+    entry void startFFT();
+    entry void doneFFT();
+
+    entry [reductiontarget] void printResidual(realType residual);
+  };
+
+  array [1D] fft {
+    entry fft();
+    entry void getTranspose(fftMsg *m);
+    entry void initValidation();
+    entry void doFFT() {
+      for(iteration = 0; iteration < 3; ++iteration) {
+        atomic "transpose" {
+          if(thisIndex == 0)
+            CkPrintf("TRANSPOSING\n");
+          sendTranspose(iteration == 0 ? in : out);
+        }
+
+        for(count = 0; count < numChunks; ++count)
+          when getTranspose[iteration] (fftMsg *m) atomic {
+            applyTranspose(m);
+          }
+
+        if (iteration < 2) atomic "compute" {
+         fftHelperLaunch();
+          if(iteration == 0)
+              twiddle(validating ? 1 : -1);
+        }
+      }        
+      atomic {
+        if(!validating)
+          contribute(CkCallback(CkIndex_Main::doneFFT(), mainProxy));
+        else {
+          char filename[80];
+          sprintf(filename, "%d-%ld.dump%d", numChunks, N, thisIndex);
+          writeCommFile(n, in, filename);
+          calcResidual();
+        }
+      }
+    };
+  };
+};
diff --git a/examples/charm++/ckloop/fft-trans/fft_bench.cpp b/examples/charm++/ckloop/fft-trans/fft_bench.cpp
new file mode 100644 (file)
index 0000000..240625b
--- /dev/null
@@ -0,0 +1,67 @@
+//#include <stdio.h>
+//#include <iostream>
+
+#include <mpi.h>
+#include <fftw3.h>
+#include <cstdlib>
+#include <limits>
+#include <cmath>
+
+using namespace std;
+
+int main(int argc, char *argv[]) 
+{
+  MPI_Init(&argc, &argv);
+
+  long N = atol(argv[1]);
+  long n = N * N;
+
+  fftw_complex *data;
+  fftw_plan p, q;
+
+  data = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+
+  int length[] = {N};
+  p = fftw_plan_many_dft(1, length, N, data, length, 1, N,
+      data, length, 1, N, FFTW_FORWARD, FFTW_ESTIMATE); 
+  q = fftw_plan_many_dft(1, length, N, data, length, 1, N,
+      data, length, 1, N, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+  srand48(0);
+  for(int i = 0; i<n; i++) {
+    data[i][0] = drand48();
+    data[i][1] = drand48();
+  }
+
+  double t1, t2; 
+  t1 = MPI_Wtime(); 
+
+  fftw_execute(p);
+
+  t2 = MPI_Wtime() - t1; 
+  printf( "On %ld elements, Elapsed time is %f with %f Gflop/s\n", n, t2, 5*(double)n*log2((double)n)/(t2*1000000000)); 
+
+  fftw_execute(q);
+
+  double infNorm = 0.0;
+
+  srand48(0);
+  for(int i=0; i<n; i++) {
+    data[i][0] = data[i][0]/N - drand48();
+    data[i][1] = data[i][1]/N - drand48();
+
+    double mag = sqrt(pow(data[i][0],2) + pow(data[i][1], 2));
+    if(mag > infNorm) infNorm = mag;
+  } 
+
+  double r = infNorm / (std::numeric_limits<double>::epsilon() * log((double)n));
+
+  printf("residual = %f\n", r);
+
+  fftw_destroy_plan(p);
+  fftw_destroy_plan(q);
+  fftw_free(data);
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/examples/charm++/ckloop/fft-trans/fft_ref.cpp b/examples/charm++/ckloop/fft-trans/fft_ref.cpp
new file mode 100644 (file)
index 0000000..f3603cc
--- /dev/null
@@ -0,0 +1,76 @@
+#include <mpi.h> 
+#include <cstdlib>
+#include "fftw3-mpi.h"
+#include <cmath>
+#include <limits>
+#include "fileio.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+  int rank, size; 
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
+
+  fftw_plan plan;
+  fftw_complex *data;
+
+  fftw_mpi_init();
+
+  if(rank == 0) {
+    if(argc != 2) {
+      printf("Usage: ./binary <N>\n");
+      MPI_Abort(MPI_COMM_WORLD,-1);
+    }
+  }
+
+  int N = atoi(argv[1]);
+
+  ptrdiff_t local_ni=N*N/size, local_i_start = N*N/size*rank;
+  ptrdiff_t local_no=local_ni, local_o_start = local_i_start;
+
+  int b_or_f = FFTW_BACKWARD;
+
+  ptrdiff_t alloc_local =  fftw_mpi_local_size_1d(N*N, MPI_COMM_WORLD,
+      b_or_f, FFTW_ESTIMATE, &local_ni, &local_i_start,
+      &local_no, &local_o_start);
+
+  data = fftw_alloc_complex(alloc_local);
+
+  plan = fftw_mpi_plan_dft_1d(N*N, data, data, MPI_COMM_WORLD, b_or_f, FFTW_ESTIMATE);
+
+  char filename[80];
+  sprintf(filename, "%d-%d.dump%d", size, N, rank);
+  readCommFile(data, filename);
+
+  fftw_execute(plan);
+
+  double infNorm = 0.0;
+  srand48(rank);
+  for(int i = 0; i < N*N/size; i++) {
+    data[i][0] = data[i][0]/(N*N) - drand48();
+    data[i][1] = data[i][1]/(N*N) - drand48();
+
+    double mag = sqrt(pow(data[i][0], 2) + pow(data[i][1], 2));
+    if(mag > infNorm) infNorm = mag;
+  }
+
+  double my_r = infNorm / (std::numeric_limits<double>::epsilon() * log((double)N * N));
+  double r;
+
+  MPI_Reduce(&my_r, &r, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+
+  if(rank == 0) {
+    if(r < 16)
+      printf("r = %g, PASS!\n",r);
+    else
+      printf("r = %g, FAIL\n",r);
+  }
+
+  fftw_destroy_plan(plan);
+
+  fftw_mpi_cleanup();
+  MPI_Finalize();      
+  return 0; 
+} 
diff --git a/examples/charm++/ckloop/fft-trans/fftmacro.h b/examples/charm++/ckloop/fft-trans/fftmacro.h
new file mode 100644 (file)
index 0000000..b485067
--- /dev/null
@@ -0,0 +1,20 @@
+//todo fix this is a more clever macro
+#ifdef SINGLE_PRECISION
+#define fft_complex fftwf_complex
+#define fft_malloc fftwf_malloc
+#define fft_free fftwf_free
+#define fft_plan_many_dft fftwf_plan_many_dft
+#define fft_destroy_plan fftwf_destroy_plan
+#define fft_execute fftwf_execute
+#define fft_plan fftwf_plan
+#define realType float
+#else
+#define fft_complex fftw_complex
+#define fft_malloc fftw_malloc
+#define fft_free fftw_free
+#define fft_plan_many_dft fftw_plan_many_dft
+#define fft_destroy_plan fftw_destroy_plan
+#define fft_execute fftw_execute
+#define fft_plan fftw_plan
+#define realType double
+#endif
diff --git a/examples/charm++/ckloop/fft-trans/fileio.h b/examples/charm++/ckloop/fft-trans/fileio.h
new file mode 100644 (file)
index 0000000..35b4c38
--- /dev/null
@@ -0,0 +1,30 @@
+#include "fftmacro.h"
+void readCommFile(fft_complex *data, char *filename) {
+  FILE *pFile;
+  if(!(pFile = fopen (filename,"r"))) {
+    printf("File open failed\n");
+    return;
+  }
+
+  int l = 0;
+#ifdef SINGLE_PRECISION
+  while(fscanf (pFile, "%f %f", &data[l][0], &data[l][1]) != EOF) {l++;}
+#else
+  while(fscanf (pFile, "%lf %lf", &data[l][0], &data[l][1]) != EOF) {l++;}
+#endif
+
+  fclose(pFile);
+}
+
+void writeCommFile(int n, fft_complex *data, char *filename) {
+  FILE *pFile;
+  if(!(pFile = fopen (filename, "w"))) {
+    printf("File open for write failed\n");
+    return;
+  }
+
+  for(int l = 0; l < n; l++)
+
+    fprintf(pFile, "%.24f %.24f\n", data[l][0], data[l][1]);
+  fclose(pFile);
+}
diff --git a/examples/charm++/ckloop/fft-trans/test b/examples/charm++/ckloop/fft-trans/test
new file mode 100755 (executable)
index 0000000..04f5766
--- /dev/null
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+./charmrun +p4 ++local fft1d $1 $2
+mpirun -np $1 ./fft_ref $2
diff --git a/examples/charm++/ckloop/fft-trans/test.sh b/examples/charm++/ckloop/fft-trans/test.sh
new file mode 100755 (executable)
index 0000000..04f5766
--- /dev/null
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+./charmrun +p4 ++local fft1d $1 $2
+mpirun -np $1 ./fft_ref $2
diff --git a/examples/charm++/ckloop/simpleLoopBench/Makefile b/examples/charm++/ckloop/simpleLoopBench/Makefile
new file mode 100644 (file)
index 0000000..1475626
--- /dev/null
@@ -0,0 +1,43 @@
+#NOTE: to compile the example, the CkLoop.decl/def.h should exist
+USEROPTS=-O3 -g -lpthread -fopenmp
+CHARMDIR=../../../..
+CHARMINC=$(CHARMDIR)/include
+OPTS=-I$(CHARMINC) $(USEROPTS)
+CHARMC=$(CHARMDIR)/bin/charmc $(OPTS)
+CHARMLIB=$(CHARMDIR)/lib
+
+
+all: hello
+
+hello: hello.o
+       $(CHARMC) -language charm++ -o hello hello.o  -module CkLoop
+       $(CHARMC) -language charm++ -o hello.prj hello.o  -tracemode projections -module CkLoop
+
+hello.decl.h: hello.ci
+       $(CHARMC)  hello.ci
+
+hello.o: hello.C hello.decl.h 
+       $(CHARMC) -c hello.C
+
+clean:
+       rm -f *.decl.h *.def.h conv-host *.o hello hello.prj charmrun *.log *.sum *.sts
+
+test: all
+       ./charmrun ./hello +p4 10
+
+bgtest: all
+       ./charmrun ./hello +p4 10 +x2 +y2 +z2 +cth1 +wth1
+
+module: $(CHARMLIB)/libmoduleCkLoop.a
+
+$(CHARMLIB)/libmoduleCkLoop.a: CkLoop.o
+       $(CHARMC)  -o $(CHARMLIB)/libmoduleCkLoop.a CkLoop.o
+
+
+CkLoop.decl.h: CkLoop.ci
+       $(CHARMC)  CkLoop.ci
+
+
+CkLoop.o: CkLoop.C CkLoop.decl.h
+       $(CHARMC) -c CkLoop.C
+
diff --git a/examples/charm++/ckloop/simpleLoopBench/hello.C b/examples/charm++/ckloop/simpleLoopBench/hello.C
new file mode 100644 (file)
index 0000000..6862fa3
--- /dev/null
@@ -0,0 +1,261 @@
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "hello.h"
+
+#include "hello.decl.h"
+
+#include <omp.h>
+
+#define TEST_REPEAT_TIMES 100
+
+CProxy_Main mainProxy;
+CProxy_TestInstance allTestsProxy;
+CProxy_FuncCkLoop ckLoopProxy;
+int totalElems; //the number of test instances
+int loopTimes;
+int numChunks;
+int runningMode;
+
+int threadNum; //number of threads to be used in non-SMP
+
+int cmpDFunc(const void *a, const void *b) {
+    double n1 = *(double *)a;
+    double n2 = *(double *)b;
+    if (n1<n2) return -1;
+    if (n1>n2) return 1;
+    return 0;
+}
+
+void work(int start, int end, void *result) {
+    int tmp=0;
+    for (int i=start; i<=end; i++) {
+        tmp+=(int)(sqrt(1+cos(i*1.57)));
+    }
+    *(int *)result = tmp;
+    
+   //CkPrintf("From rank[%d]: start=%d, end=%d, result=%d\n", CkMyRank(), start, end, tmp);
+   //fflush(stdout);
+}
+
+int openMPWork(int start, int end) {
+    int result = 0;
+    
+    #pragma omp parallel for reduction (+:result)
+    for(int i=start; i<=end; i++) {
+        result += (int)(sqrt(1+cos(i*1.57)));
+    }
+    
+    return result;
+}
+
+extern "C" void doCalc(int first, int last, void *result, int paramNum, void * param) {    
+    //double tstart = CkWallTimer();
+    
+       work(first, last, result);
+    
+       //tstart = CkWallTimer() - tstart;
+    //printf("start=%d, end=%d, time: %f,result: %d on proc %d\n",first, last, tstart,result,CkMyPe());
+}
+
+/*mainchare*/
+Main::Main(CkArgMsg* m) {
+    
+       //default values        
+    totalElems = 1;
+       numChunks = CkMyNodeSize();
+       loopTimes = 1000;
+       runningMode = CKLOOP_USECHARM; 
+       
+    mainStep = 0;
+       numElemFinished = 0;
+       
+    curTestMode = 0;
+    
+    //process command line
+    if (m->argc >1 ){
+        processCommandLine(m->argc,m->argv);
+       }
+    else{              
+               CkPrintf("Usage: -t(#iterations) -c(#chunks) -a(#test instances) -m(running mode, 1 for use Charm threads; 2 for use pthreads )  -p(#threads)\n");
+       }
+    delete m;
+       
+    omp_set_num_threads(numChunks);    
+    
+       mainTimes = new double[TEST_REPEAT_TIMES];
+       memset(mainTimes, 0, sizeof(double)*TEST_REPEAT_TIMES);
+       
+       CkPrintf("Using CkLoop Lib with mode: %d, nodesize=%d\n", runningMode, CkMyNodeSize());
+       CkPrintf("Testcase info: %d test instances where the loop iterates %d times, each work is partitioned into %d tasks\n", totalElems, loopTimes, numChunks);
+       
+       ckLoopProxy = CkLoop_Init(runningMode, threadNum);
+       //ckLoopProxy = CkLoop_Init();
+    mainProxy = thishandle;
+    
+       //create test instances
+    CProxy_cyclicMap myMap = CProxy_cyclicMap::ckNew();
+    CkArrayOptions opts(totalElems);
+    opts.setMap(myMap);
+    allTestsProxy = CProxy_TestInstance::ckNew(opts);
+
+    //serial version
+       int result;
+       double starttime, endtime;
+       for(int i=0; i<3; i++){
+               starttime = CkWallTimer();
+               work(0, loopTimes, &result);            
+               endtime = CkWallTimer();
+               CkPrintf("Calibration %d: the loop takes %.3f us with result %d\n", i+1,  (endtime-starttime)*1e6, result);
+       }
+       int results[5];
+       starttime = CkWallTimer();
+       for(int i=0; i<5; i++) work(0, loopTimes, results+i);
+       endtime = CkWallTimer();
+       double avgtime = (endtime-starttime)*1e6/5; //in the unit of us
+       CkPrintf("Calibration: avg time %.3f us of 5 consecutive runs, so a 100us-loop will iterate %d times\n", avgtime, (int)(loopTimes*100.0/avgtime));
+               
+    CmiSetCPUAffinity(0);
+    CkStartQD(CkIndex_Main::doTests((CkQdMsg *)0), &thishandle);
+};
+
+void Main::done(void) {
+    numElemFinished++;
+    if (numElemFinished < totalElems) {
+        return;
+    } else {
+               mainTimes[mainStep] = (CkWallTimer() - timestamp)*1e6;
+        mainStep++;
+        numElemFinished=0;
+        if (mainStep < TEST_REPEAT_TIMES) {
+                       doTests(NULL);
+            return;
+        }
+    }  
+    
+       //do some final output
+       allTestsProxy[0].reportSts();
+};
+
+void Main::exitTest(){
+       //do some final output
+       qsort(mainTimes, TEST_REPEAT_TIMES, sizeof(double), cmpDFunc);
+       double sum = 0.0;
+       for(int i=0; i<TEST_REPEAT_TIMES-3; i++) sum += mainTimes[i];
+       int maxi = TEST_REPEAT_TIMES;
+       CkPrintf("Global timestep info: avg time: %.3f [%.3f, %.3f, %.3f] (us)\n", sum/(maxi-3), mainTimes[0], mainTimes[maxi/2], mainTimes[maxi-1]);
+       
+    if(curTestMode == 0){
+           CkPrintf("Charm++ CkLoop Test done\n\n");
+        curTestMode++;
+        mainStep = 0;
+        numElemFinished = 0;
+        doTests(NULL);
+    }else if(curTestMode == 1){        
+        CkPrintf("OpenMP Test done\n");
+        CkExit();
+    }
+}
+
+void Main::doTests(CkQdMsg *msg) {
+    delete msg;
+
+    //CkPrintf("===========Starting mainstep %d===========\n", mainStep);    
+
+    if(mainStep == 0){
+        if(curTestMode == 0){
+            CkPrintf("===Start CkLoop Test===\n");
+        }else if(curTestMode == 1){
+            int numthds = 0;
+            int openmpid;
+            #pragma omp parallel private(openmpid)
+            {
+                openmpid = omp_get_thread_num();
+                if(openmpid == 0) numthds = omp_get_num_threads();
+            }
+            CkPrintf("===Start OpenMP Test with %d threads===\n", numthds);
+        }
+    }
+    
+       timestamp = CkWallTimer(); //record the start time of the whole test
+    for (int i=0; i<totalElems; i++) {
+        allTestsProxy[i].doTest(mainStep, curTestMode);
+        //allTestsProxy[8].doTest(mainStep, curTestMode);
+    }
+};
+
+void Main::processCommandLine(int argc,char ** argv) {
+    for (int i=0; i<argc; i++) {
+        if (argv[i][0]=='-') {
+            switch (argv[i][1]) {
+            case 't':
+                loopTimes = atoi(argv[++i]);
+                break;
+            case 'c':
+                numChunks = atoi(argv[++i]);
+                break;
+            case 'm':
+                runningMode = atoi(argv[++i]);
+                break;
+            case 'a':
+                totalElems = atoi(argv[++i]);
+                break;
+            case 'p':
+                threadNum = atoi(argv[++i]);
+                break;
+            }
+        }
+    }
+}
+
+
+TestInstance::TestInstance() {
+    CkPrintf("test case %d is created on proc %d node %d\n", thisIndex, CkMyPe(),CkMyNode());
+    
+       hasTest = 0; 
+       allTimes = new double[TEST_REPEAT_TIMES];
+       allResults = new int[TEST_REPEAT_TIMES];
+       memset(allTimes, 0, sizeof(double)*TEST_REPEAT_TIMES);
+       memset(allResults, 0, sizeof(int)*TEST_REPEAT_TIMES);
+}
+
+void TestInstance::doTest(int curstep, int curTestMode) {
+    //printf("On proc %d node %d, begin parallel execution for test case %d %dth iteration\n", CkMyPe(), CkMyNode(), thisIndex,flag);
+       hasTest = 1;
+       int result;
+       
+    double timerec = CkWallTimer();
+    
+    if(curTestMode == 0){
+           CkLoop_Parallelize(doCalc, 0, NULL, numChunks, 0, loopTimes-1, 1, &result, CKLOOP_INT_SUM);
+    }else if(curTestMode == 1){
+        result = openMPWork(0, loopTimes-1);
+    }
+    
+    allTimes[curstep]=(CkWallTimer()-timerec)*1e6;
+       allResults[curstep] = result;
+       
+    mainProxy.done();
+}
+
+void TestInstance::reportSts(){
+       if(hasTest){
+               //do sts output
+               qsort(allTimes, TEST_REPEAT_TIMES, sizeof(double), cmpDFunc);
+               double sum = 0.0;
+               for(int i=0; i<TEST_REPEAT_TIMES-3; i++) sum += allTimes[i];
+               
+               double avgResult = 0.0;
+               for(int i=0; i<TEST_REPEAT_TIMES; i++) avgResult += allResults[i];
+               avgResult /= TEST_REPEAT_TIMES;
+               
+               int maxi = TEST_REPEAT_TIMES;
+               CkPrintf("Test instance[%d]: result:%.3f, avg time: %.3f [%.3f, %.3f, %.3f] (us)\n",thisIndex, avgResult, sum/(maxi-3), allTimes[0], allTimes[maxi/2], allTimes[maxi-1]);           
+    }
+       
+       if(thisIndex == totalElems-1) mainProxy.exitTest();
+       else thisProxy[thisIndex+1].reportSts();
+}
+
+#include "hello.def.h"
+
diff --git a/examples/charm++/ckloop/simpleLoopBench/hello.ci b/examples/charm++/ckloop/simpleLoopBench/hello.ci
new file mode 100644 (file)
index 0000000..80f9ede
--- /dev/null
@@ -0,0 +1,26 @@
+mainmodule hello {
+  readonly CProxy_Main mainProxy;
+  readonly CProxy_TestInstance allTestsProxy;
+  readonly CProxy_FuncCkLoop ckLoopProxy;
+  readonly int totalElems;
+  readonly int loopTimes;
+  readonly int numChunks;
+  readonly int runningMode;
+  
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+    entry void done(void);
+    entry void doTests(CkQdMsg *msg);
+       entry void exitTest();
+  };
+
+  array [1D] TestInstance{
+    entry TestInstance();
+    entry void doTest(int curstep, int curTestMode);
+       entry void reportSts();
+   };
+
+  group cyclicMap : CkArrayMap{
+    entry cyclicMap();
+  };  
+};
diff --git a/examples/charm++/ckloop/simpleLoopBench/hello.h b/examples/charm++/ckloop/simpleLoopBench/hello.h
new file mode 100644 (file)
index 0000000..a4e9897
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _HELLO_H
+#define _HELLO_H
+
+#include "charm++.h"
+#include "CkLoopAPI.h"
+#include "hello.decl.h"
+#include <assert.h>
+
+class Main : public Chare {
+private:
+       int numElemFinished; //record the number of test instances finished in a timestep
+       double timestamp;
+       int mainStep; //the global counter of timesteps
+       double *mainTimes; //record each timestep from test initiation to test finish (i.e. from the point of main)
+    
+    int curTestMode; //0: ckLoop; 1: OpenMP
+
+public:
+    Main(CkArgMsg* m) ;
+    void done(void);
+       void exitTest();
+    void doTests(CkQdMsg *msg);
+    void processCommandLine(int argc,char ** argv);
+};
+
+class TestInstance : public CBase_TestInstance {
+       int hasTest; //used for reporting statistics
+       
+    double *allTimes; //record time taken for each timestep
+       int *allResults; //record the result of each timestep
+       
+public:
+    TestInstance();
+    ~TestInstance() {
+               delete [] allTimes;
+               delete [] allResults;
+       }
+    TestInstance(CkMigrateMessage *m) {}
+    void doTest(int curstep, int curTestMode);
+       void reportSts();
+};
+
+class cyclicMap : public CkArrayMap {
+public:
+    int procNum(int, const CkArrayIndex &idx) {
+        int index = *(int *)idx.data();
+        int nid = (index/CkMyNodeSize())%CkNumNodes();
+        int rid = index%CkMyNodeSize();
+        return CkNodeFirst(nid)+rid;
+    }
+};
+
+#endif
diff --git a/src/libs/ck-libs/ckloop/CkLoop.C b/src/libs/ck-libs/ckloop/CkLoop.C
new file mode 100644 (file)
index 0000000..401101d
--- /dev/null
@@ -0,0 +1,490 @@
+#include "CkLoop.h"
+#include <pthread.h>
+
+#if !USE_CONVERSE_NOTIFICATION
+#include "qd.h"
+#endif
+
+/*====Beginning of pthread-related variables and impelementation====*/
+//__thread is not portable, but it works almost everywhere if pthread works
+//After C++11, this should be thread_local
+static __thread pthread_cond_t thdCondition; //the signal var of each pthread to be notified
+static __thread pthread_mutex_t thdLock; //the lock associated with the condition variables
+
+static FuncCkLoop *mainHelper = NULL;
+static int mainHelperPhyRank = 0;
+static int numPhysicalPEs = 0;
+static CurLoopInfo *pthdLoop = NULL; //the pthread-version is always synchronized
+static pthread_mutex_t **allLocks = NULL;
+static pthread_cond_t **allConds = NULL;
+static pthread_t *ndhThreads = NULL;
+static volatile int gCrtCnt = 0;
+static volatile int exitFlag = 0;
+
+#if CMK_OS_IS_LINUX
+#include <sys/syscall.h>
+#endif
+
+static int HelperOnCore() {
+#if CMK_OS_IS_LINUX
+    char fname[64];
+    sprintf(fname, "/proc/%d/task/%ld/stat", getpid(), syscall(SYS_gettid));
+    FILE *ifp = fopen(fname, "r");
+    if (ifp == NULL) return -1;
+    fseek(ifp, 0, SEEK_SET);
+    char str[128];
+    for (int i=0; i<39; i++) fscanf(ifp, "%s", str);
+    fclose(ifp);
+    return atoi(str);
+#else
+    return -1;
+#endif
+}
+
+static void *ndhThreadWork(void *id) {
+    size_t myId = (size_t) id;
+
+    //further improvement of this affinity setting!!
+    int myPhyRank = (myId+mainHelperPhyRank)%numPhysicalPEs;
+    //printf("thread[%d]: affixed to rank %d\n", myId, myPhyRank);
+    myPhyRank = myId;
+    CmiSetCPUAffinity(myPhyRank);
+
+    pthread_mutex_init(&thdLock, NULL);
+    pthread_cond_init(&thdCondition, NULL);
+
+    allLocks[myId-1] = &thdLock;
+    allConds[myId-1] = &thdCondition;
+
+    __sync_add_and_fetch(&gCrtCnt, 1);
+
+    while (1) {
+        //printf("thread[%ld]: on core %d with main %d\n", myId, HelperOnCore(), mainHelperPhyRank);
+        if (exitFlag) break;
+        pthread_mutex_lock(&thdLock);
+        pthread_cond_wait(&thdCondition, &thdLock);
+        pthread_mutex_unlock(&thdLock);
+        /* kids ID range: [1 ~ numHelpers-1] */
+        if (mainHelper->needTreeBcast()) {
+            //notify my children
+            int myKid = myId*TREE_BCAST_BRANCH+1;
+            for (int i=0; i<TREE_BCAST_BRANCH; i++, myKid++) {
+                if (myKid >= mainHelper->getNumHelpers()) break;
+                //all locks and conditions exclude the main thread, so index needs to be subtracted by one
+                pthread_mutex_lock(allLocks[myKid-1]);
+                pthread_cond_signal(allConds[myKid-1]);
+                pthread_mutex_unlock(allLocks[myKid-1]);
+            }
+        }
+        pthdLoop->stealWork();
+    }
+}
+
+void FuncCkLoop::createPThreads() {
+    int numThreads = numHelpers - 1;
+    allLocks = (pthread_mutex_t **)malloc(sizeof(void *)*numThreads);
+    allConds = (pthread_cond_t **)malloc(sizeof(void *)*numThreads);
+    memset(allLocks, 0, sizeof(void *)*numThreads);
+    memset(allConds, 0, sizeof(void *)*numThreads);
+
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    ndhThreads = new pthread_t[numThreads];
+    mainHelperPhyRank = CmiOnCore();
+    numPhysicalPEs = CmiNumCores();
+    if (mainHelperPhyRank == -1) mainHelperPhyRank = 0;
+    for (int i=1; i<=numThreads; i++) {
+        pthread_create(ndhThreads+i, &attr, ndhThreadWork, (void *)i);
+    }
+    while (gCrtCnt != numThreads); //wait for all threads to finish creation
+}
+
+void FuncCkLoop::exit() {
+    if (mode == CKLOOP_PTHREAD) {
+        exitFlag = 1;
+        for (int i=0; i<numHelpers-1; i++)
+            pthread_join(ndhThreads[i], NULL);
+        delete [] ndhThreads;
+        free(allLocks);
+        free(allConds);
+        delete pthdLoop;
+    }
+}
+
+/*====End of pthread-related variables and impelementation====*/
+
+
+/* Note: Those event ids should be unique globally!! */
+#define CKLOOP_TOTAL_WORK_EVENTID  139
+#define CKLOOP_FINISH_SIGNAL_EVENTID 143
+
+static FuncCkLoop *globalCkLoop = NULL;
+
+FuncCkLoop::FuncCkLoop(int mode_, int numThreads_) {
+    traceRegisterUserEvent("ckloop total work",CKLOOP_TOTAL_WORK_EVENTID);
+    traceRegisterUserEvent("ckloop finish signal",CKLOOP_FINISH_SIGNAL_EVENTID);
+
+    mode = mode_;
+
+    CmiAssert(globalCkLoop==NULL);
+    globalCkLoop = this;
+
+    if (mode == CKLOOP_USECHARM) {
+        //CkPrintf("FuncCkLoop created on node %d\n", CkMyNode());
+        numHelpers = CkMyNodeSize();
+        helperPtr = new FuncSingleHelper *[numHelpers];
+        useTreeBcast = (numHelpers >= USE_TREE_BROADCAST_THRESHOLD);
+
+        int pestart = CkNodeFirst(CkMyNode());
+
+        for (int i=0; i<numHelpers; i++) {
+            CkChareID helper;
+            CProxy_FuncSingleHelper::ckNew(numHelpers, &helper, pestart+i);
+        }
+    } else if (mode == CKLOOP_PTHREAD) {
+        helperPtr = NULL;
+
+        numHelpers = numThreads_;
+        useTreeBcast = (numHelpers >= USE_TREE_BROADCAST_THRESHOLD);
+        pthdLoop = new CurLoopInfo(FuncCkLoop::MAX_CHUNKS);
+        mainHelper = this;
+        createPThreads();
+    }
+}
+
+int FuncCkLoop::MAX_CHUNKS = 64;
+
+#if CMK_TRACE_ENABLED
+#define TRACE_START(id) _start = CmiWallTimer()
+#define TRACE_BRACKET(id) traceUserBracketEvent(id,_start,CmiWallTimer())
+#else
+#define TRACE_START(id)
+#define TRACE_BRACKET(id)
+#endif
+
+#define ALLOW_MULTIPLE_UNSYNC 1
+void FuncCkLoop::parallelizeFunc(HelperFn func, int paramNum, void * param,
+                                     int numChunks, int lowerRange,
+                                     int upperRange, int sync,
+                                     void *redResult, REDUCTION_TYPE type) {
+
+    double _start; //may be used for tracing
+
+    if (numChunks > MAX_CHUNKS) {
+        CkPrintf("CkLoop[%d]: WARNING! chunk is set to MAX_CHUNKS=%d\n", CmiMyPe(), MAX_CHUNKS);
+        numChunks = MAX_CHUNKS;
+    }
+
+    /* "stride" determines the number of loop iterations to be done in each chunk
+     * for chunk indexed at 0 to remainder-1, stride is "unit+1";
+     * for chunk indexed at remainder to numChunks-1, stride is "unit"
+    int stride;
+     */
+    CurLoopInfo *curLoop = NULL;
+
+    //for using nodequeue
+    TRACE_START(CKLOOP_TOTAL_WORK_EVENTID);
+    if (mode == CKLOOP_USECHARM) {
+        FuncSingleHelper *thisHelper = helperPtr[CkMyRank()];
+#if USE_CONVERSE_NOTIFICATION
+#if ALLOW_MULTIPLE_UNSYNC
+        ConverseNotifyMsg *notifyMsg = thisHelper->getNotifyMsg();
+#else
+        ConverseNotifyMsg *notifyMsg = thisHelper->notifyMsg;
+#endif
+        curLoop = (CurLoopInfo *)(notifyMsg->ptr);
+        curLoop->set(numChunks, func, lowerRange, upperRange, paramNum, param);
+        if (useTreeBcast) {
+            int loopTimes = TREE_BCAST_BRANCH>(CmiMyNodeSize()-1)?CmiMyNodeSize()-1:TREE_BCAST_BRANCH;
+            //just implicit binary tree
+            int pe = CmiMyRank()+1;
+            for (int i=0; i<loopTimes; i++, pe++) {
+                if (pe >= CmiMyNodeSize()) pe -= CmiMyNodeSize();
+                CmiPushPE(pe, (void *)(notifyMsg));
+            }
+        } else {
+            for (int i=CmiMyRank()+1; i<numHelpers; i++) {
+                CmiPushPE(i, (void *)(notifyMsg));
+            }
+            for (int i=0; i<CmiMyRank(); i++) {
+                CmiPushPE(i, (void *)(notifyMsg));
+            }
+        }
+#else
+#if ALLOW_MULTIPLE_UNSYNC
+        curLoop = thisHelper->getNewTask();
+#else
+        curLoop = thisHelper->taskBuffer[0];
+#endif
+        curLoop->set(numChunks, func, lowerRange, upperRange, paramNum, param);
+        CpvAccess(_qd)->create(numHelpers-1);
+        if (useTreeBcast) {
+            int loopTimes = TREE_BCAST_BRANCH>(CmiMyNodeSize()-1)?CmiMyNodeSize()-1:TREE_BCAST_BRANCH;
+            //just implicit binary tree
+            int pe = CmiMyRank()+1;
+            for (int i=0; i<loopTimes; i++, pe++) {
+                if (pe >= CmiMyNodeSize()) pe -= CmiMyNodeSize();
+                CharmNotifyMsg *one = thisHelper->getNotifyMsg();
+                one->ptr = (void *)curLoop;
+                envelope *env = UsrToEnv(one);
+                env->setObjPtr(thisHelper->ckGetChareID().objPtr);
+                CmiPushPE(pe, (void *)(env));
+            }
+        } else {
+            for (int i=CmiMyRank()+1; i<numHelpers; i++) {
+                CharmNotifyMsg *one = thisHelper->getNotifyMsg();
+                one->ptr = (void *)curLoop;
+                envelope *env = UsrToEnv(one);
+                env->setObjPtr(thisHelper->ckGetChareID().objPtr);
+                //printf("[%d] sending a msg %p (env=%p) to [%d]\n", CmiMyRank(), one, env, i);
+                CmiPushPE(i, (void *)(env));
+            }
+            for (int i=0; i<CmiMyRank(); i++) {
+                CharmNotifyMsg *one = thisHelper->getNotifyMsg();
+                one->ptr = (void *)curLoop;
+                envelope *env = UsrToEnv(one);
+                env->setObjPtr(thisHelper->ckGetChareID().objPtr);
+                //printf("[%d] sending a msg %p (env=%p) to [%d]\n", CmiMyRank(), one, env, i);
+                CmiPushPE(i, (void *)(env));
+            }
+        }
+#endif
+    } else if (mode == CKLOOP_PTHREAD) {
+        int numThreads = numHelpers-1;
+        curLoop = pthdLoop;
+        curLoop->set(numChunks, func, lowerRange, upperRange, paramNum, param);
+        int numNotices = numThreads;
+        if (useTreeBcast) {
+            numNotices = TREE_BCAST_BRANCH>=numThreads?numThreads:TREE_BCAST_BRANCH;
+        }
+        for (int i=0; i<numNotices; i++) {
+            pthread_mutex_lock(allLocks[i]);
+            pthread_cond_signal(allConds[i]);
+            pthread_mutex_unlock(allLocks[i]);
+        }
+        //in this mode, it's always synced
+        sync = 1;
+    }
+
+    curLoop->stealWork();
+    TRACE_BRACKET(CKLOOP_TOTAL_WORK_EVENTID);
+
+    //printf("[%d]: parallelize func %p with [%d ~ %d] divided into %d chunks using loop=%p\n", CkMyPe(), func, lowerRange, upperRange, numChunks, curLoop);
+
+    TRACE_START(CKLOOP_FINISH_SIGNAL_EVENTID);
+    curLoop->waitLoopDone(sync);
+    TRACE_BRACKET(CKLOOP_FINISH_SIGNAL_EVENTID);
+
+    //printf("[%d]: finished parallelize func %p with [%d ~ %d] divided into %d chunks using loop=%p\n", CkMyPe(), func, lowerRange, upperRange, numChunks, curLoop);
+
+    if (type!=CKLOOP_NONE)
+        reduce(curLoop->getRedBufs(), redResult, type, numChunks);
+    return;
+}
+
+#define COMPUTE_REDUCTION(T) {\
+    for(int i=0; i<numChunks; i++) {\
+     result += *((T *)(redBufs[i])); \
+     /*CkPrintf("CkLoop Reduce: %d\n", result);*/ \
+    }\
+}
+
+void FuncCkLoop::reduce(void **redBufs, void *redBuf, REDUCTION_TYPE type, int numChunks) {
+    switch (type) {
+    case CKLOOP_INT_SUM: {
+        int result=0;
+        COMPUTE_REDUCTION(int)
+        *((int *)redBuf) = result;
+        break;
+    }
+    case CKLOOP_FLOAT_SUM: {
+        float result=0;
+        COMPUTE_REDUCTION(float)
+        *((float *)redBuf) = result;
+        break;
+    }
+    case CKLOOP_DOUBLE_SUM: {
+        double result=0;
+        COMPUTE_REDUCTION(double)
+        *((double *)redBuf) = result;
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+CpvStaticDeclare(int, NdhStealWorkHandler);
+static void RegisterCkLoopHdlrs() {
+    CpvInitialize(int, NdhStealWorkHandler);
+    CpvAccess(NdhStealWorkHandler) = CmiRegisterHandler((CmiHandler)SingleHelperStealWork);
+}
+
+extern int _charmHandlerIdx;
+FuncSingleHelper::FuncSingleHelper(int numHelpers) {
+    totalHelpers = numHelpers;
+#if USE_CONVERSE_NOTIFICATION
+    notifyMsgBufSize = TASK_BUFFER_SIZE;
+#else
+    notifyMsgBufSize = TASK_BUFFER_SIZE*totalHelpers;
+#endif
+
+    CmiAssert(globalCkLoop!=NULL);
+    thisCkLoop = globalCkLoop;
+
+    nextFreeNotifyMsg = 0;
+#if USE_CONVERSE_NOTIFICATION
+    notifyMsg = (ConverseNotifyMsg *)malloc(sizeof(ConverseNotifyMsg)*notifyMsgBufSize);
+    for (int i=0; i<notifyMsgBufSize; i++) {
+        ConverseNotifyMsg *tmp = notifyMsg+i;
+        if (thisCkLoop->useTreeBcast) {
+            tmp->srcRank = CmiMyRank();
+        } else {
+            tmp->srcRank = -1;
+        }
+        tmp->ptr = (void *)(new CurLoopInfo(FuncCkLoop::MAX_CHUNKS));
+        CmiSetHandler(tmp, CpvAccess(NdhStealWorkHandler));
+    }
+#else
+    nextFreeTaskBuffer = 0;
+    notifyMsg = (CharmNotifyMsg **)malloc(sizeof(CharmNotifyMsg *)*notifyMsgBufSize);
+    for (int i=0; i<notifyMsgBufSize; i++) {
+        CharmNotifyMsg *tmp = new(sizeof(int)*8)CharmNotifyMsg; //allow msg priority bits
+        notifyMsg[i] = tmp;
+        if (thisCkLoop->useTreeBcast) {
+            tmp->srcRank = CmiMyRank();
+        } else {
+            tmp->srcRank = -1;
+        }
+        tmp->ptr = NULL;
+        envelope *env = UsrToEnv(tmp);
+        env->setMsgtype(ForChareMsg);
+        env->setEpIdx(CkIndex_FuncSingleHelper::stealWork(NULL));
+        env->setSrcPe(CkMyPe());
+        CmiSetHandler(env, _charmHandlerIdx);
+        //env->setObjPtr has to be called when a notification msg is sent
+    }
+    taskBuffer = (CurLoopInfo **)malloc(sizeof(CurLoopInfo *)*TASK_BUFFER_SIZE);
+    for (int i=0; i<TASK_BUFFER_SIZE; i++) {
+        taskBuffer[i] = new CurLoopInfo(FuncCkLoop::MAX_CHUNKS);
+    }
+#endif
+    globalCkLoop->helperPtr[CkMyRank()] = this;
+}
+
+void FuncSingleHelper::stealWork(CharmNotifyMsg *msg) {
+#if !USE_CONVERSE_NOTIFICATION
+    int srcRank = msg->srcRank;
+    CurLoopInfo *loop = (CurLoopInfo *)msg->ptr;
+    if (srcRank >= 0) {
+        //means using tree-broadcast to send the notification msg
+        int relPE = CmiMyRank()-msg->srcRank;
+        if (relPE<0) relPE += CmiMyNodeSize();
+
+        //CmiPrintf("Rank[%d]: got msg from src %d with relPE %d\n", CmiMyRank(), msg->srcRank, relPE);
+        relPE=relPE*TREE_BCAST_BRANCH+1;
+        for (int i=0; i<TREE_BCAST_BRANCH; i++, relPE++) {
+            if (relPE >= CmiMyNodeSize()) break;
+            int pe = (relPE + msg->srcRank)%CmiMyNodeSize();
+            //CmiPrintf("Rank[%d]: send msg to dst %d (relPE: %d) from src %d\n", CmiMyRank(), pe, relPE, msg->srcRank);
+            CharmNotifyMsg *newone = getNotifyMsg();
+            newone->ptr = (void *)loop;
+            envelope *env = UsrToEnv(newone);
+            env->setObjPtr(thisCkLoop->helperPtr[pe]->ckGetChareID().objPtr);
+            CmiPushPE(pe, (void *)env);
+        }
+    }
+    loop->stealWork();
+#endif
+}
+
+void SingleHelperStealWork(ConverseNotifyMsg *msg) {
+    int srcRank = msg->srcRank;
+
+    if (srcRank >= 0) {
+        //means using tree-broadcast to send the notification msg
+
+        //int numHelpers = CmiMyNodeSize(); //the value of "numHelpers" should be obtained somewhere else
+        int relPE = CmiMyRank()-msg->srcRank;
+        if (relPE<0) relPE += CmiMyNodeSize();
+
+        //CmiPrintf("Rank[%d]: got msg from src %d with relPE %d\n", CmiMyRank(), msg->srcRank, relPE);
+        relPE=relPE*TREE_BCAST_BRANCH+1;
+        for (int i=0; i<TREE_BCAST_BRANCH; i++, relPE++) {
+            if (relPE >= CmiMyNodeSize()) break;
+            int pe = (relPE + msg->srcRank)%CmiMyNodeSize();
+            //CmiPrintf("Rank[%d]: send msg to dst %d (relPE: %d) from src %d\n", CmiMyRank(), pe, relPE, msg->srcRank);
+            CmiPushPE(pe, (void *)msg);
+        }
+    }
+    CurLoopInfo *loop = (CurLoopInfo *)msg->ptr;
+    loop->stealWork();
+}
+
+void CurLoopInfo::stealWork() {
+    //indicate the current work hasn't been initialized
+    //or the old work has finished.
+    if (inited == 0) return;
+
+    int first, last;
+    int unit = (upperIndex-lowerIndex+1)/numChunks;
+    int remainder = (upperIndex-lowerIndex+1)-unit*numChunks;
+    int markIdx = remainder*(unit+1);
+
+    int nextChunkId = getNextChunkIdx();
+    int execTimes = 0;
+    while (nextChunkId < numChunks) {
+        if (nextChunkId < remainder) {
+            first = lowerIndex+(unit+1)*nextChunkId;
+            last = first+unit;
+        } else {
+            first = lowerIndex+(nextChunkId - remainder)*unit + markIdx;
+            last = first+unit-1;
+        }
+
+        fnPtr(first, last, redBufs[nextChunkId], paramNum, param);
+        execTimes++;
+        nextChunkId = getNextChunkIdx();
+    }
+    reportFinished(execTimes);
+}
+
+//======================================================================//
+//   End of functions related with FuncSingleHelper                     //
+//======================================================================//
+
+CProxy_FuncCkLoop CkLoop_Init(int mode, int numThreads) {
+    if (mode == CKLOOP_USECHARM) {
+#if USE_CONVERSE_NOTIFICATION
+        CkPrintf("CkLoopLib is used in SMP with a simple dynamic scheduling (converse-level notification) but not using node-level queue\n");
+#else
+        CkPrintf("CkLoopLib is used in SMP with a simple dynamic scheduling (charm-level notifiation) but not using node-level queue\n");
+#endif
+    } else if (mode==CKLOOP_PTHREAD) {
+        CkPrintf("CkLoopLib is used with extra %d pthreads via a simple dynamic scheduling\n", numThreads);
+        CmiAssert(numThreads>0);
+    }
+    return CProxy_FuncCkLoop::ckNew(mode, numThreads);
+}
+
+void CkLoop_Exit(CProxy_FuncCkLoop ckLoop) {
+    ckLoop.exit();
+}
+
+void CkLoop_Parallelize(CProxy_FuncCkLoop ckLoop, HelperFn func,
+                            int paramNum, void * param,
+                            int numChunks, int lowerRange, int upperRange,
+                            int sync,
+                            void *redResult, REDUCTION_TYPE type) {
+    ckLoop[CkMyNode()].ckLocalBranch()->parallelizeFunc(func, paramNum, param, numChunks, lowerRange, upperRange, sync, redResult, type);
+}
+
+void CkLoop_Parallelize(HelperFn func,
+                            int paramNum, void * param,
+                            int numChunks, int lowerRange, int upperRange,
+                            int sync,
+                            void *redResult, REDUCTION_TYPE type) {
+    globalCkLoop->parallelizeFunc(func, paramNum, param, numChunks, lowerRange, upperRange, sync, redResult, type);
+}
+#include "CkLoop.def.h"
diff --git a/src/libs/ck-libs/ckloop/CkLoop.ci b/src/libs/ck-libs/ckloop/CkLoop.ci
new file mode 100644 (file)
index 0000000..605aa7e
--- /dev/null
@@ -0,0 +1,14 @@
+module CkLoop{
+    initproc void RegisterCkLoopHdlrs(void);
+
+    message CharmNotifyMsg;
+
+       nodegroup FuncCkLoop{
+               entry FuncCkLoop(int mode, int numThreads);
+               entry void exit();
+       };
+       chare FuncSingleHelper{
+               entry FuncSingleHelper(int numHelpers);
+               entry void stealWork(CharmNotifyMsg *msg);
+       };
+};
diff --git a/src/libs/ck-libs/ckloop/CkLoop.h b/src/libs/ck-libs/ckloop/CkLoop.h
new file mode 100644 (file)
index 0000000..cbeadff
--- /dev/null
@@ -0,0 +1,235 @@
+#ifndef _CKLOOP_H
+#define _CKLOOP_H
+#include <assert.h>
+
+#include "charm++.h"
+#include "CkLoopAPI.h"
+
+#define USE_TREE_BROADCAST_THRESHOLD 8
+#define TREE_BCAST_BRANCH (4)
+#define CACHE_LINE_SIZE 64
+/* 1. Using converse-level msg, then the msg is always of highest priority.
+ * And the notification msg comes from the singlehelper where the loop parallelization
+ * is initiated.
+ *
+ * 2. Using charm-level msg, then the msg could be set with different priorities.
+ * However, the notification msg comes from the singlehelper where the parallelized
+ * loop is executed.
+ *
+ * */
+#define USE_CONVERSE_NOTIFICATION 1
+
+class FuncSingleHelper;
+
+class CurLoopInfo {
+    friend class FuncSingleHelper;
+
+private:
+    volatile int curChunkIdx;
+    int numChunks;
+    HelperFn fnPtr;
+    int lowerIndex;
+    int upperIndex;
+    int paramNum;
+    void *param;
+    //limitation: only allow single variable reduction of size numChunks!!!
+    void **redBufs;
+    char *bufSpace;
+
+    volatile int finishFlag;
+
+    //a tag to indicate whether the task for this new loop has been inited
+    //this tag is needed to prevent other helpers to run the old task
+    int inited;
+
+public:
+    CurLoopInfo(int maxChunks):numChunks(0),fnPtr(NULL), lowerIndex(-1), upperIndex(0),
+            paramNum(0), param(NULL), curChunkIdx(-1), finishFlag(0), redBufs(NULL), bufSpace(NULL), inited(0) {
+        redBufs = new void *[maxChunks];
+        bufSpace = new char[maxChunks * CACHE_LINE_SIZE];
+        for (int i=0; i<maxChunks; i++) redBufs[i] = (void *)(bufSpace+i*CACHE_LINE_SIZE);
+    }
+
+    ~CurLoopInfo() {
+        delete [] redBufs;
+        delete [] bufSpace;
+    }
+
+    void set(int nc, HelperFn f, int lIdx, int uIdx, int numParams, void *p) {        /*
+      * WARNING: there's a rare data-racing case here. The current loop is
+      * about to finish (just before setting inited to 0; A helper (say B)
+      * just enters the stealWork and passes the inited check. The helper
+      * (say A) is very fast, and starts the next loop, and happens enter
+      * into the middle of this function. Then helper B will face corrupted
+      * task info as it is trying to execute the old loop task!
+      * In reality for user cases, this case happens very rarely!! -Chao Mei
+      */
+        numChunks = nc;
+        fnPtr = f;
+        lowerIndex = lIdx;
+        upperIndex = uIdx;
+        paramNum = numParams;
+        param = p;
+        curChunkIdx = -1;
+        finishFlag = 0;
+        //needs to be set last
+        inited = 1;
+    }
+
+    void waitLoopDone(int sync) {
+        //while(!__sync_bool_compare_and_swap(&finishFlag, numChunks, 0));
+        if (sync) while (finishFlag!=numChunks);
+        //finishFlag = 0;
+        inited = 0;
+    }
+    int getNextChunkIdx() {
+        return __sync_add_and_fetch(&curChunkIdx, 1);
+    }
+    void reportFinished(int counter) {
+        if (counter==0) return;
+        __sync_add_and_fetch(&finishFlag, counter);
+    }
+
+    int isFree() {
+        return finishFlag == numChunks;
+    }
+
+    void **getRedBufs() {
+        return redBufs;
+    }
+
+    void stealWork();
+};
+
+/* FuncCkLoop is a nodegroup object */
+
+typedef struct converseNotifyMsg {
+    char core[CmiMsgHeaderSizeBytes];
+    int srcRank;
+    void *ptr;
+} ConverseNotifyMsg;
+
+class CharmNotifyMsg: public CMessage_CharmNotifyMsg {
+public:
+    int srcRank;
+    void *ptr; //the loop info
+};
+
+class FuncCkLoop : public CBase_FuncCkLoop {
+    friend class FuncSingleHelper;
+
+public:
+    static int MAX_CHUNKS;
+private:
+    int mode;
+
+    int numHelpers; //in pthread mode, the counter includes itself
+    FuncSingleHelper **helperPtr; /* ptrs to the FuncSingleHelpers it manages */
+    int useTreeBcast;
+
+public:
+    FuncCkLoop(int mode_, int numThreads_);
+    ~FuncCkLoop() {
+        delete [] helperPtr;
+    }
+
+    void createPThreads();
+    void exit();
+
+    int getNumHelpers() {
+        return numHelpers;
+    }
+    int needTreeBcast() {
+        return useTreeBcast;
+    }
+
+    void parallelizeFunc(HelperFn func, /* the function that finishes a partial work on another thread */
+                         int paramNum, void * param, /* the input parameters for the above func */
+                         int numChunks, /* number of chunks to be partitioned */
+                         int lowerRange, int upperRange, /* the loop-like parallelization happens in [lowerRange, upperRange] */
+                         int sync=1, /* whether the flow will continue until all chunks have finished */
+                         void *redResult=NULL, REDUCTION_TYPE type=CKLOOP_NONE /* the reduction result, ONLY SUPPORT SINGLE VAR of TYPE int/float/double */
+                        );
+    void reduce(void **redBufs, void *redBuf, REDUCTION_TYPE type, int numChunks);
+};
+
+void SingleHelperStealWork(ConverseNotifyMsg *msg);
+
+/* FuncSingleHelper is a chare located on every core of a node */
+//allowing arbitrary combination of sync and unsync parallelizd loops
+#define TASK_BUFFER_SIZE (3)
+class FuncSingleHelper: public CBase_FuncSingleHelper {
+    friend class FuncCkLoop;
+private:
+    int totalHelpers;
+    int notifyMsgBufSize;
+
+    FuncCkLoop *thisCkLoop;
+#if USE_CONVERSE_NOTIFICATION
+    //this msg is shared across all SingleHelpers
+    ConverseNotifyMsg *notifyMsg;
+#else
+    //acted as a msg buffer for charm-level notification msgs sent to other
+    //SingleHelpers. At each sending,
+    //1. the msg destination chare (SingleHelper) has to be set.
+    //2. the associated loop info has to be set.
+    CharmNotifyMsg **notifyMsg;
+    CurLoopInfo **taskBuffer;
+    int nextFreeTaskBuffer;
+#endif
+    int nextFreeNotifyMsg;
+
+public:
+    FuncSingleHelper(int numHelpers);
+
+    ~FuncSingleHelper() {
+#if USE_CONVERSE_NOTIFICATION
+        for (int i=0; i<notifyMsgBufSize; i++) {
+            ConverseNotifyMsg *tmp = notifyMsg+i;
+            CurLoopInfo *loop = (CurLoopInfo *)(tmp->ptr);
+            delete loop;
+        }
+        free(notifyMsg);
+#else
+        for (int i=0; i<notifyMsgBufSize; i++) delete notifyMsg[i];
+        free(notifyMsg);
+        for (int i=0; i<TASK_BUFFER_SIZE; i++) delete taskBuffer[i];
+        free(taskBuffer);
+#endif
+    }
+#if USE_CONVERSE_NOTIFICATION
+    ConverseNotifyMsg *getNotifyMsg() {
+        while (1) {
+            ConverseNotifyMsg *cur = notifyMsg+nextFreeNotifyMsg;
+            CurLoopInfo *loop = (CurLoopInfo *)(cur->ptr);
+            nextFreeNotifyMsg = (nextFreeNotifyMsg+1)%notifyMsgBufSize;
+            if (loop->isFree()) return cur;
+        }
+        return NULL;
+    }
+#else
+    CharmNotifyMsg *getNotifyMsg() {
+        while (1) {
+            CharmNotifyMsg *cur = notifyMsg[nextFreeNotifyMsg];
+            CurLoopInfo *loop = (CurLoopInfo *)(cur->ptr);
+            nextFreeNotifyMsg = (nextFreeNotifyMsg+1)%notifyMsgBufSize;
+            if (loop==NULL || loop->isFree()) return cur;
+        }
+        return NULL;
+    }
+    CurLoopInfo *getNewTask() {
+        while (1) {
+            CurLoopInfo *cur = taskBuffer[nextFreeTaskBuffer];
+            nextFreeTaskBuffer = (nextFreeTaskBuffer+1)%TASK_BUFFER_SIZE;
+            if (cur->isFree()) return cur;
+        }
+        return NULL;
+    }
+#endif
+
+    void stealWork(CharmNotifyMsg *msg);
+
+    FuncSingleHelper(CkMigrateMessage *m) {}
+};
+
+#endif
diff --git a/src/libs/ck-libs/ckloop/CkLoopAPI.h b/src/libs/ck-libs/ckloop/CkLoopAPI.h
new file mode 100644 (file)
index 0000000..4538218
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef _CKLOOPAPI_H
+#define _CKLOOPAPI_H
+
+#include "CkLoop.decl.h"
+
+/* "result" is the buffer for reduction result on a single simple-type variable */
+typedef void (*HelperFn)(int first,int last, void *result, int paramNum, void *param);
+
+typedef enum REDUCTION_TYPE {
+    CKLOOP_NONE=0,
+    CKLOOP_INT_SUM,
+    CKLOOP_FLOAT_SUM,
+    CKLOOP_DOUBLE_SUM
+} REDUCTION_TYPE;
+
+#define CKLOOP_USECHARM 1
+#define CKLOOP_PTHREAD 2
+
+class CProxy_FuncCkLoop;
+/*
+ * The default mode is intended to be used in SMP mode
+ * The next mode that uses pthread is intended to be used in a restricted mode where
+ * a node just have one charm PE!
+ **/
+extern CProxy_FuncCkLoop CkLoop_Init(int mode=CKLOOP_USECHARM, int numThreads=0);
+
+extern void CkLoop_Exit(CProxy_FuncCkLoop ckLoop); /* used to free resources if using pthread mode. It should be called on just one PE, say PE 0 */
+
+extern void CkLoop_Parallelize(
+    CProxy_FuncCkLoop ckLoop, /* the proxy to the FuncCkLoop instance */
+    HelperFn func, /* the function that finishes a partial work on another thread */
+    int paramNum, void * param, /* the input parameters for the above func */
+    int numChunks, /* number of chunks to be partitioned */
+    int lowerRange, int upperRange, /* the loop-like parallelization happens in [lowerRange, upperRange] */
+    int sync=1, /* whether the flow will continue unless all chunks have finished */
+    void *redResult=NULL, REDUCTION_TYPE type=CKLOOP_NONE /* the reduction result, ONLY SUPPORT SINGLE VAR of TYPE int/float/double */
+);
+extern void CkLoop_Parallelize(
+    HelperFn func, /* the function that finishes a partial work on another thread */
+    int paramNum, void * param, /* the input parameters for the above func */
+    int numChunks, /* number of chunks to be partitioned */
+    int lowerRange, int upperRange, /* the loop-like parallelization happens in [lowerRange, upperRange] */
+    int sync=1, /* whether the flow will continue unless all chunks have finished */
+    void *redResult=NULL, REDUCTION_TYPE type=CKLOOP_NONE /* the reduction result, ONLY SUPPORT SINGLE VAR of TYPE int/float/double */
+);
+#endif
diff --git a/src/libs/ck-libs/ckloop/Make.depends b/src/libs/ck-libs/ckloop/Make.depends
new file mode 100644 (file)
index 0000000..d20248d
--- /dev/null
@@ -0,0 +1,42 @@
+#generated by make depends
+CkLoop.o: CkLoop.C CkLoop.h ../../../../tmp/charm++.h \
+ ../../../../tmp/charm.h ../../../../tmp/converse.h \
+ ../../../../tmp/conv-config.h ../../../../tmp/conv-autoconfig.h \
+ ../../../../tmp/conv-common.h ../../../../tmp/conv-mach.h \
+ ../../../../tmp/conv-mach-opt.h ../../../../tmp/pup_c.h \
+ ../../../../tmp/queueing.h ../../../../tmp/conv-cpm.h \
+ ../../../../tmp/conv-cpath.h ../../../../tmp/conv-qd.h \
+ ../../../../tmp/conv-random.h ../../../../tmp/conv-lists.h \
+ ../../../../tmp/conv-trace.h ../../../../tmp/persistent.h \
+ ../../../../tmp/debug-conv.h ../../../../tmp/pup.h \
+ ../../../../tmp/middle.h ../../../../tmp/middle-conv.h \
+ ../../../../tmp/cklists.h ../../../../tmp/ckbitvector.h \
+ ../../../../tmp/ckstream.h ../../../../tmp/init.h \
+ ../../../../tmp/ckhashtable.h ../../../../tmp/debug-charm.h \
+ ../../../../tmp/debug-conv++.h ../../../../tmp/simd.h \
+ ../../../../tmp/CkMarshall.decl.h ../../../../tmp/charm++.h \
+ ../../../../tmp/ckarrayindex.h ../../../../tmp/cksection.h \
+ ../../../../tmp/ckcallback.h ../../../../tmp/conv-ccs.h \
+ ../../../../tmp/sockRoutines.h ../../../../tmp/ccs-server.h \
+ ../../../../tmp/ckobjQ.h ../../../../tmp/ckreduction.h \
+ ../../../../tmp/CkReduction.decl.h \
+ ../../../../tmp/CkArrayReductionMgr.decl.h \
+ ../../../../tmp/ckmemcheckpoint.h ../../../../tmp/CkMemCheckpoint.decl.h \
+ ../../../../tmp/readonly.h ../../../../tmp/ckarray.h \
+ ../../../../tmp/cklocation.h ../../../../tmp/LBDatabase.h \
+ ../../../../tmp/lbdb.h ../../../../tmp/LBDBManager.h \
+ ../../../../tmp/LBObj.h ../../../../tmp/LBOM.h ../../../../tmp/LBComm.h \
+ ../../../../tmp/LBMachineUtil.h ../../../../tmp/lbdb++.h \
+ ../../../../tmp/LBDatabase.decl.h ../../../../tmp/NullLB.decl.h \
+ ../../../../tmp/BaseLB.decl.h ../../../../tmp/CkLocation.decl.h \
+ ../../../../tmp/CkArray.decl.h ../../../../tmp/ckfutures.h \
+ ../../../../tmp/CkFutures.decl.h ../../../../tmp/charisma.h \
+ ../../../../tmp/charisma.decl.h ../../../../tmp/tempo.h \
+ ../../../../tmp/tempo.decl.h ../../../../tmp/waitqd.h \
+ ../../../../tmp/waitqd.decl.h ../../../../tmp/sdag.h \
+ ../../../../tmp/ckcheckpoint.h ../../../../tmp/CkCheckpoint.decl.h \
+ ../../../../tmp/ckevacuation.h ../../../../tmp/ckarrayreductionmgr.h \
+ ../../../../tmp/trace.h ../../../../tmp/trace-bluegene.h \
+ ../../../../tmp/envelope.h CkLoopAPI.h CkLoop.decl.h \
+ CkLoop.def.h
+       $(CHARMC) -I../../../../tmp -o CkLoop.o CkLoop.C
diff --git a/src/libs/ck-libs/ckloop/Makefile b/src/libs/ck-libs/ckloop/Makefile
new file mode 100644 (file)
index 0000000..6bb9562
--- /dev/null
@@ -0,0 +1,44 @@
+override OPTS += -lpthread
+CDIR=../../../..
+CHARMC=$(CDIR)/bin/charmc $(OPTS)
+
+MODULE=CkLoop
+LIB = $(CDIR)/lib/libmodule$(MODULE).a
+LIBOBJ = CkLoop.o
+
+CIFILES = CkLoop.ci
+HEADERS = CkLoopAPI.h $(MODULE).decl.h $(MODULE).def.h
+
+all: $(LIB)  headers
+
+$(LIB): $(LIBOBJ)
+       $(CHARMC) -o $(LIB) $(LIBOBJ) 
+
+headers: $(HEADERS)
+       cp $(HEADERS) $(CDIR)/include/
+
+$(MODULE).def.h: $(MODULE).decl.h
+
+$(MODULE).decl.h: CkLoop.ci
+       $(CHARMC) -c $<
+
+clean:
+       rm -f *.o *.decl.h *.def.h $(LIB) headers
+
+include Make.depends
+
+DEPENDFILE = Make.depends
+
+depends:  $(CIFILES) CkLoop.def.h
+       echo "Creating " $(DEPENDFILE) " ...";  \
+       if [ -f $(DEPENDFILE) ]; then \
+           /bin/cp -f $(DEPENDFILE) $(DEPENDFILE).old; \
+        fi; \
+       echo '#generated by make depends' > $(DEPENDFILE); \
+        for i in $(LIBOBJ) ; do \
+             SRCFILE=`basename $$i .o`.C ; \
+              echo "checking dependencies for $$i : $$SRCFILE" ; \
+              g++ -MM -Wno-deprecated -I$(CDIR)/tmp $$SRCFILE >> $(DEPENDFILE); \
+              echo '   $$(CHARMC) -I$(CDIR)/tmp -o '$$i $$SRCFILE >> $(DEPENDFILE) ; \
+        done; 
+