a few examples from the pgms/charm++ tree
author Eric Bohm <ebohm@illinois.edu>
Tue, 7 Sep 2004 21:07:16 +0000 (21:07 +0000)
committer Eric Bohm <ebohm@illinois.edu>
Tue, 7 Sep 2004 21:07:16 +0000 (21:07 +0000)
32 files changed:
examples/multiphaseSharedArrays/Makefile_common [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/mm_sequential.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/nepp.h [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/params.h [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/run.sh [new file with mode: 0755]
examples/multiphaseSharedArrays/matmul/sequential/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/maxflops.c [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/mm1.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/mm1mkn.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/mm2.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/mm2mkn.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/sequential/params.h [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/t2d.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/t2d.ci [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/test.C [new file with mode: 0644]
examples/multiphaseSharedArrays/matmul/test.ci [new file with mode: 0644]
examples/multiphaseSharedArrays/moldyn/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/moldyn/moldyn.C [new file with mode: 0644]
examples/multiphaseSharedArrays/moldyn/moldyn.ci [new file with mode: 0644]
examples/multiphaseSharedArrays/moldyn/nepp.h [new file with mode: 0644]
examples/multiphaseSharedArrays/moldyn/params.h [new file with mode: 0644]
examples/multiphaseSharedArrays/simpleTestVarsize/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/simpleTestVarsize/params.h [new file with mode: 0644]
examples/multiphaseSharedArrays/simpleTestVarsize/t3.C [new file with mode: 0644]
examples/multiphaseSharedArrays/simpleTestVarsize/t3.ci [new file with mode: 0644]
examples/multiphaseSharedArrays/simpleTestVarsize/testV.C [new file with mode: 0644]
examples/multiphaseSharedArrays/simpletest/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/simpletest/params.h [new file with mode: 0644]
examples/multiphaseSharedArrays/simpletest/run.sh [new file with mode: 0755]
examples/multiphaseSharedArrays/simpletest/t3.C [new file with mode: 0644]
examples/multiphaseSharedArrays/simpletest/t3.ci [new file with mode: 0644]

diff --git a/examples/multiphaseSharedArrays/Makefile_common b/examples/multiphaseSharedArrays/Makefile_common
new file mode 100644 (file)
index 0000000..4d73d46
--- /dev/null
@@ -0,0 +1,86 @@
+# emacs mode line -*- mode: makefile -*-
+# Shared build rules for the multiphaseSharedArrays examples.
+# needs $(PGM) (base name of the program to build); $(HEADERS) may list
+# extra headers the program depends on.
+
+OPTS=
+CDIR=../../../..
+CHARMC=$(CDIR)/bin/charmc -language charm++ $(OPTS)
+
+# These names are commands, not files to be built.
+.PHONY: all proj para opt g gprof clean c
+
+# Rules to convert .ci to .decl.h and .def.h
+.SUFFIXES:
+.SUFFIXES: .o .C .def.h .decl.h .ci .h
+
+.ci.decl.h:
+       $(CHARMC) -c $<
+
+# No-op recipe: presumably charmc writes the .def.h in the same run that
+# produces the .decl.h, so there is nothing left to do here.
+.decl.h.def.h:
+       @true
+
+all : opt
+
+$(PGM) : $(PGM).o
+       $(CHARMC) -o $(PGM) -module msa $(PGM).o -lm
+
+# Stamp file: newer than every generated and hand-written header.
+headers: $(PGM).decl.h $(PGM).def.h $(HEADERS)
+       touch headers
+
+$(PGM).o : $(PGM).ci $(PGM).C headers
+       $(CHARMC) -c $(PGM).C
+
+# Build variants.  Each re-invokes make with a different OPTS; $(MAKE) is
+# used (rather than a literal "make") so that flags such as -n and -j are
+# passed on to the sub-make.
+
+# Projections tracing build (note: does not append the caller's OPTS).
+proj:
+       $(MAKE) OPTS="-tracemode projections -O3" $(PGM)
+
+para:
+       $(MAKE) OPTS="-memory paranoid -g $(OPTS)" $(PGM)
+
+opt:
+       $(MAKE) OPTS="-O3 $(OPTS)" $(PGM)
+
+g:
+       $(MAKE) OPTS="-g $(OPTS)" $(PGM)
+
+gprof:
+       $(MAKE) OPTS="-pg $(OPTS)" $(PGM)
+
+clean:
+       rm -f *.o *.decl.h *.def.h charmrun test $(PGM) headers
+
+# Shorthand for clean.
+c: clean
+# Run every worker-count test below.  Each recipe line launches $(PGM)
+# through ./charmrun; the +pN flag varies from 1 to 4 and the trailing
+# number is the worker-thread count passed to the program.
+tests: w2 w3 w4
+
+# test with two worker threads
+w2: opt
+       @echo ================================================================
+       ./charmrun +p1 $(PGM) 2
+       @echo ================================================================
+       ./charmrun +p2 $(PGM) 2
+       @echo ================================================================
+       ./charmrun +p3 $(PGM) 2
+       @echo ================================================================
+       ./charmrun +p4 $(PGM) 2
+
+# test with three worker threads
+w3: opt
+       @echo ================================================================
+       ./charmrun +p1 $(PGM) 3
+       @echo ================================================================
+       ./charmrun +p2 $(PGM) 3
+       @echo ================================================================
+       ./charmrun +p3 $(PGM) 3
+       @echo ================================================================
+       ./charmrun +p4 $(PGM) 3
+
+# test with four worker threads on +p1..+p4, then 8 and 32 worker threads on +p4
+w4: opt
+       @echo ================================================================
+       ./charmrun +p1 $(PGM) 4
+       @echo ================================================================
+       ./charmrun +p2 $(PGM) 4
+       @echo ================================================================
+       ./charmrun +p3 $(PGM) 4
+       @echo ================================================================
+       ./charmrun +p4 $(PGM) 4
+       @echo ================================================================
+       ./charmrun +p4 $(PGM) 8
+       @echo ================================================================
+       ./charmrun +p4 $(PGM) 32
diff --git a/examples/multiphaseSharedArrays/matmul/Makefile b/examples/multiphaseSharedArrays/matmul/Makefile
new file mode 100644 (file)
index 0000000..bafb717
--- /dev/null
@@ -0,0 +1,107 @@
+
+# Inputs for ../Makefile_common: extra headers the program depends on and
+# the base name of the program to build.
+HEADERS=nepp.h params.h 
+PGM=t2d
+#PGM=matmul2D
+include ../Makefile_common
+
+# Disabled rule for building the separate "test" program:
+# test : test.C test.ci Makefile
+#      rm -f test
+#      $(CHARMC) test.ci
+#      $(CHARMC) -o test -module msa test.C -lm
+
+# Timed and machine-specific runs.  The t2d constructor reads its
+# positional arguments in this order:
+#   NUM_WORKERS bytes ROWS1 COLS1 COLS2 DECOMPOSITION detailedTimings
+
+# Time decomposition 1 and then decomposition 4 with one worker.
+tt: opt
+       time ./charmrun ++nodegroup one +p1 t2d 1 200000000 2000 5000 300 1 1
+       time ./charmrun ++nodegroup one +p1 t2d 1 200000000 2000 5000 300 4 1
+
+# Run "w" first, then time decomposition 4 with one worker.
+t: opt
+       w
+       time ./charmrun ++nodegroup one +p1 t2d 1 200000000 2000 5000 300 4 1
+
+# Runs launched through prun (presumably for the lemieux machine -- confirm).
+lem: opt
+       prun -n1 ./t2d 1 200000000 2000 5000 300 4 1
+       prun -n1 ./t2d 1 200000000 2000 5000 300 1 1
+       prun -n2 ./t2d 2 200000000 2000 5000 300 4 1
+       prun -n4 ./t2d 4 200000000 2000 5000 300 4 1
+       prun -n1 ./t2d 4 200000000 2000 5000 300 4 1
+       prun -n2 ./t2d 8 200000000 2000 5000 300 4 1
+       prun -n4 ./t2d 16 200000000 2000 5000 300 4 1
+
+# Runs that first execute $(HOME)/gennodelist.csh (presumably to generate
+# the charmrun node list -- confirm).
+tung: opt
+       $(HOME)/gennodelist.csh
+       ./charmrun +p1 t2d 1 200000000 2000 5000 300 4 1
+       ./charmrun +p2 t2d 2 200000000 2000 5000 300 4 1
+       ./charmrun +p1 t2d 4 200000000 2000 5000 300 4 1
+       ./charmrun +p2 t2d 8 200000000 2000 5000 300 4 1
+
+# The next three targets have no "opt" prerequisite: they run whatever
+# t2d binary is already present.
+tung832:
+       ./charmrun +p8 t2d 32 200000000 2000 5000 300 4 1
+
+tung816:
+       ./charmrun +p8 t2d 16 200000000 2000 5000 300 4 1
+
+t1:
+       #make OPTS="-DOLD" c opt
+       w
+       time ./charmrun ++nodegroup one +p1 t2d 1 200000000 2000 5000 300 1 1
+
+# quick test on 1 cpu
+# (arguments: NUM_WORKERS bytes ROWS1 COLS1 COLS2 DECOMPOSITION; the last
+# one steps through decompositions 1..6)
+test: opt
+       ./charmrun +p1 $(PGM) 2 1048576 100 500 100 1
+       ./charmrun +p1 $(PGM) 4 1048576 100 500 100 2
+       ./charmrun +p1 $(PGM) 8 1048576 100 500 100 3
+       ./charmrun +p1 $(PGM) 2 1048576 100 500 100 4
+       ./charmrun +p1 $(PGM) 4 1048576 100 500 100 5
+       ./charmrun +p1 $(PGM) 8 1048576 100 500 100 6
+
+# quick test on 2 cpus
+test2: opt
+       ./charmrun +p2 $(PGM) 2 1048576 100 500 100 1
+       ./charmrun +p2 $(PGM) 4 1048576 100 500 100 2
+       ./charmrun +p2 $(PGM) 8 1048576 100 500 100 3
+       ./charmrun +p2 $(PGM) 4 1048576 100 500 100 4
+       ./charmrun +p2 $(PGM) 16 1048576 100 500 100 5
+       ./charmrun +p2 $(PGM) 64 1048576 100 500 100 6
+
+# big test: 2000x5000 times 5000x300 with one worker and the default
+# decomposition
+bt: opt
+       time ./charmrun +p1 $(PGM) 1 200000000 2000 5000 300
+#      time ./charmrun +p1 t2d 1 200000000 2000 5000 300 1 1
+#opteron       time ./charmrun ++local +p1 t2d 1 200000000 2000 5000 300 4 1
+
+# big test with DECOMPOSITION=2
+bt2d: opt
+       time ./charmrun +p1 $(PGM) 1 200000000 2000 5000 300 2
+
+# big test with DECOMPOSITION=3
+bt3d: opt
+       time ./charmrun +p1 $(PGM) 1 200000000 2000 5000 300 3
+
+# big test with DECOMPOSITION=4 and detailed timings; no "opt"
+# prerequisite, so it runs the existing binary
+bt4:
+       time ./charmrun +p1 $(PGM) 1 200000000 2000 5000 300 4 1
+
+# Build the sequential reference program with its default (small) algorithm.
+seq:
+       g++ -o mm_sequential -O3 mm_sequential.C
+
+# icc builds of the four BIGGER_ALGO variants (global vs. malloc'ed arrays,
+# with and without initialisation).  The source file is mm_sequential.C;
+# the lower-case ".c" spelling used here before names a file that does not
+# exist in this directory.
+seqs-icc:
+       icc -no_cpprt -DBIGGER_ALGO -DGLOBAL_VAR_VERSION -USKIP_INIT -o mmgii -O3 mm_sequential.C
+       icc -no_cpprt -DBIGGER_ALGO -DGLOBAL_VAR_VERSION -DSKIP_INIT -o mmgsi -O3 mm_sequential.C
+       icc -no_cpprt -DBIGGER_ALGO -UGLOBAL_VAR_VERSION -USKIP_INIT -o mmmii -O3 mm_sequential.C
+       icc -no_cpprt -DBIGGER_ALGO -UGLOBAL_VAR_VERSION -DSKIP_INIT -o mmmsi -O3 mm_sequential.C
+
+# g++ builds of the four BIGGER_ALGO variants.  Output names encode the
+# flags: g = GLOBAL_VAR_VERSION, m = malloc'ed arrays, s = SKIP_INIT,
+# i = arrays initialised.
+seqs:
+       g++ -DBIGGER_ALGO -DGLOBAL_VAR_VERSION -DSKIP_INIT -o mmgs -O3 mm_sequential.C
+       g++ -DBIGGER_ALGO -DGLOBAL_VAR_VERSION -USKIP_INIT -o mmgi -O3 mm_sequential.C
+       g++ -DBIGGER_ALGO -UGLOBAL_VAR_VERSION -DSKIP_INIT -o mmms -O3 mm_sequential.C
+       g++ -DBIGGER_ALGO -UGLOBAL_VAR_VERSION -USKIP_INIT -o mmmi -O3 mm_sequential.C
+
+# Same global-array variants for the small (non-BIGGER_ALGO) algorithm.
+smallseqs:
+       g++ -UBIGGER_ALGO -DGLOBAL_VAR_VERSION -DSKIP_INIT -o mmgs_s -O3 mm_sequential.C
+       g++ -UBIGGER_ALGO -DGLOBAL_VAR_VERSION -USKIP_INIT -o mmgi_s -O3 mm_sequential.C
+
+# Time the binaries built by "seqs".
+runseqs:
+       time ./mmgs
+       time ./mmgi
+       time ./mmms
+       time ./mmmi
+
+# Time the binaries built by "smallseqs".
+runsmallseqs:
+       time ./mmgs_s
+       time ./mmgi_s
diff --git a/examples/multiphaseSharedArrays/matmul/mm_sequential.C b/examples/multiphaseSharedArrays/matmul/mm_sequential.C
new file mode 100644 (file)
index 0000000..2339141
--- /dev/null
@@ -0,0 +1,211 @@
+#include <sys/times.h>
+/**
+   The first algo compiles fine, but not on lemieux with g++ -O3 where
+   it generates an 880 MB a.out and does not run.  The second algo
+   compiles fine on lemieux also.
+
+   I ran both algos on skill and times were almost the same.
+ */
+
+// #define BIGGER_ALGO
+// #define GLOBAL_VAR_VERSION
+// #define SKIP_INIT
+
+// incomplete: returns the raw value of times(), i.e. a count of clock
+// ticks since an arbitrary point in the past, not seconds.
+double CmiWallTimer()
+{
+  struct tms temp;
+  // keep the value in clock_t; squeezing it through an int could truncate it
+  clock_t now = times(&temp);
+  return (double) now;
+}
+
+
+#ifdef BIGGER_ALGO
+
+#include <stdio.h>
+#include <stdlib.h>
+
+const unsigned int ROWS1=2000;
+const unsigned int COLS1=5000;
+const unsigned int ROWS2=COLS1;
+const unsigned int COLS2=300;
+
+// const unsigned int ROWS1=2048;
+// const unsigned int COLS1=2048;
+// const unsigned int ROWS2=COLS1;
+// const unsigned int COLS2=2048;
+
+// debugging
+// (true and false are C++ keywords, so no macros are needed for them; the
+// former "#define false 0;" / "#define true 1;" redefined keywords and
+// smuggled a stray semicolon into every use.)
+const int verbose = false;
+const int do_test = false;  // If true, tests results, etc.
+
+#ifdef GLOBAL_VAR_VERSION
+  // Statically sized matrices: the operands arr1 (ROWS1 x COLS1) and
+  // arr2 (ROWS2 x COLS2), and their product prod (ROWS1 x COLS2).
+  double arr1[ROWS1][COLS1];
+  double arr2[ROWS2][COLS2];
+  double prod[ROWS1][COLS2];
+
+  // Nothing to allocate in the global-array version.
+  void malloc_arrays(){}
+
+#else
+  double **arr1;
+  double **arr2;
+  double **prod;
+
+  //================================================================
+  // http://remus.rutgers.edu/cs314/s2004/decarlo/lec/twod.c
+  
+  /* Dynamically allocate a 2D "array" of doubles, that is rows X cols
+   * and accessed as arr[i][j] where i=0..rows-1, and j=0..cols-1
+   *
+   * It is stored as a 1D array of ROWS*COLS doubles, with an array
+   * of ROWS double pointers which refer to the appropriate places in this
+   * larger array
+   *
+   * Returns NULL if rows or cols is negative or if either allocation
+   * fails; in that case nothing is left allocated.  On success the caller
+   * owns two blocks: the returned pointer array and the data block that
+   * starts at the first row pointer.
+   */
+  double **make2DarrayFlat(int rows, int cols)
+  {
+      int i;
+      double **p, *base;
+      if (rows < 0 || cols < 0)
+          return NULL;
+      /* Allocate array of pointers and 1D block of doubles.  The sizes are
+       * computed in size_t so that rows * cols cannot overflow an int. */
+      p = (double **)malloc((size_t)rows * sizeof(double *));
+      base = (double *)malloc((size_t)rows * (size_t)cols * sizeof(double));
+      if (p == NULL || base == NULL) {
+          /* free(NULL) is a no-op, so this releases whichever one succeeded */
+          free(p);
+          free(base);
+          return NULL;
+      }
+      /* Set pointers for each row */
+      for (i = 0; i < rows; i++) {
+          p[i] = &base[(size_t)cols * i];
+      }
+      return p;
+  }
+  
+  //================================================================
+  
+  // Allocate the two operand matrices and the product matrix.  The later
+  // phases index all three unconditionally, so give up right away if any
+  // allocation failed.
+  void malloc_arrays()
+  {
+          arr1 = make2DarrayFlat(ROWS1, COLS1);
+          arr2 = make2DarrayFlat(ROWS2, COLS2);
+          prod = make2DarrayFlat(ROWS1, COLS2);
+          if (arr1 == NULL || arr2 == NULL || prod == NULL) {
+                  fprintf(stderr, "malloc_arrays: out of memory\n");
+                  exit(1);
+          }
+  }
+
+#endif
+
+// Set every element of both operand matrices (arr1 and arr2) to 1.0.
+// prod is not touched here.
+void init_arrays()
+{
+    unsigned int row, col;
+
+    for (row = 0; row < ROWS1; ++row) {
+        for (col = 0; col < COLS1; ++col) {
+            arr1[row][col] = 1.0;
+        }
+    }
+
+    for (row = 0; row < ROWS2; ++row) {
+        for (col = 0; col < COLS2; ++col) {
+            arr2[row][col] = 1.0;
+        }
+    }
+}
+
+// prod = arr1 * arr2.  The output column is the outermost loop, the output
+// row the middle one, and the inner-product index the innermost.
+void multiply()
+{
+    unsigned int row, col, inner;
+
+    if (verbose) {
+        printf("Multiplying\n");
+    }
+
+    for (col = 0; col < COLS2; ++col) {
+        for (row = 0; row < ROWS1; ++row) {
+            double sum = 0.0;
+            for (inner = 0; inner < ROWS2; ++inner) {
+                sum += arr1[row][inner] * arr2[inner][col];
+            }
+            prod[row][col] = sum;
+        }
+    }
+}
+
+// Check that every element of prod equals ROWS2, which is the expected
+// value when both operands are filled with 1.0.  Only the first mismatch
+// is reported; the scan stops there.
+void test_result()
+{
+    unsigned int i, j;
+    for(i = 0; i < ROWS1; i++)
+        for(j = 0; j < COLS2; j++)
+            if(prod[i][j] != 1.0*ROWS2)
+                {
+                    // i and j are unsigned, hence %u rather than %d
+                    printf("Element [%u][%u] inconsistent\n", i, j);
+                    return;
+                }
+}
+
+// Print which build variant this is and the matrix dimensions, then
+// allocate, optionally initialise, multiply and optionally verify.
+int main()
+{
+#ifdef GLOBAL_VAR_VERSION
+    printf("global_var ");
+#else
+    printf("malloc_version ");
+#endif
+#ifdef SKIP_INIT
+    printf("skip_init ");
+#else
+    printf("do_init ");
+#endif
+
+    // the dimensions are unsigned, hence %u rather than %d
+    printf("%u %u %u\n", ROWS1, COLS1, COLS2);
+
+    malloc_arrays();
+#ifndef SKIP_INIT
+    init_arrays();
+#endif
+    multiply();
+
+    if (do_test)
+        test_result();
+
+    return 0;
+}
+
+#else
+
+#include <stdio.h>
+
+#define M 2000
+#define K 5000
+#define N 300
+
+double A[M][K];
+double B[K][N];
+double C[M][N];
+
+// Small algorithm: C += A * B on statically sized global arrays, with the
+// loops ordered i (rows of A), j (columns of B), k (inner index).
+// C++ has no implicit int, so main is declared with its return type.
+int main()
+{
+  int i, j, k;
+
+  /*
+  // Check if values are auto-initialized to 0
+  unsigned int numNonZero = 0;
+  for (i=0; i<M; i++)
+    for (k=0; k<K; k++)
+      if (A[i][k] != 0)
+          numNonZero++;
+
+  for (k=0; k<K; k++)
+    for (j=0; j<N; j++)
+        if (B[k][j] != 0)
+            numNonZero++;
+  printf("Num non zero = %d\n", numNonZero);
+  */
+
+#ifndef SKIP_INIT
+  for (i=0; i<M; i++)
+    for (k=0; k<K; k++)
+      A[i][k] = 1.0;
+
+  for (k=0; k<K; k++)
+    for (j=0; j<N; j++)
+      B[k][j] = 1.0;
+#endif
+
+  // C is a global array and therefore starts out zeroed
+  for (i=0; i<M; i++)
+    for (j=0; j<N; j++)
+      for (k=0; k<K; k++)
+        C[i][j] += A[i][k] * B[k][j];
+
+  return 0;
+}
+#endif
diff --git a/examples/multiphaseSharedArrays/matmul/nepp.h b/examples/multiphaseSharedArrays/matmul/nepp.h
new file mode 100644 (file)
index 0000000..db13b4d
--- /dev/null
@@ -0,0 +1,5 @@
+// NEPP is the third template argument of the MSA2D typedefs used for the
+// two input matrices in t2d.C; NEPP_C plays the same role for the product
+// matrix.  Per the note at the top of t2d.C, NEPP should equal the inner
+// dimension (COLS1) for 1D and 2D runs and a subset of it for 3D runs.
+//#define NEPP 5000
+#define NEPP 5000
+#define NEPP_C 300
+//#define NEPP_C 5000
+
diff --git a/examples/multiphaseSharedArrays/matmul/params.h b/examples/multiphaseSharedArrays/matmul/params.h
new file mode 100644 (file)
index 0000000..8c63c43
--- /dev/null
@@ -0,0 +1,17 @@
+// Run-time parameters for t2d.  The non-const values are defaults that the
+// t2d constructor overwrites from the command line, in this order:
+//   NUM_WORKERS bytes ROWS1 COLS1 COLS2 DECOMPOSITION detailedTimings
+// (ROWS2 always follows COLS1).
+unsigned int NUM_WORKERS = 2;
+unsigned int bytes = 1024*1024;  // passed to every MSA2D constructor (max_bytes in the usage note)
+unsigned int ROWS1 = 100;
+unsigned int COLS1 = 500;
+unsigned int COLS2 = 100;
+unsigned int ROWS2 = COLS1;
+unsigned int DECOMPOSITION = 1; // 1D matmul is the default, i.e. i=subset of ROWS1
+// 4 = 1D stripmined
+bool detailedTimings = false;
+
+// false = run the version without prefetching
+const bool runPrefetchVersion=false;
+
+// debugging
+const bool verbose = false;
+const bool do_test = true;  // If true, tests results, etc.
+
diff --git a/examples/multiphaseSharedArrays/matmul/run.sh b/examples/multiphaseSharedArrays/matmul/run.sh
new file mode 100755 (executable)
index 0000000..7985521
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Shell script to test for multiple test cases
+#
+# For every parameter combination this regenerates params.h (overwriting
+# the checked-in copy), rebuilds t2d and appends its output to ./outputs.
+
+touch outputs
+for rows1 in 1000 5000 10000; do
+  for cols1 in 500 750 1000; do
+    for cols2 in 1000 5000 10000; do
+      for mbytes in 128 64 32 16 8 4 2 1; do
+        for num_workers in 1 2 4 8 16 32; do
+          rm -rf params.h
+          printf "unsigned int bytes = %d*1024*1024;\n" $mbytes >> params.h
+          printf "unsigned int ROWS1 = %d;\n" $rows1 >> params.h
+          printf "unsigned int COLS1 = %d;\n" $cols1 >> params.h
+          printf "unsigned int COLS2 = %d;\n" $cols2 >> params.h
+          printf "unsigned int ROWS2 = COLS1;\n" >> params.h
+          printf "unsigned int NUM_WORKERS = %d;\n" $num_workers >> params.h
+          # t2d.C also refers to the remaining settings of the checked-in
+          # params.h; without them the rebuild below cannot compile.
+          printf "unsigned int DECOMPOSITION = 1;\n" >> params.h
+          printf "bool detailedTimings = false;\n" >> params.h
+          printf "const bool runPrefetchVersion=false;\n" >> params.h
+          printf "const bool verbose = false;\n" >> params.h
+          printf "const bool do_test = true;\n" >> params.h
+          printf "\n" >> params.h
+
+          rm -f t2d
+          # stop instead of trying to run a binary that was not built
+          make OPTS=-O3 -s || exit 1
+          for num_pes in 4 8 16 32; do
+            ./charmrun t2d +p$num_pes >> outputs
+          done
+        done
+      done
+    done
+  done
+done
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/Makefile b/examples/multiphaseSharedArrays/matmul/sequential/Makefile
new file mode 100644 (file)
index 0000000..118bac8
--- /dev/null
@@ -0,0 +1,39 @@
+
+# Build and time the sequential matrix-multiply variants.
+# None of these target names is a file to be built.
+.PHONY: all t max small run big clean
+
+all: small run
+
+# Build and time just mm2mkn.
+t:
+       echo "are you on will?"
+       w
+       g++ -O3 -o mm2mkn mm2mkn.C
+       /usr/bin/time ./mm2mkn
+
+
+max:
+       # dont do -O3, the loop is compiled away
+       g++ -o maxflops maxflops.c
+       # 20 Gig float ops takes:
+       /usr/bin/time ./maxflops
+       # prowess 1.5 GF
+
+# Build all four variants with the default (small) matrix sizes.
+small:
+       g++ -O3 -o mm1 mm1.C
+       g++ -O3 -o mm1mkn mm1mkn.C
+       g++ -O3 -o mm2 mm2.C
+       g++ -O3 -o mm2mkn mm2mkn.C
+
+run:
+       /usr/bin/time ./mm1mkn
+       /usr/bin/time ./mm2mkn
+       /usr/bin/time ./mm1
+       /usr/bin/time ./mm2
+
+# Build all four variants with -DBIG and time the two malloc-based ones.
+big:
+       g++ -DBIG -O3 -o mm1 mm1.C
+       g++ -DBIG -O3 -o mm1mkn mm1mkn.C
+       g++ -DBIG -O3 -o mm2 mm2.C
+       g++ -DBIG -O3 -o mm2mkn mm2mkn.C
+       /usr/bin/time ./mm2mkn
+       /usr/bin/time ./mm2
+
+# -f so that cleaning an already clean directory is not an error; maxflops
+# is built by "max" and is removed as well.
+clean:
+       rm -f mm1 mm1mkn mm2 mm2mkn maxflops
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/maxflops.c b/examples/multiphaseSharedArrays/matmul/sequential/maxflops.c
new file mode 100644 (file)
index 0000000..7387e6e
--- /dev/null
@@ -0,0 +1,10 @@
+
+// Flop-rate probe: 10 x 1e9 iterations of one multiply and one add each,
+// i.e. 20 G floating-point operations in total.  The result is never used,
+// which is why the Makefile builds this without optimisation.
+// The Makefile compiles this file with g++, so main needs its return type.
+int main()
+{
+    register double result = 1.0;
+    // 10 times
+    for(register unsigned long int i=0; i<10; i++)
+        // 2 GF
+        for(register unsigned long int j=0; j<1000000000; j++)
+            result += result * 1.01;
+    return 0;
+}
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/mm1.C b/examples/multiphaseSharedArrays/matmul/sequential/mm1.C
new file mode 100644 (file)
index 0000000..93487f1
--- /dev/null
@@ -0,0 +1,23 @@
+#include "params.h"
+
+double A[M][K];
+double B[K][N];
+double C[M][N];
+
+// C += A * B on global arrays with loop order i, j, k (k innermost).
+// C++ has no implicit int, so main is declared with its return type.
+int main()
+{
+  int i, j, k;
+
+  for (i=0; i<M; i++)
+    for (k=0; k<K; k++)
+      A[i][k] = 1.0;
+
+  for (k=0; k<K; k++)
+    for (j=0; j<N; j++)
+      B[k][j] = 1.0;
+
+  // C is a global array and therefore starts out zeroed
+  for (i=0; i<M; i++)
+    for (j=0; j<N; j++)
+      for (k=0; k<K; k++)
+        C[i][j] += A[i][k] * B[k][j];
+
+  return 0;
+}
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/mm1mkn.C b/examples/multiphaseSharedArrays/matmul/sequential/mm1mkn.C
new file mode 100644 (file)
index 0000000..70c8fcf
--- /dev/null
@@ -0,0 +1,23 @@
+#include "params.h"
+
+double A[M][K];
+double B[K][N];
+double C[M][N];
+
+// C += A * B on global arrays with loop order i, k, j (j innermost).
+// C++ has no implicit int, so main is declared with its return type.
+int main()
+{
+  int i, j, k;
+
+  for (i=0; i<M; i++)
+    for (k=0; k<K; k++)
+      A[i][k] = 1.0;
+
+  for (k=0; k<K; k++)
+    for (j=0; j<N; j++)
+      B[k][j] = 1.0;
+
+  // C is a global array and therefore starts out zeroed
+  for (i=0; i<M; i++)
+    for (k=0; k<K; k++)
+      for (j=0; j<N; j++)
+        C[i][j] += A[i][k] * B[k][j];
+
+  return 0;
+}
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/mm2.C b/examples/multiphaseSharedArrays/matmul/sequential/mm2.C
new file mode 100644 (file)
index 0000000..1da4d00
--- /dev/null
@@ -0,0 +1,40 @@
+#include "params.h"
+#include <stdlib.h>
+
+/*
+  double A[M][K];
+  double B[K][N];
+  double C[M][N];
+*/
+
+// C += A * B with every matrix row malloc'ed separately; loop order
+// i, j, k (k innermost).  Returns 1 if an allocation fails, 0 otherwise.
+// The rows are never freed; the process exits right after the multiply.
+int main()
+{
+    double* A[M];
+    double* B[K];
+    double* C[M];
+    int i, j, k;
+
+    for (i=0; i<M; i++) {
+        A[i] = (double *) malloc(K*sizeof(double));
+        if (A[i] == NULL) return 1;
+        for (j=0; j<K; j++)
+            A[i][j] = (double) i * 1000.0 + (double) j;
+    }
+
+    for (i=0; i<K; i++) {
+        B[i] = (double *) malloc(N*sizeof(double));
+        if (B[i] == NULL) return 1;
+        for (j=0; j<N; j++)
+            B[i][j] = (double) i * 1000.0 + (double) j;
+    }
+
+    for (i=0; i<M; i++) {
+        C[i] = (double *) malloc(N*sizeof(double));
+        if (C[i] == NULL) return 1;
+        for (j=0; j<N; j++)
+            C[i][j] = 0.0;
+    }
+
+    for (i=0; i<M; i++)
+        for (j=0; j<N; j++)
+            for (k=0; k<K; k++)
+                C[i][j] += A[i][k] * B[k][j];
+
+    return 0;
+}
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/mm2mkn.C b/examples/multiphaseSharedArrays/matmul/sequential/mm2mkn.C
new file mode 100644 (file)
index 0000000..5de7b77
--- /dev/null
@@ -0,0 +1,40 @@
+#include "params.h"
+#include <stdlib.h>
+
+/*
+  double A[M][K];
+  double B[K][N];
+  double C[M][N];
+*/
+
+// C += A * B with every matrix row malloc'ed separately; loop order
+// i, k, j (j innermost).  Returns 1 if an allocation fails, 0 otherwise.
+// The rows are never freed; the process exits right after the multiply.
+int main()
+{
+    double* A[M];
+    double* B[K];
+    double* C[M];
+    int i, j, k;
+
+    for (i=0; i<M; i++) {
+        A[i] = (double *) malloc(K*sizeof(double));
+        if (A[i] == NULL) return 1;
+        for (j=0; j<K; j++)
+            A[i][j] = (double) i * 1000.0 + (double) j;
+    }
+
+    for (i=0; i<K; i++) {
+        B[i] = (double *) malloc(N*sizeof(double));
+        if (B[i] == NULL) return 1;
+        for (j=0; j<N; j++)
+            B[i][j] = (double) i * 1000.0 + (double) j;
+    }
+
+    for (i=0; i<M; i++) {
+        C[i] = (double *) malloc(N*sizeof(double));
+        if (C[i] == NULL) return 1;
+        for (j=0; j<N; j++)
+            C[i][j] = 0.0;
+    }
+
+    for (i=0; i<M; i++)
+        for (k=0; k<K; k++)
+            for (j=0; j<N; j++)
+                C[i][j] += A[i][k] * B[k][j];
+
+    return 0;
+}
diff --git a/examples/multiphaseSharedArrays/matmul/sequential/params.h b/examples/multiphaseSharedArrays/matmul/sequential/params.h
new file mode 100644 (file)
index 0000000..475f959
--- /dev/null
@@ -0,0 +1,14 @@
+
+// Matrix dimensions for the sequential programs: A is M x K, B is K x N
+// and C is M x N.  Compile with -DBIG for the 4096-cubed problem.
+#ifdef BIG
+
+#define M 4096
+#define K 4096
+#define N 4096
+
+#else
+
+#define M 2000
+#define K 5000
+#define N 300
+
+#endif
diff --git a/examples/multiphaseSharedArrays/matmul/t2d.C b/examples/multiphaseSharedArrays/matmul/t2d.C
new file mode 100644 (file)
index 0000000..e277cf3
--- /dev/null
@@ -0,0 +1,743 @@
+// -*- mode: c++; tab-width: 4 -*-
+
+// When running 1D, make NEPP = COL1
+// When running 2D, same
+// When running 3D, make NEPP = subset of COL1
+
+#include "nepp.h"
+#include "msa/msa.h"
+
+// Matrix types.  PUP_EVERY selects DefaultEntry<double,true> instead of
+// DefaultEntry<double,false> for all three.  Despite its name, MSA2DColMjr
+// is column-major only when OLD is defined; otherwise it is row-major like
+// the other two.  The product type uses NEPP_C where the inputs use NEPP.
+#ifdef PUP_EVERY
+typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_ROW_MAJOR> MSA2DRowMjr;
+#ifdef OLD
+typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_COL_MAJOR> MSA2DColMjr;
+#else
+typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_ROW_MAJOR> MSA2DColMjr;
+#endif
+typedef MSA2D<double, DefaultEntry<double,true>, NEPP_C, MSA_ROW_MAJOR> MSA2DRowMjrC;
+
+#else
+typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_ROW_MAJOR> MSA2DRowMjr;
+#ifdef OLD
+typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_COL_MAJOR> MSA2DColMjr;
+#else
+typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_ROW_MAJOR> MSA2DColMjr;
+#endif
+typedef MSA2D<double, DefaultEntry<double,false>, NEPP_C, MSA_ROW_MAJOR> MSA2DRowMjrC;
+#endif
+
+#include "t2d.decl.h"
+
+#include <assert.h>
+#include <math.h>
+#include "params.h"
+
+// Absolute tolerance used by notequal().
+const double epsilon = 0.00000001;
+
+// Returns 1 when v1 and v2 differ by more than epsilon, 0 otherwise.
+inline int notequal(double v1, double v2)
+{
+    const double gap = fabs(v1 - v2);
+    return (gap > epsilon) ? 1 : 0;
+}
+
+// Main chare: parses the command line, creates the three shared matrices
+// and the worker array, starts the workers, and prints one tab-separated
+// result line when the first reduction arrives.
+class t2d : public CBase_t2d
+{
+protected:
+    double start_time;          // CkWallTimer() value taken just before workers.Start()
+    CProxy_TestArray workers;
+    int reallyDone;             // number of reductions seen so far
+
+public:
+    t2d(CkArgMsg* m)
+    {
+        // Usage: a.out [number_of_worker_threads [max_bytes [rows1 [cols1
+        //              [cols2 [decomposition [detailed_timings]]]]]]]
+        // Arguments that are left out keep their defaults from params.h.
+        if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
+        if(m->argc >2 ) bytes=atoi(m->argv[2]);
+        if(m->argc >3 ) ROWS1=atoi(m->argv[3]);
+        if(m->argc >4 ) ROWS2=COLS1=atoi(m->argv[4]);
+        if(m->argc >5 ) COLS2=atoi(m->argv[5]);
+        if(m->argc >6 ) DECOMPOSITION=atoi(m->argv[6]); // 1D, 2D, 3D
+        if(m->argc >7 ) detailedTimings= ((atoi(m->argv[7])!=0)?true:false);
+        delete m;
+        reallyDone = 0;
+
+        MSA2DRowMjr arr1(ROWS1, COLS1, NUM_WORKERS, bytes);        // row major
+        MSA2DColMjr arr2(ROWS2, COLS2, NUM_WORKERS, bytes);        // column major only when built with OLD
+        MSA2DRowMjrC prod(ROWS1, COLS2, NUM_WORKERS, bytes);        // product matrix
+
+        workers = CProxy_TestArray::ckNew(arr1, arr2, prod, NUM_WORKERS, NUM_WORKERS);
+        workers.ckSetReductionClient(new CkCallback(CkIndex_t2d::done(NULL), thisProxy));
+
+        start_time = CkWallTimer();
+        workers.Start();
+    }
+
+    // This method gets called twice, and should only terminate the
+    // second time.
+    // The reduction value is the sum of the workers' contributions; each
+    // worker contributes 0 when its prefetches worked and 1 otherwise.
+    void done(CkReductionMsg* m)
+    {
+        int *ip = (int*)m->getData();
+        bool prefetchWorked = (*ip==0);
+        delete m;
+
+        if (reallyDone == 0) {
+            workers.Kontinue();
+            reallyDone++;
+
+            double end_time = CkWallTimer();
+
+            const char TAB = '\t';
+
+            // gethostname() may leave the buffer untouched on failure and
+            // unterminated when the name is truncated, so start with an
+            // empty string and force a terminator afterwards.
+            char hostname[100];
+            hostname[0] = '\0';
+            gethostname(hostname, 100);
+            hostname[sizeof(hostname) - 1] = '\0';
+
+            ckout << CkNumPes() << TAB
+                  << ROWS1 << TAB
+                  << COLS1 << TAB
+                  << ROWS2 << TAB
+                  << COLS2 << TAB
+                  << NUM_WORKERS << TAB
+                  << bytes << TAB
+                  << (runPrefetchVersion? (prefetchWorked?"Y":"N"): "U") << TAB
+                  << end_time - start_time << TAB
+                  << NEPP << TAB
+                  << DECOMPOSITION << TAB
+                  << hostname
+                  << endl;
+
+        } else {
+            CkExit();
+        }
+    }
+};
+
+// Return the chunk (worker number) that owns a given index when maxIndex
+// items are split over numWorkers chunks.  The first maxIndex % numWorkers
+// chunks hold one item more than the rest.
+int GetChunkForIndex(int index, int maxIndex, int numWorkers)
+{
+    const int baseSize = maxIndex / numWorkers;   // size of the smaller chunks
+    const int bigChunks = maxIndex % numWorkers;  // number of chunks with one extra item
+    const int bigSpan = bigChunks * (baseSize + 1);
+
+    // the index falls inside one of the larger leading chunks
+    if (index < bigSpan)
+        return index / (baseSize + 1);
+
+    // otherwise count whole smaller chunks past the larger ones
+    return bigChunks + (index - bigSpan) / baseSize;
+}
+
+// Compute the inclusive range [start, end] of indices that worker myNum
+// owns when maxIndex items are split over numWorkers workers.  The first
+// maxIndex % numWorkers workers each get one item more than the rest.
+void GetMyIndices(unsigned int maxIndex, unsigned int myNum, unsigned int numWorkers,
+                  unsigned int& start, unsigned int& end)
+{
+    const int rangeSize = maxIndex / numWorkers;
+    const unsigned int extra = maxIndex % numWorkers;
+
+    if (myNum >= extra) {
+        // smaller share: skip past the 'extra' larger shares
+        start = myNum * rangeSize + extra;
+        end = start + rangeSize - 1;
+        return;
+    }
+
+    // larger share: rangeSize + 1 items
+    start = myNum * (rangeSize + 1);
+    end = start + rangeSize;
+}
+
+// class MatmulHelper {
+// public:
+//     unsigned int iStart, iEnd, jStart, jEnd, kStart, kEnd;
+//     MatmulHelper(unsigned int ROWS1_, unsigned int COLS1_, unsigned int COLS2_)
+//         : iStart(0), iEnd(ROWS1_-1),  // A's rows
+//           kStart(0), kEnd(COLS1_-1),  // inner
+//           jStart(0), jEnd(COLS2_-1)   // B's cols
+//     {}
+// }
+
+// class MatmulHelper1D {
+    
+// }
+
+class TestArray : public CBase_TestArray
+{
+private:
+    // prefetchWorked keeps track of whether the prefetches succeeded or not.
+    bool prefetchWorked;
+    CkVec<double> times;
+    CkVec<const char*> description;
+
+    // ================================================================
+    // 2D calculations
+
+    // Side length of the square worker grid (the square root of
+    // numWorkers), computed on first use and then kept in a function-local
+    // static.  Asserts that numWorkers is a perfect square.
+    inline int numWorkers2D() {
+        static int side = 0;
+
+        if (side != 0)
+            return side;
+
+        side = (int)(sqrt(numWorkers));
+        CkAssert(side*side == numWorkers);
+        return side;
+    }
+
+    // x coordinate of this element in the 2D worker grid: the 1D
+    // ChareArray index divided by the grid side.
+    inline unsigned int toX() {
+        const int side = numWorkers2D();
+        return thisIndex / side;
+    }
+    // y coordinate of this element in the 2D worker grid: the 1D
+    // ChareArray index modulo the grid side.
+    inline unsigned int toY() {
+        const int side = numWorkers2D();
+        return thisIndex % side;
+    }
+
+    // ================================================================
+    // 3D calculations
+    // Side length of the cubic worker grid (the cube root of numWorkers),
+    // computed on first use and then kept in a function-local static.
+    // Asserts that numWorkers is a perfect cube.
+    inline int numWorkers3D() {
+        static int n = 0;
+
+        if (n==0) {
+            // cbrt() is not guaranteed to be exact, so a perfect cube can
+            // come back a hair below its integer root; round to nearest
+            // rather than truncate.  Non-cubes still fail the assertion.
+            n = (int)(cbrt((double)numWorkers) + 0.5);
+            CkAssert(n*n*n == numWorkers);
+        }
+
+        return n;
+    }
+
+    // x coordinate of this element in the 3D worker grid: the 1D
+    // ChareArray index divided by the size of one side*side plane.
+    inline unsigned int toX3D() {
+        const int side = numWorkers3D();
+        const int plane = side * side;
+        return thisIndex / plane;
+    }
+    // y coordinate of this element in the 3D worker grid: the offset
+    // within its plane divided by the grid side.
+    inline unsigned int toY3D() {
+        const int side = numWorkers3D();
+        const int plane = side * side;
+        const int inPlane = thisIndex % plane;
+        return inPlane / side;
+    }
+    // z coordinate of this element in the 3D worker grid: the offset
+    // within its plane modulo the grid side.
+    inline unsigned int toZ3D() {
+        const int side = numWorkers3D();
+        const int plane = side * side;
+        const int inPlane = thisIndex % plane;
+        return inPlane % side;
+    }
+
+    // ================================================================
+
+protected:
+    MSA2DRowMjr arr1;       // row major
+    MSA2DColMjr arr2;       // column major
+    MSA2DRowMjrC prod;       // product matrix
+
+    unsigned int rows1, rows2, cols1, cols2, numWorkers;
+
+    // Enroll this worker with each of the three shared arrays, passing the
+    // total worker count.  Per the original notes each enroll() acts as a
+    // barrier, so the calls are kept in this order.
+    void EnrollArrays()
+    {
+        arr1.enroll(numWorkers); // barrier
+        arr2.enroll(numWorkers); // barrier
+        prod.enroll(numWorkers); // barrier
+    }
+
+    // 1D fill: write 1.0 into this worker's band of rows of arr1 (all
+    // columns) and into its band of columns of arr2 (all rows).
+    void FillArrays()
+    {
+        unsigned int firstRow, lastRow;
+        GetMyIndices(rows1, thisIndex, numWorkers, firstRow, lastRow);
+        for (unsigned int row = firstRow; row <= lastRow; ++row) {
+            for (unsigned int col = 0; col < cols1; ++col) {
+                arr1.set(row, col) = 1.0;
+            }
+        }
+
+        unsigned int firstCol, lastCol;
+        GetMyIndices(cols2, thisIndex, numWorkers, firstCol, lastCol);
+        for (unsigned int col = firstCol; col <= lastCol; ++col) {
+            for (unsigned int row = 0; row < rows2; ++row) {
+                arr2.set(row, col) = 1.0;
+            }
+        }
+    }
+
+    // 2D fill: write 1.0 into the rectangular tile of arr1, and then of
+    // arr2, that belongs to this worker's (toX(), toY()) grid position.
+    void FillArrays2D()
+    {
+        const unsigned int gridX = toX();
+        const unsigned int gridY = toY();
+        const int side = numWorkers2D();
+        unsigned int firstRow, lastRow, firstCol, lastCol;
+
+        // our tile of the A matrix
+        GetMyIndices(rows1, gridX, side, firstRow, lastRow);
+        GetMyIndices(cols1, gridY, side, firstCol, lastCol);
+        for (unsigned int row = firstRow; row <= lastRow; ++row) {
+            for (unsigned int col = firstCol; col <= lastCol; ++col) {
+                arr1.set(row, col) = 1.0;
+            }
+        }
+
+        // our tile of the B matrix
+        GetMyIndices(rows2, gridX, side, firstRow, lastRow);
+        GetMyIndices(cols2, gridY, side, firstCol, lastCol);
+        for (unsigned int row = firstRow; row <= lastRow; ++row) {
+            for (unsigned int col = firstCol; col <= lastCol; ++col) {
+                arr2.set(row, col) = 1.0;
+            }
+        }
+    }
+
+    // Call sync() on the two input arrays (not on prod).
+    void SyncArrays()
+    {
+        arr1.sync();
+        arr2.sync();
+    }
+
+    // Verify that arr1 and arr2 hold 1.0 everywhere and, when prod_test is
+    // set, that every element of prod equals cols1.  Each of the three
+    // scans stops at its first bad element and prints it; the run is
+    // aborted if any scan found one.
+    void TestResults(bool prod_test=true)
+    {
+        int failures = 0;
+        bool clean;
+
+        // first input matrix
+        clean = true;
+        for (unsigned int row = 0; clean && row < rows1; ++row) {
+            for (unsigned int col = 0; clean && col < cols1; ++col) {
+                if (!notequal(arr1.get(row, col), 1.0))
+                    continue;
+                ckout << "[" << CkMyPe() << "," << thisIndex << "] arr1 -- Illegal element at (" << row << "," << col << ") " << arr1.get(row,col) << endl;
+                clean = false;
+                ++failures;
+            }
+        }
+
+        // second input matrix, scanned column by column
+        clean = true;
+        for (unsigned int col = 0; clean && col < cols2; ++col) {
+            for (unsigned int row = 0; clean && row < rows2; ++row) {
+                if (!notequal(arr2.get(row, col), 1.0))
+                    continue;
+                ckout << "[" << CkMyPe() << "," << thisIndex << "] arr2 -- Illegal element at (" << row << "," << col << ") " << arr2.get(row,col) << endl;
+                clean = false;
+                ++failures;
+            }
+        }
+
+        //arr1.FreeMem();
+        //arr2.FreeMem();
+
+        // product matrix, scanned column by column
+        if (prod_test) {
+            clean = true;
+            for (unsigned int col = 0; clean && col < cols2; ++col) {
+                for (unsigned int row = 0; clean && row < rows1; ++row) {
+                    if (!notequal(prod.get(row,col), 1.0 * cols1))
+                        continue;
+                    ckout << "[" << CkMyPe() << "] result  -- Illegal element at (" << row << "," << col << ") " << prod.get(row,col) << endl;
+                    clean = false;
+                    ++failures;
+                }
+            }
+        }
+
+        if (failures != 0) {
+            CkAbort("Incorrect array elements detected!");
+        }
+    }
+
+    // Contributes a single int to a sum_int reduction: 0 when prefetchWorked
+    // is set, 1 otherwise.
+    void Contribute()
+    {
+        int dummy = prefetchWorked?0:1;
+        contribute(sizeof(int), &dummy, CkReduction::sum_int);
+    }
+
+    // ============================= 1D ===================================
+
+    // 1D product without prefetching.  The kernel is selected at compile
+    // time: the NMK loop order when OLD is defined, otherwise the MKN
+    // row-major kernel.
+    void FindProductNoPrefetch() {
+#ifdef OLD
+        FindProductNoPrefetchNMK();
+#else
+        FindProductNoPrefetchMKN_RM();
+#endif
+    }
+
+    // new, but bad perf
+    // improved perf by taking the prod.accu out of the innermost loop, up 2
+    // further improved perf by taking the arr1.get out of the innermost loop, up 1.
+    //
+    // 1D kernel with loop order M (rows returned by GetMyIndices for this
+    // worker), K, N.  Asserts that arr2 reports a row-major layout.  Each
+    // output row is accumulated in a heap scratch buffer of cols2 doubles
+    // and then copied into prod with prod.set; prod.sync() is called once
+    // at the end.
+    void FindProductNoPrefetchMKN_RM()
+    {
+        CkAssert(arr2.getArrayLayout() == MSA_ROW_MAJOR);
+//         CkPrintf("reached\n");
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+
+        double *result = new double[cols2]; // scratch buffer for one output row
+        for(unsigned int r = rowStart; r <= rowEnd; r++) { // M
+            // clear the scratch row before accumulating into it
+            for(unsigned int c = 0; c < cols2; c++)
+                result[c] = 0;
+            for(unsigned int k = 0; k < cols1; k++) { // K
+                double a = arr1.get(r,k); // read once per k, reused across the N loop
+                for(unsigned int c = 0; c < cols2; c++) { // N
+                    result[c] += a * arr2.get(k,c);
+//                     prod.set(r,c) = result; // @@ to see if accu is the delay
+//                     prod.accumulate(prod.getIndex(r,c), result);
+                }
+//              assert(!notequal(result, 1.0*cols1));
+            }
+            // publish the finished row
+            for(unsigned int c = 0; c < cols2; c++) {
+                prod.set(r,c) = result[c];
+            }
+        }
+        delete [] result;
+
+        prod.sync();
+    }
+
+    // old
+    void FindProductNoPrefetchNMK()
+    {
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+
+        for(unsigned int c = 0; c < cols2; c++) { // N
+            for(unsigned int r = rowStart; r <= rowEnd; r++) { // M
+
+                double result = 0.0;
+                for(unsigned int k = 0; k < cols1; k++) { // K
+                    double e1 = arr1.get(r,k);
+                    double e2 = arr2.get(k,c);
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.set(r,c) = result;
+            }
+        }
+
+        prod.sync();
+    }
+
+    // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
+    // 1D strip-mined product: forwards to the MKN row-major strip-mined kernel.
+    void FindProductNoPrefetchStripMined()
+    {
+        FindProductNoPrefetchStripMinedMKN_ROWMJR();
+    }
+
+    // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
+    //
+    // 1D strip-mined kernel in N, M, K order.  Instead of calling get() per
+    // element, it takes a pointer to the start of the page holding row r of
+    // arr1 and column c of arr2 and indexes those pointers directly.  Prints
+    // this worker's index range, and afterwards the time spent in the loop
+    // nest and in prod.sync().
+    void FindProductNoPrefetchStripMinedNMK()
+    {
+        CkAssert(NEPP == cols1);
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+        // NOTE(review): rowStart/rowEnd are unsigned but are printed with %d -- confirm this is acceptable
+        CkPrintf("p%dw%d: FPNP2DSM A = %d %d %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd, 0, cols2-1);
+
+        double time1 = CmiWallTimer();
+        for(unsigned int c = 0; c < cols2; c++) { // N
+            for(unsigned int r = rowStart; r <= rowEnd; r++) {  // M
+
+                double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
+                double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
+                double result = 0.0;
+                for(unsigned int k = 0; k < cols1; k++) { // K
+                    double e1 = a[k];  // no get
+                    double e2 = b[k];  // no get
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.set(r,c) = result;
+            }
+        }
+        double time2 = CmiWallTimer();
+
+        prod.sync();
+        double time3 = CmiWallTimer();
+        // first number: loop nest, second number: prod.sync()
+        CkPrintf("timings %f %f\n", time2-time1, time3-time2);
+    }
+
+    // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
+    // Assumes CkAssert(NEPP_C == cols2);
+    //
+    // 1D strip-mined kernel in M, K, N order working on raw page pointers
+    // for a row of arr1, a row of arr2 and a row of prod.  Asserts that arr2
+    // reports a row-major layout.
+    void FindProductNoPrefetchStripMinedMKN_ROWMJR()
+    {
+        CkAssert(NEPP == cols1);
+        CkAssert(NEPP_C == cols2);
+        CkAssert(arr2.getArrayLayout() == MSA_ROW_MAJOR);
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+//         CkPrintf("p%dw%d: FPNP1DSM_MKN_RM A = %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd);
+
+        for(unsigned int r = rowStart; r <= rowEnd; r++) {  // M
+            double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
+            for(unsigned int c = 0; c < cols2; c++) { // N
+                prod.set(r,c);  // just mark it as updated, need a better way
+            }
+            double* cm = &(prod.getPageBottom(prod.getIndex(r,0),Write_Fault)); // ptr to row of C
+            for(unsigned int k = 0; k < cols1; k++) { // K
+                double* b = &(arr2.getPageBottom(arr2.getIndex(k,0),Read_Fault)); // ptr to row of B
+                for(unsigned int c = 0; c < cols2; c++) { // N
+                    // NOTE(review): += onto cm assumes the prod page starts out zeroed -- confirm
+                    cm[c] += a[k]*b[c];
+//                     prod.accumulate(prod.getIndex(r,c), );
+                }
+            }
+                       if (r%4==0) CthYield();
+        }
+
+        prod.sync();
+    }
+
+    // 1D product that first asks arr1 and arr2 to prefetch the data it will
+    // read.  If either WaitAll() returns nonzero the method runs
+    // FindProductNoPrefetch() instead and returns with prefetchWorked left
+    // false; otherwise prefetchWorked is set to true and the product is
+    // computed in N, M, K order using get2().
+    void FindProductWithPrefetch()
+    {
+        // fill in our portion of the array
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+
+        arr1.Unlock(); arr2.Unlock();
+        prefetchWorked = false;
+
+        arr1.Prefetch(rowStart, rowEnd);
+        // NOTE(review): arr1 is given an inclusive end (rowEnd) but arr2 gets cols2, not cols2-1 -- confirm Prefetch's range convention
+        arr2.Prefetch(0, cols2);
+
+        /* if prefetch fails, switch to non-prefetching version */
+        if(arr1.WaitAll())
+        {
+            if(verbose) ckout << thisIndex << ": Out of buffer in prefetch 1" << endl;
+            FindProductNoPrefetch();
+            return;
+        }
+
+        if(arr2.WaitAll())
+        {
+            if(verbose) ckout << thisIndex << ": Out of buffer in prefetch 2" << endl;
+            FindProductNoPrefetch();
+            return;
+        }
+
+        prefetchWorked = true;
+
+        for(unsigned int c = 0; c < cols2; c++)
+        {
+            for(unsigned int r = rowStart; r <= rowEnd; r++)
+            {
+                double result = 0.0;
+                for(unsigned int k = 0; k < cols1; k++)
+                {
+                    double e1 = arr1.get2(r,k);
+                    double e2 = arr2.get2(k,c);
+                    result += e1 * e2;
+                }
+
+                //ckout << "[" << r << "," << c << "] = " << result << endl;
+
+                prod.set(r,c) = result;
+            }
+            //ckout << thisIndex << "." << endl;
+        }
+
+        //arr1.Unlock(); arr2.Unlock();
+        prod.sync();
+    }
+
+    // ============================= 2D ===================================
+    // 2D strip-mined product: this worker computes the block of prod given
+    // by its row range (from toX()) and column range (from toY()), reading
+    // arr1 and arr2 through raw page pointers rather than get().
+    // Asserts NEPP == cols1.
+    void FindProductNoPrefetch2DStripMined()
+    {
+        CkAssert(NEPP == cols1);
+        unsigned int rowStart, rowEnd, colStart, colEnd;
+        // fill in our portion of the C matrix
+        GetMyIndices(rows1, toX(), numWorkers2D(), rowStart, rowEnd);
+        GetMyIndices(cols2, toY(), numWorkers2D(), colStart, colEnd);
+
+        for(unsigned int c = colStart; c <= colEnd; c++) {
+            for(unsigned int r = rowStart; r <= rowEnd; r++) {
+
+                double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
+                double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
+
+                double result = 0.0;
+                for(unsigned int k = 0; k < cols1; k++) {
+                    double e1 = a[k];
+                    double e2 = b[k];
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.set(r,c) = result;
+            }
+        }
+
+        prod.sync();
+    }
+
+    void FindProductNoPrefetch2D()
+    {
+        unsigned int rowStart, rowEnd, colStart, colEnd;
+        // fill in our portion of the C matrix
+        GetMyIndices(rows1, toX(), numWorkers2D(), rowStart, rowEnd);
+        GetMyIndices(cols2, toY(), numWorkers2D(), colStart, colEnd);
+
+        for(unsigned int c = colStart; c <= colEnd; c++) {
+            for(unsigned int r = rowStart; r <= rowEnd; r++) {
+
+                double result = 0.0;
+                for(unsigned int k = 0; k < cols1; k++) {
+                    double e1 = arr1.get(r,k);
+                    double e2 = arr2.get(k,c);
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.set(r,c) = result;
+            }
+        }
+
+        prod.sync();
+    }
+
+    // ============================= 3D ===================================
+    void FindProductNoPrefetch3D()
+    {
+        unsigned int rowStart, rowEnd, colStart, colEnd, kStart, kEnd;
+        // fill in our portion of the C matrix
+        GetMyIndices(rows1, toX3D(), numWorkers3D(), rowStart, rowEnd);
+        GetMyIndices(cols2, toY3D(), numWorkers3D(), colStart, colEnd);
+        GetMyIndices(cols1, toZ3D(), numWorkers3D(), kStart, kEnd);
+
+        for(unsigned int c = colStart; c <= colEnd; c++) {
+            for(unsigned int r = rowStart; r <= rowEnd; r++) {
+
+                double result = 0.0;
+                for(unsigned int k = kStart; k <= kEnd; k++) {
+                    double e1 = arr1.get(r,k);
+                    double e2 = arr2.get(k,c);
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.accumulate(prod.getIndex(r,c), result);
+            }
+        }
+
+        prod.sync();
+    }
+
+    // 3D strip-mined product: like FindProductNoPrefetch3D, but reads the
+    // operands through raw page pointers (row r of arr1, column c of arr2)
+    // and indexes them over this worker's k range.  The partial sums are
+    // added into prod with accumulate().  Asserts NEPP == cols1.
+    void FindProductNoPrefetch3DStripMined()
+    {
+        CkAssert(NEPP == cols1);
+        unsigned int rowStart, rowEnd, colStart, colEnd, kStart, kEnd;
+        // fill in our portion of the C matrix
+        GetMyIndices(rows1, toX3D(), numWorkers3D(), rowStart, rowEnd);
+        GetMyIndices(cols2, toY3D(), numWorkers3D(), colStart, colEnd);
+        GetMyIndices(cols1, toZ3D(), numWorkers3D(), kStart, kEnd);
+
+        for(unsigned int c = colStart; c <= colEnd; c++) {
+            for(unsigned int r = rowStart; r <= rowEnd; r++) {
+
+                double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
+                double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
+                double result = 0.0;
+                for(unsigned int k = kStart; k <= kEnd; k++) {
+                    double e1 = a[k];  // no get
+                    double e2 = b[k];  // no get
+                    result += e1 * e2;
+                }
+//              assert(!notequal(result, 1.0*cols1));
+
+                prod.accumulate(prod.getIndex(r,c), result);
+            }
+        }
+
+        prod.sync();
+    }
+
+    // ================================================================
+
+public:
+    // Stores the three shared-array handles and the worker count, caches
+    // the dimensions of arr1 and arr2, clears prefetchWorked, and records
+    // the first timestamp ("constr").
+    TestArray(const MSA2DRowMjr &arr1_, const MSA2DColMjr &arr2_, MSA2DRowMjrC &prod_,
+              unsigned int numWorkers_)
+        : arr1(arr1_), arr2(arr2_), prod(prod_), numWorkers(numWorkers_), prefetchWorked(false),
+          rows1(arr1.getRows()), cols1(arr1.getCols()),
+          rows2(arr2.getRows()), cols2(arr2.getCols())
+    {
+        // ckout << "w" << thisIndex << ":" << rows1 << " " << cols1 << " " << cols2 << endl;
+        times.push_back(CkWallTimer()); // 1
+        description.push_back("constr");
+    }
+
+    // Migration constructor; empty, so it sets none of the members itself.
+    TestArray(CkMigrateMessage* m) {}
+
+    // Nothing to release explicitly.
+    ~TestArray()
+    {
+    }
+
+    // First phase of a worker: enroll, fill the inputs, sync, run the
+    // product kernel selected by DECOMPOSITION, then contribute to the
+    // reduction.  A timestamp and a label are recorded after every stage.
+    // DECOMPOSITION: 1 = 1D (with or without prefetch), 2 = 2D, 3 = 3D,
+    // 4/5/6 = the strip-mined versions of 1/2/3.  Neither switch has a
+    // default case, so any other value skips the fill and the product.
+    void Start()
+    {
+        times.push_back(CkWallTimer()); // 2
+        description.push_back("   start");
+
+        EnrollArrays();
+        times.push_back(CkWallTimer()); // 3
+        description.push_back("   enroll");
+
+        if(verbose) ckout << thisIndex << ": filling" << endl;
+        // only the 2D decompositions (2 and 5) use the 2D fill
+        switch(DECOMPOSITION){
+        case 1:
+        case 3:
+        case 4:
+        case 6:
+            FillArrays();
+            break;
+        case 2:
+        case 5:
+            FillArrays2D();
+            break;
+        }
+        times.push_back(CkWallTimer()); // 4
+        description.push_back("  fill");
+
+        if(verbose) ckout << thisIndex << ": syncing" << endl;
+        SyncArrays();
+        times.push_back(CkWallTimer()); // 5
+        description.push_back("    sync");
+
+//         if (do_test) TestResults(0);
+
+        if(verbose) ckout << thisIndex << ": product" << endl;
+
+        switch(DECOMPOSITION) {
+        case 1:
+            if (runPrefetchVersion)
+                FindProductWithPrefetch();
+            else
+                FindProductNoPrefetch();
+            break;
+        case 2:
+            FindProductNoPrefetch2D();
+            break;
+        case 3:
+            FindProductNoPrefetch3D();
+            break;
+        case 4:
+            FindProductNoPrefetchStripMined();
+            break;
+        case 5:
+            FindProductNoPrefetch2DStripMined();
+            break;
+        case 6:
+            FindProductNoPrefetch3DStripMined();
+            break;
+        }
+        times.push_back(CkWallTimer()); // 6
+        description.push_back("    work");
+
+        Contribute();
+    }
+
+    // Second phase of a worker: records the "redn" timestamp, optionally
+    // verifies the results (do_test), contributes to the reduction again,
+    // and, when detailedTimings is set, prints the time spent in each
+    // recorded stage.  Element 0 additionally prints the stage labels.
+    void Kontinue()
+    {
+//         if (do_test) TestResults(0);
+        times.push_back(CkWallTimer()); // 7
+        description.push_back("    redn");
+
+        if(verbose) ckout << thisIndex << ": testing" << endl;
+        if (do_test) TestResults();
+        times.push_back(CkWallTimer()); // 8
+        description.push_back("    test");
+        Contribute();
+
+        if (detailedTimings) {
+            if (thisIndex == 0) {
+                // starts at 1: entry 0 ("constr") has no preceding timestamp to diff against
+                for(int i=1; i<description.length(); i++)
+                    ckout << description[i] << " ";
+                ckout << endl;
+            }
+            ckout << "w" << thisIndex << ":";
+            // each value is the interval between consecutive timestamps
+            for(int i=1; i<times.length(); i++)
+                ckout << times[i]-times[i-1] << " ";
+            ckout << endl;
+        }
+    }
+};
+
+#include "t2d.def.h"
diff --git a/examples/multiphaseSharedArrays/matmul/t2d.ci b/examples/multiphaseSharedArrays/matmul/t2d.ci
new file mode 100644 (file)
index 0000000..b22170e
--- /dev/null
@@ -0,0 +1,34 @@
+// -*- mode: c++; tab-width: 4 -*-
+// Interface declarations for the t2d matrix-multiply example.
+mainmodule t2d
+{
+    // globals declared readonly
+    readonly unsigned int DECOMPOSITION;
+    readonly bool detailedTimings;
+
+    // main chare: constructor plus the reduction target done()
+    mainchare t2d
+    {
+        entry void t2d(CkArgMsg*);
+        entry void done(CkReductionMsg*);
+    };
+
+    // worker array; Start and Kontinue are threaded entry methods
+    array[1D] TestArray
+    {
+        entry void TestArray(MSA2DRowMjr arr1, MSA2DColMjr arr2, MSA2DRowMjrC prod, unsigned int numWorkers);
+        entry [threaded] void Start();
+        entry [threaded] void Kontinue();
+    };
+
+    /* Currently, you must explicitly instantiate any
+       MSA templates that you use. */
+    // Both DefaultEntry<double,true> and DefaultEntry<double,false> are
+    // instantiated, for page sizes NEPP and NEPP_C; the #ifdef that would
+    // pick one of the two sets is commented out.
+// #ifdef PUP_EVERY
+    group MSA_CacheGroup<double, DefaultEntry<double,true>, NEPP>;
+    array [1D] MSA_PageArray<double, DefaultEntry<double,true>, NEPP>;
+    group MSA_CacheGroup<double, DefaultEntry<double,true>, NEPP_C>;
+    array [1D] MSA_PageArray<double, DefaultEntry<double,true>, NEPP_C>;
+// #else
+    group MSA_CacheGroup<double, DefaultEntry<double,false>, NEPP>;
+    array [1D] MSA_PageArray<double, DefaultEntry<double,false>, NEPP>;
+    group MSA_CacheGroup<double, DefaultEntry<double,false>, NEPP_C>;
+    array [1D] MSA_PageArray<double, DefaultEntry<double,false>, NEPP_C>;
+// #endif
+
+};
diff --git a/examples/multiphaseSharedArrays/matmul/test.C b/examples/multiphaseSharedArrays/matmul/test.C
new file mode 100644 (file)
index 0000000..53f79c9
--- /dev/null
@@ -0,0 +1,186 @@
+// -*- mode: c++; tab-width: 4 -*-
+#include "msa/msa.h"
+
+#include <math.h>
+
+const double epsilon = 0.00000001;
+
+// ensure length is a multiple of 8, so that we can run this program on even number
+// of PE's
+// num data items in shared array
+const unsigned int len = 200000;
+// num BYTES in local cache
+const unsigned int bytes = 4*1024*1024;
+
+// Returns nonzero when v1 and v2 differ by more than epsilon.  The
+// difference is measured in both directions and the larger magnitude is
+// the one compared against the tolerance.
+int notequal(double v1, double v2)
+{
+    const double forward = fabs(v1 - v2);
+    const double backward = fabs(v2 - v1);
+
+    // keep whichever magnitude is larger
+    const double worst = (backward > forward) ? backward : forward;
+
+    return worst > epsilon;
+}
+
+
+// Single-array self test: writes 44.6 into every element of a freshly
+// created array, syncs, reads every element back and prints each element
+// that does not match.
+void mainFn()
+{
+    // <data type, page size> (num data items in shared array, num BYTES in local cache)
+    MSA<double, 256> arr1(len, bytes);
+
+    ckout <<  "--- Started processing in MainChare ---" << endl;
+    //CthPrintThreadId(CthSelf());
+
+    for(unsigned int i = 0; i < len; i++)
+        arr1.set(i) = 44.6;
+
+    arr1.sync(1);
+
+    // msg is cleared on a mismatch but never read afterwards, so every
+    // mismatching element is reported here (not just the first one).
+    int msg = 1;
+    for(unsigned int i = 0; i < len; i++)
+        if(notequal(arr1.get(i), 44.6))
+        {
+            ckout << "MainChare: Inconsistent element " << i << ", value = " << arr1.get(i) << endl;
+            msg = 0;
+        }
+
+    ckout << "--- MainChare: Done! ---" << endl;
+}
+
+#include "Test.decl.h"
+
+// Main chare of the MSA test program: runs mainFn(), creates three shared
+// arrays of different sizes, starts a TestGroup over them, and exits once
+// doneg() has been called CkNumPes() times.
+class Test : public CBase_Test
+{
+protected:
+    unsigned int doneCnt;         // number of doneg() calls received so far
+    double start_time, end_time;  // wall-clock bounds of the group test
+
+public:
+    // NOTE(review): test.ci declares this entry as `Test()` with no CkArgMsg* parameter -- confirm the two agree
+    Test(CkArgMsg* m) : doneCnt(0)
+    {
+        delete m;
+        mainFn();
+
+        // now create a distributed array
+        MSA<double, 4096> arr2(len, bytes);
+        MSA<double, 4096> arr3(2*len, bytes/2);
+        MSA<double, 4096> arr4(4*len, 2*bytes);
+
+        start_time = CkWallTimer();
+        //CProxy_TestArray arr = CProxy_TestArray::ckNew(/*arr2.getCacheGroup(),*/ numThreads);
+        //arr.ckSetReductionClient(new CkCallback(CkIndex_Test::done(), thisProxy));
+        //done();
+        CProxy_TestGroup::ckNew(thisProxy, arr2.getCacheGroup(), arr3.getCacheGroup(), arr4.getCacheGroup());
+    }
+
+    // Exits immediately.
+    void done()
+    {
+        CkExit();
+    }
+
+    // Counts completions; on the CkNumPes()-th call prints the elapsed
+    // time since start_time and exits.
+    void doneg()
+    {
+        doneCnt++;
+        if(doneCnt == CkNumPes())
+        {
+            end_time = CkWallTimer();
+            ckout << "Done! Time required = " << end_time - start_time << endl;
+            CkExit();
+        }
+    }
+};
+
+const int ok_message = 0;
+
+// Group whose constructor runs the whole test sequence: repeated
+// write-once and accumulate rounds over three shared arrays, then a
+// doneg() call on the main chare.
+class TestGroup : public CBase_TestGroup
+{
+protected:
+    // Write-once test: this PE writes i*val into its contiguous section of
+    // arr1, syncs, then checks ALL elements against i*val.  Only the first
+    // mismatch is printed.
+    // The section size is length()/CkNumPes() rounded down, so when the
+    // length is not a multiple of CkNumPes() the trailing elements are
+    // not written by any PE.
+    inline void TestWriteOnce(MSA<double, 4096>& arr1, double val = 0.0)
+    {
+        unsigned int mySectionSize = arr1.length()/CkNumPes();
+        for(unsigned int i = CkMyPe()*mySectionSize; i < (CkMyPe() + 1)*mySectionSize; i++)
+            arr1.set(i) = i*val;
+
+        arr1.sync();
+
+        int msg = 1; // cleared after the first reported mismatch
+        unsigned int len = arr1.length();
+        for(unsigned int i = 0; i < len; i++)
+            if(notequal(arr1.get(i), i*val) && msg)
+            {
+                ckout << "[" << CkMyPe() << "]Inconsistent element " << i << ", value = " << arr1.get(i) << endl;
+                msg = 0;
+            }
+
+        if(msg && ok_message) ckout << "[" << CkMyPe() << "]WriteOnce OK" << endl;
+    }
+
+    // Accumulate test: after a sync, this PE accumulates contrib into
+    // every element, syncs again, and checks that every element equals
+    // contrib*CkNumPes().  Only the first mismatch is printed.
+    inline void TestAccumulate(MSA<double, 4096>& arr1, double contrib)
+    {
+        //ckout << "[" << CkMyPe() << "] started sync request" << endl;
+        arr1.sync();
+        //ckout << "[" << CkMyPe() << "] sync request done" << endl;
+
+        // test the accumulate interface
+        for(unsigned int i = 0; i < arr1.length(); i++)
+        {
+            //arr1.set(i) = 0;
+            arr1.accumulate(i, contrib);
+        }
+
+        arr1.sync();
+
+        int msg = 1; // cleared after the first reported mismatch
+        for(unsigned int i = 0; i < arr1.length(); i++)
+            if(notequal(arr1.get(i), contrib*CkNumPes()) && msg)
+            {
+                ckout << "[" << CkMyPe() << "]Inconsistent element " << i << ", value = " << arr1.get(i)
+                      << ", expected = " << contrib*CkNumPes() << endl;
+                msg = 0;
+            }
+
+        if(msg && ok_message) ckout << "[" << CkMyPe() << "]Accumulate " << contrib << " OK" << endl;
+    }
+
+public:
+    // Builds array handles from the three cache-group proxies, runs five
+    // rounds of write-once tests, then one mixed accumulate/write-once
+    // round per entry of contribs, and finally reports to mainChare.
+    TestGroup(CProxy_Test mainChare, CProxy_CacheGroup cg1, CProxy_CacheGroup cg2, CProxy_CacheGroup cg3)
+    {
+        if(ok_message) ckout << "Starting processing in group" << endl;
+
+        //CthPrintThreadId(CthSelf());
+        MSA<double, 4096> arr1(cg1);
+        MSA<double, 4096> arr2(cg2);
+        MSA<double, 4096> arr3(cg3);
+
+
+        for(int i = 0; i < 5; i++)
+        {
+            TestWriteOnce(arr1, (double)i*234);
+            TestWriteOnce(arr2, (double)i + 66.23948);
+            TestWriteOnce(arr3, (double)i / 74.8 + 55e33);
+        }
+
+
+        double contribs[] = { 6.72, 4.66, 9.3200, 8.33, 89.434, 11.33 };
+
+        for(unsigned int i = 0 ; i < sizeof(contribs)/sizeof(*contribs); i++)
+        {
+            if(CkMyPe() == 0 && ok_message) ckout << "//////// Iteration " << i << " //////////////" << endl;
+            TestAccumulate(arr1, contribs[i]);
+            TestWriteOnce(arr2);
+            TestAccumulate(arr3, contribs[i]);
+            TestWriteOnce(arr2);
+            TestAccumulate(arr1, contribs[i]);
+            TestAccumulate(arr2, contribs[i]);
+            TestWriteOnce(arr3, 6.0);
+            TestWriteOnce(arr1, 12.0e22);
+        }
+
+        if(ok_message) ckout << "[" << CkMyPe() << "] Done!" << endl;
+        mainChare.doneg();
+    }
+};
+
+#include "Test.def.h"
+
diff --git a/examples/multiphaseSharedArrays/matmul/test.ci b/examples/multiphaseSharedArrays/matmul/test.ci
new file mode 100644 (file)
index 0000000..28f8f83
--- /dev/null
@@ -0,0 +1,17 @@
+// -*- mode: c++; tab-width: 4 -*-
+// Interface declarations for the MSA test program in test.C.
+mainmodule Test
+{
+    extern module DistPageMgr;
+
+    // main chare; its constructor is a threaded entry method
+    mainchare Test
+    {
+        entry [threaded] Test();
+        entry void done();
+        entry void doneg();
+    }
+
+    // group whose threaded constructor receives the main proxy and three
+    // cache-group proxies
+    group TestGroup
+    {
+        entry [threaded] TestGroup(CProxy_Test mainProxy, CProxy_CacheGroup cg1, CProxy_CacheGroup cg2, CProxy_CacheGroup cg3);
+    }
+}
diff --git a/examples/multiphaseSharedArrays/moldyn/Makefile b/examples/multiphaseSharedArrays/moldyn/Makefile
new file mode 100644 (file)
index 0000000..999487d
--- /dev/null
@@ -0,0 +1,4 @@
+
+# Headers of this example and the program name; presumably both are
+# consumed by the rules in ../Makefile_common -- TODO confirm.
+HEADERS=nepp.h params.h
+PGM=moldyn
+# Common makefile kept one directory up.
+include ../Makefile_common
diff --git a/examples/multiphaseSharedArrays/moldyn/moldyn.C b/examples/multiphaseSharedArrays/moldyn/moldyn.C
new file mode 100644 (file)
index 0000000..326f8e9
--- /dev/null
@@ -0,0 +1,415 @@
+// -*- mode: c++; tab-width: 4 -*-
+
+// When running 1D, make NEPP = COL1
+// When running 2D, same
+// When running 3D, make NEPP = subset of COL1
+
+#include "nepp.h"
+#include "msa/msa.h"
+
+class XYZ { // coords, forces
+public:
+    double x, y, z;
+    XYZ() { x = y = z = 0.0; }
+    XYZ(const int rhs) { x = y = z = (double)rhs; } // identity value
+    XYZ& operator+= (const XYZ& rhs) { x += rhs.x; y += rhs.y; z += rhs.z; return *this;}
+    XYZ& negate() { x = -x; y=-y; z=-z; return *this;}
+};
+PUPbytes(XYZ);
+
+// Per-atom mass and charge; the element type of AtomInfoMSA.
+class AtomInfo {
+public:
+    double mass, charge;
+    // Default: mass and charge both zero.
+    AtomInfo() { mass = charge = 0.0; }
+    // Sets both mass and charge to rhs.
+    AtomInfo(const int rhs) { mass = charge = (double)rhs; } // identity value
+    // we're not calling accumulate on this, so += deliberately changes
+    // nothing.  It must still return *this: flowing off the end of a
+    // function that returns a reference is undefined behaviour.
+    AtomInfo& operator+= (const AtomInfo& /*rhs*/) { return *this; }
+};
+PUPbytes(AtomInfo);
+
+typedef MSA1D<XYZ, DefaultEntry<XYZ,false>, NEPP> XyzMSA;
+typedef MSA1D<AtomInfo, DefaultEntry<AtomInfo,false>, NEPP> AtomInfoMSA;
+typedef MSA2D<bool, DefaultEntry<bool,false>, NEPP, MSA_ROW_MAJOR> NeighborMSA;
+
+#include "moldyn.decl.h"
+
+#include <assert.h>
+#include <math.h>
+#include "params.h"
+
+// Absolute tolerance used by notequal().
+const double epsilon = 0.00000001;
+
+// Returns nonzero when v1 and v2 differ by more than epsilon.
+inline int notequal(double v1, double v2)
+{
+    const double gap = fabs(v1 - v2);
+    return gap > epsilon;
+}
+
+// Main chare of the moldyn example: parses the command line, creates the
+// four shared arrays and the worker array, starts the workers, and handles
+// the two reductions they contribute to.
+class moldyn : public CBase_moldyn
+{
+protected:
+    double start_time;            // taken just before workers.Start()
+    CProxy_WorkerArray workers;
+    int reallyDone;               // 0 until the first reduction has arrived
+
+public:
+    moldyn(CkArgMsg* m)
+    {
+        // Usage: a.out [num_workers [num_atoms [cache_size_bytes [detailed_timings]]]]
+        if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
+        if(m->argc >2 ) NUM_ATOMS=atoi(m->argv[2]);
+        if(m->argc >3 ) CACHE_SIZE_BYTES=atoi(m->argv[3]);
+        if(m->argc >4 ) detailedTimings= ((atoi(m->argv[4])!=0)?true:false) ; // nonzero turns on the per-stage timing output
+        delete m;
+        reallyDone = 0;
+
+        XyzMSA coords(NUM_ATOMS, NUM_WORKERS, CACHE_SIZE_BYTES);
+        XyzMSA forces(NUM_ATOMS, NUM_WORKERS, CACHE_SIZE_BYTES);
+        AtomInfoMSA atominfo(NUM_ATOMS, NUM_WORKERS, CACHE_SIZE_BYTES);
+        NeighborMSA nbrList(NUM_ATOMS, NUM_ATOMS, NUM_WORKERS, CACHE_SIZE_BYTES);
+
+        // the first NUM_WORKERS is the constructor's numWorkers argument;
+        // the second is presumably the number of array elements -- TODO confirm
+        workers = CProxy_WorkerArray::ckNew(coords, forces, atominfo, nbrList, NUM_WORKERS, NUM_WORKERS);
+        workers.ckSetReductionClient(new CkCallback(CkIndex_moldyn::done(NULL), thisProxy));
+
+        start_time = CkWallTimer();
+        workers.Start();
+    }
+
+    // This method gets called twice, and should only terminate the
+    // second time.
+    // The first call triggers Kontinue() on the workers and prints one
+    // tab-separated summary line; the second call exits.
+    void done(CkReductionMsg* m)
+    {
+        // the reduction sums one int per worker (0 = prefetch worked), so
+        // a total of 0 means it worked on every worker
+        int *ip = (int*)m->getData();
+        bool prefetchWorked = (*ip==0);
+        delete m;
+
+        if (reallyDone == 0) {
+            workers.Kontinue();
+            reallyDone++;
+
+            double end_time = CkWallTimer();
+
+            const char TAB = '\t';
+
+            char hostname[100];
+            gethostname(hostname, 100);
+
+            // prefetch column: "U" when the prefetch version is not in use,
+            // otherwise "Y"/"N" for whether prefetching worked
+            ckout << CkNumPes() << TAB
+                  << NUM_WORKERS << TAB
+                  << "nepp " << NEPP << TAB
+                                 << "atom " << NUM_ATOMS << TAB
+                  << end_time - start_time << TAB
+                  << CACHE_SIZE_BYTES << TAB
+                  << (runPrefetchVersion? (prefetchWorked?"Y":"N"): "U") << " "
+                  << hostname
+                  << endl;
+
+        } else {
+            CkExit();
+        }
+    }
+};
+
+// Returns start and end
+void GetMyIndices(unsigned int maxIndex, unsigned int myNum, unsigned int numWorkers,
+                  unsigned int& start, unsigned int& end)
+{
+    int rangeSize = maxIndex / numWorkers;
+    if(myNum < maxIndex % numWorkers)
+    {
+        start = myNum * (rangeSize + 1);
+        end = start + rangeSize;
+    }
+    else
+    {
+        start = myNum * rangeSize + maxIndex % numWorkers;
+        end = start + rangeSize - 1;
+    }
+}
+
+class WorkerArray : public CBase_WorkerArray
+{
+private:
+    // prefetchWorked keeps track of whether the prefetches succeeded or not.
+    bool prefetchWorked;
+    CkVec<double> times;
+    CkVec<const char*> description;
+
+    // ================================================================
+    // 2D calculations
+
+    inline int numWorkers2D() {
+        static int n = 0;
+
+        if (n==0) {
+            n = (int)(sqrt(numWorkers));
+            CkAssert(n*n == numWorkers);
+        }
+
+        return n;
+    }
+
+    // Convert a 1D ChareArray index into a 2D x dimension index
+    inline unsigned int toX() {
+        return thisIndex/numWorkers2D();
+    }
+    // Convert a 1D ChareArray index into a 2D y dimension index
+    inline unsigned int toY() {
+        return thisIndex%numWorkers2D();
+    }
+
+    // ================================================================
+
+protected:
+    XyzMSA coords;
+    XyzMSA forces;
+    AtomInfoMSA atominfo;
+    NeighborMSA nbrList;
+
+    unsigned int numAtoms, numWorkers;
+
+    void EnrollArrays()
+    {
+        coords.enroll(numWorkers);
+        forces.enroll(numWorkers);
+        atominfo.enroll(numWorkers);
+        nbrList.enroll(numWorkers);
+    }
+
+    void SyncArrays()
+    {
+        coords.sync();
+        forces.sync();
+        atominfo.sync();
+        nbrList.sync();
+    }
+
+    void FillArrays()
+    {
+        /*
+        // fill in our portion of the array
+        unsigned int rowStart, rowEnd, colStart, colEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+        GetMyIndices(cols2, thisIndex, numWorkers, colStart, colEnd);
+
+        // fill them in with 1
+        for(unsigned int r = rowStart; r <= rowEnd; r++)
+            for(unsigned int c = 0; c < cols1; c++)
+                arr1.set(r, c) = 1.0;
+
+        for(unsigned int c = colStart; c <= colEnd; c++)
+            for(unsigned int r = 0; r < rows2; r++)
+                arr2.set(r, c) = 1.0;
+        */
+    }
+
+    XYZ calculateForce(const XYZ &coordsi, const AtomInfo &atominfoi, const XYZ &coordsj, const AtomInfo &atominfoj)
+    {
+        XYZ result;
+        return result;
+    }
+
+    XYZ integrate(const AtomInfo &atominfok, const XYZ &forcesk)
+    {
+        XYZ result;
+        return result;
+    }
+
+    double distance(unsigned int i, unsigned int j)
+    {
+        return 0;
+    }
+
+    // ================================================================
+    // 2D calculations
+
+    inline int numWorkers2D() {
+        static int n = 0;
+
+        if (n==0) {
+            n = (int)(sqrt(numWorkers));
+            CkAssert(n*n == numWorkers);
+        }
+
+        return n;
+    }
+
+    // Convert a 1D ChareArray index into a 2D x dimension index
+    inline unsigned int toX() {
+        return thisIndex/numWorkers2D();
+    }
+    // Convert a 1D ChareArray index into a 2D y dimension index
+    inline unsigned int toY() {
+        return thisIndex%numWorkers2D();
+    }
+
+    // ================================================================
+
+    void DoWork()
+    {
+        unsigned int i_start, i_end, j_start, j_end;
+        GetMyIndices(NUM_ATOMS-1, toX(), numWorkers2D(), i_start, i_end);
+        GetMyIndices(NUM_ATOMS-1, toY(), numWorkers2D(), j_start, j_end);
+
+        for (unsigned int timestep = 0; timestep < NUM_TIMESTEPS; timestep++) {
+            /**************** Phase I ****************/
+            // for a section of the interaction matrix
+            for (unsigned int i = i_start; i< i_end; i++)
+                for (unsigned int j = j_start; j< j_end; j++)
+                    if (nbrList.get(i,j)) { // nbrlist enters ReadOnly mode
+                        XYZ force = calculateForce(coords[i],
+                                                   atominfo[i],
+                                                   coords[j],
+                                                   atominfo[j]);
+                        forces.accumulate(i,  force); // Accumulate mode
+                        forces.accumulate(j, force.negate());
+                    }
+            forces.sync();
+
+            /**************** Phase II ****************/
+            unsigned int myAtomsBegin, myAtomsEnd;
+            for (unsigned int k = myAtomsBegin; k<myAtomsEnd; k++)
+                coords.set(k) = integrate(atominfo[k], forces[k]); // WriteOnly mode
+            coords.sync();
+
+            /**************** Phase III ****************/
+            if  (timestep %8 == 0) { // update neighbor list every 8 steps
+                for (unsigned int i = i_start; i< i_end; i++)
+                    for (unsigned int j = j_start; j< j_end; j++)
+                        if (distance(i, j) < CUTOFF_DISTANCE) {
+                            nbrList.set(i,j) = true;
+                            nbrList.set(j,i) = true;
+                        } else {
+                            nbrList.set(i,j) = false;
+                            nbrList.set(j,i) = false;
+                        }
+                nbrList.sync();
+            }
+        }
+    }
+
+    void TestResults(bool prod_test=true)
+    {
+        /*
+        int errors = 0;
+        bool ok=true;
+
+        // verify the results, print out first error only
+        ok=true;
+        for(unsigned int r = 0; ok && r < rows1; r++) {
+            for(unsigned int c = 0; ok && c < cols1; c++) {
+                if(notequal(arr1.get(r, c), 1.0)) {
+                    ckout << "[" << CkMyPe() << "," << thisIndex << "] arr1 -- Illegal element at (" << r << "," << c << ") " << arr1.get(r,c) << endl;
+                    ok=false;
+                    errors++;
+                }
+            }
+        }
+
+        ok=true;
+        for(unsigned int c = 0; ok && c < cols2; c++) {
+            for(unsigned int r = 0; ok && r < rows2; r++) {
+                if(notequal(arr2.get(r, c), 1.0)) {
+                    ckout << "[" << CkMyPe() << "," << thisIndex << "] arr2 -- Illegal element at (" << r << "," << c << ") " << arr2.get(r,c) << endl;
+                    ok=false;
+                    errors++;
+                }
+            }
+        }
+
+        //arr1.FreeMem();
+        //arr2.FreeMem();
+
+        if(prod_test)
+        {
+            ok = true;
+            for(unsigned int c = 0; ok && c < cols2; c++) {
+                for(unsigned int r = 0; ok && r < rows1; r++) {
+                    if(notequal(prod.get(r,c), 1.0 * cols1)) {
+                        ckout << "[" << CkMyPe() << "] result  -- Illegal element at (" << r << "," << c << ") " << prod.get(r,c) << endl;
+                        ok=false;
+                        errors++;
+                    }
+                }
+            }
+        }
+
+        if (errors!=0) CkAbort("Incorrect array elements detected!");
+        */
+    }
+
+    void Contribute()
+    {
+        int dummy = prefetchWorked?0:1;
+        contribute(sizeof(int), &dummy, CkReduction::sum_int);
+    }
+
+public:
+    WorkerArray(const XyzMSA &coords_, const XyzMSA &forces_, AtomInfoMSA &atominfo_,
+                NeighborMSA &nbrList_, unsigned int numWorkers_)
+        : coords(coords_), forces(forces_), atominfo(atominfo_), nbrList(nbrList_),
+          numWorkers(numWorkers_), prefetchWorked(false), numAtoms(coords.length())
+    {
+        // ckout << "w" << thisIndex << ":" << rows1 << " " << cols1 << " " << cols2 << endl;
+        times.push_back(CkWallTimer());
+        description.push_back("constr");
+    }
+
+    WorkerArray(CkMigrateMessage* m) {}
+
+    ~WorkerArray()
+    {
+    }
+
+    void Start()
+    {
+        times.push_back(CkWallTimer()); // 1
+        description.push_back("   start");
+
+        EnrollArrays();
+        times.push_back(CkWallTimer()); // 2
+        description.push_back("   enroll");
+
+        if(verbose) ckout << thisIndex << ": filling" << endl;
+        FillArrays();
+        times.push_back(CkWallTimer()); // 3
+        description.push_back("  fill");
+
+        if(verbose) ckout << thisIndex << ": syncing" << endl;
+        SyncArrays();
+        times.push_back(CkWallTimer()); // 4
+        description.push_back("    sync");
+
+        if (do_test) TestResults(0);
+
+        if(verbose) ckout << thisIndex << ": product" << endl;
+        DoWork();
+        times.push_back(CkWallTimer()); // 5
+        description.push_back("    work");
+
+        Contribute();
+    }
+
+    void Kontinue()
+    {
+        times.push_back(CkWallTimer()); // 6
+        description.push_back("    redn");
+
+        if(verbose) ckout << thisIndex << ": testing" << endl;
+        if (do_test) TestResults();
+        times.push_back(CkWallTimer()); // 7
+        description.push_back("    test");
+        Contribute();
+
+        if (detailedTimings) {
+            if (thisIndex == 0) {
+                for(int i=1; i<description.length(); i++)
+                    ckout << description[i] << " ";
+                ckout << endl;
+            }
+            ckout << "w" << thisIndex << ":";
+            for(int i=1; i<times.length(); i++)
+                ckout << times[i]-times[i-1] << " ";
+            ckout << endl;
+        }
+    }
+};
+
+#include "moldyn.def.h"
diff --git a/examples/multiphaseSharedArrays/moldyn/moldyn.ci b/examples/multiphaseSharedArrays/moldyn/moldyn.ci
new file mode 100644 (file)
index 0000000..6013226
--- /dev/null
@@ -0,0 +1,31 @@
+// -*- mode: c++; tab-width: 4 -*-
+// Charm++ interface description for the moldyn example.
+mainmodule moldyn
+{
+    // Defined in params.h; WorkerArray::Kontinue consults it to decide
+    // whether to print the per-phase timing table.
+    readonly bool detailedTimings;
+
+    // Main chare: constructed from the command-line message; done() takes a
+    // reduction message.
+    mainchare moldyn
+    {
+        entry void moldyn(CkArgMsg*);
+        entry void done(CkReductionMsg*);
+    };
+
+    // 1D array of workers.  The constructor takes four MSA arguments plus
+    // the worker count; Start and Kontinue are threaded entry methods.
+    array[1D] WorkerArray
+    {
+        entry void WorkerArray(XyzMSA coords_, XyzMSA forces_,
+                               AtomInfoMSA atominfo_, NeighborMSA nbrList_,
+                               unsigned int numWorkers);
+        entry [threaded] void Start();
+        entry [threaded] void Kontinue();
+    };
+
+    /* Currently, you must explicitly instantiate any
+       MSA templates that you use. */
+    // One cache group + page array pair per element type (XYZ, AtomInfo, bool).
+    // NEPP comes from nepp.h.
+    group MSA_CacheGroup<XYZ, DefaultEntry<XYZ,false>, NEPP>;
+    array [1D] MSA_PageArray<XYZ, DefaultEntry<XYZ,false>, NEPP>;
+
+    group MSA_CacheGroup<AtomInfo, DefaultEntry<AtomInfo,false>, NEPP>;
+    array [1D] MSA_PageArray<AtomInfo, DefaultEntry<AtomInfo,false>, NEPP>;
+
+    group MSA_CacheGroup<bool, DefaultEntry<bool,false>, NEPP>;
+    array [1D] MSA_PageArray<bool, DefaultEntry<bool,false>, NEPP>;
+};
diff --git a/examples/multiphaseSharedArrays/moldyn/nepp.h b/examples/multiphaseSharedArrays/moldyn/nepp.h
new file mode 100644 (file)
index 0000000..0d8fa3a
--- /dev/null
@@ -0,0 +1,3 @@
+// NEPP is the third template argument of every MSA_CacheGroup / MSA_PageArray
+// instantiation in moldyn.ci.
+// NOTE(review): presumably the number of entries per MSA page -- confirm against msa.h.
+#define NEPP 500
+
+
diff --git a/examples/multiphaseSharedArrays/moldyn/params.h b/examples/multiphaseSharedArrays/moldyn/params.h
new file mode 100644 (file)
index 0000000..378d13b
--- /dev/null
@@ -0,0 +1,14 @@
+// Default run parameters for the moldyn example.
+// NOTE(review): the first five are non-const, so moldyn.C can presumably
+// override them at start-up -- their uses are not visible here; confirm.
+unsigned int NUM_WORKERS = 2;
+unsigned int CACHE_SIZE_BYTES = 1024*1024;
+unsigned int NUM_ATOMS = 100;
+unsigned int NUM_TIMESTEPS = 10;
+double CUTOFF_DISTANCE = 1;
+// Declared `readonly` in moldyn.ci; when true, WorkerArray::Kontinue prints
+// the per-phase timing table.
+bool detailedTimings = false;
+
+// Run the version without prefetching
+const bool runPrefetchVersion=false;
+
+// debugging
+// verbose: gates the per-worker progress messages in WorkerArray::Start/Kontinue.
+const bool verbose = false;
+const bool do_test = false;  // If true, tests results, etc.
+
diff --git a/examples/multiphaseSharedArrays/simpleTestVarsize/Makefile b/examples/multiphaseSharedArrays/simpleTestVarsize/Makefile
new file mode 100644 (file)
index 0000000..ba6d26c
--- /dev/null
@@ -0,0 +1,7 @@
+
+# Inputs consumed by the shared rules in ../Makefile_common.
+# NOTE(review): Makefile_common is not visible here; presumably it builds
+# $(PGM) from $(PGM).C/.ci and defines CHARMC -- confirm.
+HEADERS=params.h
+PGM=t3
+include ../Makefile_common
+
+# Stand-alone test of the Double linked-list element (no .ci file involved).
+# The recipe line must start with a hard TAB.
+tv: testV.C
+	$(CHARMC) -o $@ $<
diff --git a/examples/multiphaseSharedArrays/simpleTestVarsize/params.h b/examples/multiphaseSharedArrays/simpleTestVarsize/params.h
new file mode 100644 (file)
index 0000000..aed90f8
--- /dev/null
@@ -0,0 +1,6 @@
+// Default parameters for simpleTestVarsize/t3.C.
+// bytes: passed as the last argument of the MSA2DRM constructor in t3.C;
+// non-const because t3's constructor may overwrite it from the command line.
+unsigned int bytes = 1024*1024;
+// Dimensions of the shared array built in t3.C (ROWS1 x COLS1).
+const unsigned int ROWS1 = 100;
+const unsigned int COLS1 = 500;
+// COLS2 / ROWS2 are not referenced by t3.C in this directory.
+const unsigned int COLS2 = 100;
+const unsigned int ROWS2 = COLS1;
+// Number of TestArray elements created; may be overwritten from argv[1].
+unsigned int NUM_WORKERS = 2;
diff --git a/examples/multiphaseSharedArrays/simpleTestVarsize/t3.C b/examples/multiphaseSharedArrays/simpleTestVarsize/t3.C
new file mode 100644 (file)
index 0000000..3d4cf8c
--- /dev/null
@@ -0,0 +1,389 @@
+// -*- mode: c++; tab-width: 4 -*-
+#include "msa/msa.h"
+// Double is only forward-declared here so that the MSA2DRM typedef exists
+// before t3.decl.h is included (t3.ci names MSA2DRM in an entry signature).
+// The full class definition follows further down in this file.
+class Double;
+typedef MSA2D<Double, DefaultEntry<Double,true>, MSA_DEFAULT_ENTRIES_PER_PAGE, MSA_ROW_MAJOR> MSA2DRM;
+
+#include "t3.decl.h"
+
+#include <assert.h>
+#include <math.h>
+#include "params.h"
+
+// NO_PREFETCH is defined but not tested anywhere in this file.
+#define NO_PREFETCH
+// Reported by t3::done() as "N" (0), "Y" (1) or "U" (any other value);
+// it is never reassigned in this file, so the report is always "U".
+int g_prefetch = -1;
+
+// debugging
+// Coordinates of the element printed by TestArray::Start when do_message is set.
+#define XVAL 49
+#define YVAL 76
+// Non-zero enables the per-worker progress/debug messages.
+const int do_message = 0;
+
+// Tolerance used by notequal() when comparing doubles.
+const double epsilon = 0.00000001;
+class Double {
+    int getNumElements() {
+        int i=0;
+        Double *iter = this;
+        while(iter!=0) {
+            i++;
+            iter = iter->next;
+        }
+        return i;
+    }
+
+public:
+    double data;
+    Double *next;
+
+    // required
+    Double()
+    {
+        data = 0.0;
+        next = 0;
+    }
+
+    // optional, but recommended for user's code.
+    // copy constructor
+    //
+       // Differs from copy assignment because cc deals with
+       // unallocated memory, but ca deals with a constructed object.
+    Double(const Double &rhs)
+    {
+        ckout << "reached copy" << endl;
+//         data = rhs.data;
+        next = 0;
+
+        // call assignment operator
+        *this = rhs;
+    }
+
+    ~Double()
+    {
+//         cout << "reached destructor" << endl;
+        delete next;
+    }
+
+    // required
+    // assignment operator
+    Double& operator= (const Double& rhs)
+    {
+//         ckout << "reached assign" << endl;
+        if (this == &rhs) return *this;  // self-assignment
+
+        if (next != 0) {
+            delete next;
+            next = 0;
+        }
+
+        Double *iter1 = this;
+        const Double *iter2 = &rhs;
+        while (iter2 != 0) {
+            iter1->data = iter2->data;
+            if (iter2->next != 0)
+                iter1->next = new Double();
+            iter2 = iter2->next;
+            iter1 = iter1->next;
+        }
+
+        return *this;
+    }
+
+    // required for accumulate
+    // += operator
+    // @@ what if rhs is a sequence.  Do we want to prepend the entire sequence to this Double?
+    Double& operator+= (const Double& rhs)
+    {
+        if (rhs.data == 0) // identity
+            return *this;
+        else if (this->data == 0) {
+            *this = rhs;
+            return *this;
+        }
+
+        Double *last = this;
+        Double *iter = this->next;
+        while(iter!=0) {
+            last = iter;
+            iter = iter->next;
+        }
+
+        Double *tmp = new Double();
+        last->next = tmp;
+        *tmp = rhs; // use the assign operator to do the work.
+
+        return *this;
+    }
+
+    // required for accumulate
+    // typecast from int
+    Double(const int rhs) : data(rhs), next(0)
+    {
+//         ckout << "reached typecast from int" << next << endl;
+    }
+
+    // required
+    // pup
+    virtual void pup(PUP::er &p){
+//         static int called = 0;
+//         called++;
+//         ckout << "p" << CkMyPe() << ":" << "reached pup " << called << endl;
+
+        if(false) { //simple pup
+            p | data;
+        } else {
+            int n;
+            if (p.isPacking())
+                n = getNumElements();
+            p|n;
+
+            Double *iter = this;
+            if (p.isUnpacking()) {
+                CkAssert(0 == next);
+                while (n>0) {
+                    p|(iter->data);
+                    n--;
+                    if (n>0)
+                        iter->next = new Double();
+                    iter = iter->next;
+                }
+            } else {
+                while(iter!=0) {
+                    p|(iter->data);
+                    iter = iter->next;
+                }
+            }
+        }
+    }
+
+    // optional
+    // typecast Double from/to double, for convenience
+    Double(const double &rhs) : data(rhs), next(0) {}
+//     operator double() { return data; }
+//     operator double const () { return (const double) data; }
+};
+
+// optional
+// convenience function
+// Writes every value of the chain to a std::ostream, separated by single
+// spaces -- the same format the CkOutStream overload produces.  Without the
+// separator consecutive values would run together.
+ostream& operator << (ostream& os, const Double& s) {
+    os << s.data;
+    for (const Double *node = s.next; node != 0; node = node->next)
+        os << " " << node->data;
+    return os;
+}
+
+// optional
+// convenience function
+// Streams the whole chain to a CkOutStream: head value first, then each
+// following value preceded by a single space.
+CkOutStream& operator << (CkOutStream& os, const Double& s) {
+    os << s.data;
+    const Double *node = s.next;
+    while (node != 0) {
+        os << " ";
+        os << node->data;
+        node = node->next;
+    }
+    return os;
+}
+
+// Returns non-zero when v1 and v2 differ by more than epsilon.
+inline int notequal(double v1, double v2)
+{
+    const double gap = fabs(v1 - v2);
+    return gap > epsilon;
+}
+
+// Returns non-zero when the two chains differ: either a pair of
+// corresponding values differs by more than epsilon, or the chains have
+// different lengths.
+//
+// Parameters are taken by const reference and the chains are walked
+// iteratively, so no deep copies are made while comparing.
+inline int notequal(const Double &v1, const Double &v2)
+{
+    const Double *a = &v1;
+    const Double *b = &v2;
+    while (a != 0 && b != 0) {
+        if (notequal(a->data, b->data))
+            return 1;
+        a = a->next;
+        b = b->next;
+    }
+    // At least one chain is exhausted here; they are equal only if both are.
+    return (a != b);
+}
+
+class t3 : public CBase_t3
+{
+protected:
+    double start_time;
+    CProxy_TestArray workers;
+    int reallyDone;
+
+public:
+    t3(CkArgMsg* m)
+    {
+        // Usage: t3 [number_of_worker_threads [max_bytes]]
+        if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
+        if(m->argc >2 ) bytes=atoi(m->argv[1]);
+        delete m;
+        reallyDone = 0;
+
+        // Actually build the shared array.
+        MSA2DRM arr1(ROWS1, COLS1, NUM_WORKERS, bytes);
+
+        workers = CProxy_TestArray::ckNew(arr1, NUM_WORKERS, NUM_WORKERS);
+        workers.ckSetReductionClient(new CkCallback(CkIndex_t3::done(NULL), thisProxy));
+
+        start_time = CkWallTimer();
+        workers.Start();
+    }
+
+    void done(CkReductionMsg* m)
+    {
+        delete m;
+
+        if (reallyDone == 0) {
+            workers.Kontinue();
+            reallyDone++;
+        } else {
+            double end_time = CkWallTimer();
+
+            const char TAB = '\t';
+
+        ckout << ROWS1 << TAB
+              << COLS1 << TAB
+              << NUM_WORKERS << TAB
+              << bytes << TAB
+              << ((g_prefetch == 0) ? "N" : ((g_prefetch == 1) ? "Y" : "U")) << TAB
+              << end_time - start_time
+              << endl;
+
+            CkExit();
+        }
+    }
+};
+
+// get the chunk for a given index
+int GetChunkForIndex(int index, int maxIndex, int numWorkers)
+{
+    int rangeSize = maxIndex / numWorkers;
+    int chunk;
+
+    // find which chare is going to process the current node
+    if(index <= (maxIndex % numWorkers) * (rangeSize + 1) - 1)
+        chunk = index/(rangeSize + 1);
+    else
+        chunk = maxIndex%numWorkers + (index - (maxIndex%numWorkers) * (rangeSize + 1))/rangeSize;
+
+    return chunk;
+}
+
+void GetMyIndices(unsigned int maxIndex, unsigned int myNum, unsigned int numWorkers, unsigned int& start, unsigned int& end)
+{
+    int rangeSize = maxIndex / numWorkers;
+    if(myNum < maxIndex % numWorkers)
+    {
+        start = myNum * (rangeSize + 1);
+        end = start + rangeSize;
+    }
+    else
+    {
+        start = myNum * rangeSize + maxIndex % numWorkers;
+        end = start + rangeSize - 1;
+    }
+}
+
+// Worker chare-array element for the variable-size MSA test.
+//
+// Every worker holds its own handle to the same shared array.  Start() and
+// Kontinue() are threaded entry methods (see t3.ci); each ends with
+// Contribute(), whose reduction is delivered to t3::done().
+class TestArray : public CBase_TestArray
+{
+protected:
+    MSA2DRM arr1;       // row major
+
+    // Array dimensions (read from arr1 in the constructor) and worker count.
+    unsigned int rows1, cols1, numWorkers;
+
+    // Writes 1.0 into every element of this worker's rows.
+    // rowEnd is inclusive here (note the <=), matching GetMyIndices above.
+    void FillArray()
+    {
+        // fill in our portion of the array
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+
+        // fill them in with 1
+        for(unsigned int r = rowStart; r <= rowEnd; r++)
+            for(unsigned int c = 0; c < cols1; c++)
+                arr1.set(r, c) = 1.0;
+
+    }
+
+    // Thin wrapper around arr1.sync().
+    void SyncArrays()
+    {
+        arr1.sync();
+    }
+
+    // Accumulates two worker-specific values into element (0,0).
+    // NOTE(review): presumably combined through Double::operator+=, which
+    // appends to the element's chain rather than summing -- confirm in msa.h.
+    void FindProduct()
+    {
+        arr1.accumulate(arr1.getIndex(0,0), 2.0 + thisIndex);
+        arr1.accumulate(arr1.getIndex(0,0), 100.0 + thisIndex);
+    }
+
+    // Checks that the head value of every element equals 1.0.
+    // Only the first mismatch is reported; afterwards msg == 0
+    // short-circuits the comparison, although the loops still run to the end.
+    // error2, error3 and cnt are never modified.
+    void TestResults()
+    {
+        int error1 = 0, error2 = 0, error3=0;
+
+        // verify the results
+        int msg = 1;
+        int cnt = 0;
+        for(unsigned int r = 0; r < rows1; r++)
+        {
+            for(unsigned int c = 0; c < cols1; c++)
+            {
+                if(msg && notequal(arr1.get(r, c).data, 1.0))
+                {
+                    ckout << "p" << CkMyPe() << "w" << thisIndex << " arr1 -- Illegal element at (" << r << "," << c << ") " << arr1.get(r,c) << endl;
+                    ckout << "Skipping rest of TestResults." << endl;
+                    msg = 0;
+                    error1 = 1;
+                }
+            }
+        }
+
+        // The summary line is printed only when do_message is set.
+        if(do_message) ckout << "w" << thisIndex << ": Testing done.  Result = "
+                             << ((error1 || error2 || error3)?"Failure":"SUCCESS")
+                             << endl;
+    }
+
+    // Joins the array-wide reduction with a dummy integer; the value itself
+    // is not used by t3::done(), only the completion of the reduction.
+    void Contribute()
+    {
+        int dummy = 0;
+        contribute(sizeof(int), &dummy, CkReduction::max_int);
+    }
+
+public:
+    // Stores a handle to the shared array and caches its dimensions.
+    TestArray(const MSA2DRM &arr_, unsigned int numWorkers_)
+    : arr1(arr_), rows1(arr1.getRows()), cols1(arr1.getCols()), numWorkers(numWorkers_)
+    {
+    }
+
+    // Migration constructor; initialises nothing itself.
+    TestArray(CkMigrateMessage* m) {}
+
+    ~TestArray()
+    {
+    }
+
+    // threaded EP
+    // Phase 1: enroll, fill this worker's rows, sync, then contribute.
+    void Start()
+    {
+        arr1.enroll(numWorkers); // barrier
+        if(do_message) ckout << "w" << thisIndex << ": filling" << endl;
+        FillArray();
+        if(do_message) ckout << "w" << thisIndex << ":value " << arr1.get(XVAL,YVAL) << "," << arr1.get(XVAL,YVAL+1)  << endl;
+//         (arr1.getCacheGroup()).emitBufferValue(6, 0);
+        if(do_message) ckout << "w" << thisIndex << ": syncing" << endl;
+        SyncArrays();
+//         if (thisIndex == 0) (arr1.getCacheGroup()).emit(0);
+//         if(do_message) ckout << "w" << thisIndex << ":value2 " << arr1.get(XVAL,YVAL) << "," << arr1.get(XVAL,YVAL+1)  << endl;
+        Contribute();
+    }
+
+    // Phase 2 (threaded EP, invoked from t3::done after the first reduction):
+    // verify the fill, sync, accumulate into (0,0), sync, print the
+    // accumulated element, sync again, then contribute.
+    void Kontinue()
+    {
+//         if(do_message) ckout << "w" << thisIndex << ":value3 " << arr1.get(XVAL,YVAL) << "," << arr1.get(XVAL,YVAL+1)  << endl;
+        if(do_message) ckout << thisIndex << ": testing after fillarray, sync, and redn" << endl;
+        TestResults();
+        SyncArrays();
+
+        if(do_message) ckout << thisIndex << ": producting" << endl;
+        FindProduct();
+        SyncArrays();
+
+//         if(do_message) ckout << thisIndex << ": tetsing after product" << endl;
+//      TestResults();
+
+        // Print out the accumulated element.
+        ckout << "p" << CkMyPe() << "w" << thisIndex << ":" << arr1.get(0,0) << endl;
+        SyncArrays();
+
+        Contribute();
+    }
+};
+
+#include "t3.def.h"
diff --git a/examples/multiphaseSharedArrays/simpleTestVarsize/t3.ci b/examples/multiphaseSharedArrays/simpleTestVarsize/t3.ci
new file mode 100644 (file)
index 0000000..01cf1c1
--- /dev/null
@@ -0,0 +1,21 @@
+// -*- mode: c++; tab-width: 4 -*-
+// Charm++ interface description for the variable-size MSA test.
+mainmodule t3
+{
+    // Main chare: constructed from the command-line message; done() is the
+    // reduction client registered in t3's constructor.
+    mainchare t3
+    {
+        entry void t3(CkArgMsg*);
+        entry void done(CkReductionMsg*);
+    };
+
+    // 1D worker array; Start and Kontinue are threaded entry methods.
+    array[1D] TestArray
+    {
+        entry void TestArray(MSA2DRM arr_, unsigned int numWorkers_);
+        entry [threaded] void Start();
+        entry [threaded] void Kontinue();
+    };
+
+    /* Currently, you must explicitly instantiate any
+       MSA templates that you use. */
+    // Element type and entry-ops template arguments match the MSA2DRM typedef in t3.C.
+    group MSA_CacheGroup<Double, DefaultEntry<Double,true>, MSA_DEFAULT_ENTRIES_PER_PAGE>;
+    array [1D] MSA_PageArray<Double, DefaultEntry<Double,true>, MSA_DEFAULT_ENTRIES_PER_PAGE>;
+};
diff --git a/examples/multiphaseSharedArrays/simpleTestVarsize/testV.C b/examples/multiphaseSharedArrays/simpleTestVarsize/testV.C
new file mode 100644 (file)
index 0000000..587d831
--- /dev/null
@@ -0,0 +1,141 @@
+#include<iostream>
+#include<stdio.h>
+#include<assert.h>
+
+using namespace std;
+
+class Double {
+    int getNumElements() {
+        int i=0;
+        Double *iter = this;
+        while(iter!=0) {
+            i++;
+            iter = iter->next;
+        }
+        return i;
+    }
+
+public:
+    double data;
+    Double *next;
+
+    Double()
+    {
+        data = 0.0;
+        next = 0;
+    }
+
+    // copy constructor
+    //
+       // Differs from copy assignment because cc deals with
+       // unallocated memory, but ca deals with a constructed object.
+    Double(const Double &rhs)
+    {
+        data = rhs.data;
+        next = 0;
+
+        // call assignment operator
+        *this = rhs;
+    }
+
+    ~Double()
+    {
+//         cout << "reached destructor" << endl;
+        delete next;
+    }
+
+    // assignment operator
+    Double& operator= (const Double& rhs)
+    {
+        if (this == &rhs) return *this;  // self-assignment
+
+        if (next != 0) {
+            delete next;
+            next = 0;
+        }
+
+        Double *iter1 = this;
+        const Double *iter2 = &rhs;
+        while (iter2 != 0) {
+            iter1->data = iter2->data;
+            if (iter2->next != 0)
+                iter1->next = new Double();
+            iter2 = iter2->next;
+            iter1 = iter1->next;
+        }
+
+        return *this;
+    }
+
+    // += operator
+    // @@ what if rhs is a sequence.  Do we want to prepend the entire sequence to this Double?
+    Double& operator+= (const Double& rhs)
+    {
+        Double *tmp = new Double(data);
+        tmp->next = next;
+        data = rhs.data;
+        next = tmp;
+
+        return *this;
+    }
+
+    // typecast from int
+    Double(const int rhs) : data(rhs), next(0)
+    {
+//         cout << "reached typecast from int" << endl;
+    }
+
+    // pup
+//     virtual void pup(PUP::er &p){
+//         int n = getNumElements();
+//         p|n;
+
+//         Double *iter = this;
+//         if (p.isUnpacking()) {
+//             while (n>0) {
+//                 p|(iter->data);
+//                 n--;
+//                 if (n>0)
+//                     iter->next = new Double();
+//                 iter = iter->next;
+//             }
+//         } else {
+//             while(iter!=0) {
+//                 p|(iter->data);
+//                 iter = iter->next;
+//             }
+//         }
+//     }
+
+    // typecast Double from/to double, for convenience
+    Double(const double &rhs) : data(rhs), next(0) {}
+//     operator double() { return data; }
+//     operator double const () { return (const double) data; }
+};
+
+// Convenience printer: writes the chain's values separated by single spaces.
+ostream& operator << (ostream& os, const Double& s) {
+    os << s.data;
+    const Double *node = s.next;
+    while (node != 0) {
+        os << " ";
+        os << node->data;
+        node = node->next;
+    }
+    return os;
+}
+
+int
+main()
+{
+    Double d;  // default constructor
+    cout << d << endl;
+    Double e(1); // typecast from int
+    d += e; // +=
+    cout << d << endl;
+    d += d; // prepends only the head
+    cout << "d=" << d << endl;
+    Double *f = new Double(d); // =
+    cout << "f=" << *f << endl;  // destructor
+    delete f;
+    for(int i=3; i<200; i++)
+        d += i;
+    cout << d << endl;
+    cout << "the end" << endl;
+}
diff --git a/examples/multiphaseSharedArrays/simpletest/Makefile b/examples/multiphaseSharedArrays/simpletest/Makefile
new file mode 100644 (file)
index 0000000..bf8a3a5
--- /dev/null
@@ -0,0 +1,4 @@
+
+# Inputs consumed by the shared rules in ../Makefile_common.
+# NOTE(review): Makefile_common is not visible here; presumably it builds
+# $(PGM) from $(PGM).C/.ci and treats $(HEADERS) as prerequisites -- confirm.
+# Keep the values free of trailing whitespace: make stores it in the variable.
+HEADERS=params.h
+PGM=t3
+include ../Makefile_common
diff --git a/examples/multiphaseSharedArrays/simpletest/params.h b/examples/multiphaseSharedArrays/simpletest/params.h
new file mode 100644 (file)
index 0000000..d824b38
--- /dev/null
@@ -0,0 +1,4 @@
+// Default parameters for simpletest/t3.C.
+// bytes: passed as the last argument of the MSA2DRM constructor; non-const
+// because t3's constructor may overwrite it from the command line.
+unsigned int bytes = 100*1024*1024;
+// Dimensions of the shared array (ROWS1 x COLS1).
+const unsigned int ROWS1 = 2000;
+const unsigned int COLS1 = 2000;
+// Number of TestArray elements created; may be overwritten from argv[1].
+unsigned int NUM_WORKERS = 2;
diff --git a/examples/multiphaseSharedArrays/simpletest/run.sh b/examples/multiphaseSharedArrays/simpletest/run.sh
new file mode 100755 (executable)
index 0000000..1195a0a
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Shell script to test for multiple test cases
+
+touch outputs
+for rows1 in 1000 5000 10000; do
+  for cols1 in 500 750 1000; do
+    for cols2 in 1000 5000 10000; do
+      for mbytes in 128 64 32 16 8 4 2 1; do
+       for num_workers in 1 2 4 8 16 32; do
+          rm -rf params.h
+          printf "const unsigned int bytes = %d*1024*1024;\n" $mbytes >> params.h
+          printf "const unsigned int ROWS1 = %d;\n" $rows1 >> params.h
+          printf "const unsigned int COLS1 = %d;\n" $cols1 >> params.h
+          printf "const unsigned int COLS2 = %d;\n" $cols2 >> params.h
+          printf "const unsigned int ROWS2 = COLS1;\n" >> params.h
+          printf "const unsigned int NUM_WORKERS = %d;\n" $num_workers >> params.h
+          printf "\n" >> params.h
+  
+          rm -f t3
+          make OPTS=-O3 -s
+          for num_pes in 4 8 16 32; do
+            ./charmrun t3 +p$num_pes >> outputs
+          done
+        done
+      done
+    done
+  done
+done
diff --git a/examples/multiphaseSharedArrays/simpletest/t3.C b/examples/multiphaseSharedArrays/simpletest/t3.C
new file mode 100644 (file)
index 0000000..e7f3bb5
--- /dev/null
@@ -0,0 +1,178 @@
+// -*- mode: c++; tab-width: 4 -*-
+#include "msa/msa.h"
+
+// Row-major 2D shared array of plain doubles.  Declared before t3.decl.h is
+// included because t3.ci names MSA2DRM in an entry-method signature.
+typedef MSA2D<double, DefaultEntry<double>, MSA_DEFAULT_ENTRIES_PER_PAGE, MSA_ROW_MAJOR> MSA2DRM;
+
+#include "t3.decl.h"
+
+#include <assert.h>
+#include <math.h>
+#include "params.h"
+
+// NO_PREFETCH is defined but not tested anywhere in this file.
+#define NO_PREFETCH
+// Reported by t3::done() as "N" (0), "Y" (1) or "U" (any other value);
+// it is never reassigned in this file, so the report is always "U".
+int g_prefetch = -1;
+
+// debugging
+// Not referenced anywhere else in this file.
+const int do_message = 0;
+
+// Largest absolute difference at which two doubles still count as equal.
+const double epsilon = 1e-8;
+
+// Returns non-zero when v1 and v2 are further apart than epsilon.
+inline int notequal(double v1, double v2)
+{
+    const double gap = fabs(v1 - v2);
+    return gap > epsilon;
+}
+
+class t3 : public CBase_t3
+{
+protected:
+    double start_time;
+    CProxy_TestArray workers;
+    int reallyDone;
+
+public:
+    t3(CkArgMsg* m)
+    {
+        // Usage: t3 [number_of_worker_threads [max_bytes]]
+        if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
+        if(m->argc >2 ) bytes=atoi(m->argv[1]);
+        delete m;
+        reallyDone = 0;
+
+        // Actually build the shared array.
+        MSA2DRM arr1(ROWS1, COLS1, NUM_WORKERS, bytes);
+
+        workers = CProxy_TestArray::ckNew(arr1, NUM_WORKERS, NUM_WORKERS);
+        workers.ckSetReductionClient(new CkCallback(CkIndex_t3::done(NULL), thisProxy));
+
+        start_time = CkWallTimer();
+        workers.Start();
+    }
+
+    void done(CkReductionMsg* m)
+    {
+        delete m;
+
+        if (reallyDone == 0) {
+            workers.Kontinue();
+            reallyDone++;
+            double end_time = CkWallTimer();
+
+            const char TAB = '\t';
+
+        ckout << ROWS1 << TAB
+              << COLS1 << TAB
+              << NUM_WORKERS << TAB
+              << bytes << TAB
+              << ((g_prefetch == 0) ? "N" : ((g_prefetch == 1) ? "Y" : "U")) << TAB
+              << end_time - start_time
+              << endl;
+        } else {
+
+            CkExit();
+        }
+    }
+};
+
+
+// Computes the HALF-OPEN index range [start, end) owned by worker myNum.
+//
+// Every worker gets maxIndex / numWorkers consecutive indices; the last
+// worker's range is extended to maxIndex so it also absorbs the remainder.
+// (An alternative scheme would spread the remainder one element at a time
+// over the first maxIndex % numWorkers workers and use an inclusive end.)
+void GetMyIndices(unsigned int maxIndex, unsigned int myNum, unsigned int numWorkers, unsigned int& start, unsigned int& end)
+{
+    const int chunk = maxIndex / numWorkers;
+    const bool isLast = (myNum == numWorkers - 1);
+
+    start = myNum * chunk;
+    end = isLast ? maxIndex : (myNum + 1) * chunk;
+}
+
+// Worker chare-array element for the simple MSA test.
+//
+// Every worker holds its own handle to the same shared array.  Start() and
+// Kontinue() are threaded entry methods (see t3.ci); each ends with
+// Contribute(), whose reduction is delivered to t3::done().
+class TestArray : public CBase_TestArray
+{
+protected:
+    MSA2DRM arr1;       // row major
+
+    // Array dimensions (read from arr1 in the constructor) and worker count.
+    unsigned int rows1, cols1, numWorkers;
+
+    // Writes 1.0 into every element of this worker's rows.
+    // rowEnd is exclusive here (note the <), matching GetMyIndices above.
+    void FillArray()
+    {
+        // fill in our portion of the array
+        unsigned int rowStart, rowEnd;
+        GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
+
+        // fill them in with 1
+        for(unsigned int r = rowStart; r < rowEnd; r++)
+            for(unsigned int c = 0; c < cols1; c++)
+                arr1.set(r, c) = 1.0;
+    }
+
+    // Thin wrapper around arr1.sync().
+    void SyncArrays()
+    {
+        arr1.sync();
+    }
+
+    // Checks that every element of the WHOLE array equals 1.0 (within
+    // epsilon).  At most one mismatch per row is reported; any mismatch
+    // aborts the program after the scan completes.
+    void TestResults()
+    {
+        int errors = 0;
+
+        // verify the results
+        for(unsigned int r = 0; r < rows1; r++)
+        {
+            bool warnedRow=false;
+            for(unsigned int c = 0; c < cols1; c++)
+            {
+                if((!warnedRow) && notequal(arr1.get(r, c), 1.0))
+                {
+                    ckout << "p" << CkMyPe() << "w" << thisIndex << " arr1 -- Illegal element at (" << r << "," << c << ") " << arr1.get(r,c) << endl;
+                    errors++;
+                    warnedRow=true;
+                }
+            }
+        }
+
+        if (errors) CkAbort("Incorrect array elements detected!");
+    }
+
+    // Joins the array-wide reduction with a dummy integer; the value itself
+    // is not used by t3::done(), only the completion of the reduction.
+    void Contribute()
+    {
+        int dummy = 0;
+        contribute(sizeof(int), &dummy, CkReduction::max_int);
+    }
+
+public:
+    // Stores a handle to the shared array and caches its dimensions.
+    TestArray(const MSA2DRM &arr_, unsigned int numWorkers_)
+    : arr1(arr_), rows1(arr1.getRows()), cols1(arr1.getCols()), numWorkers(numWorkers_)
+    {
+    }
+
+    // Migration constructor; initialises nothing itself.
+    TestArray(CkMigrateMessage* m) {}
+
+    ~TestArray()
+    {
+    }
+
+    // threaded EP
+    // Phase 1: enroll, fill this worker's rows, sync, verify, contribute.
+    void Start()
+    {
+        arr1.enroll(numWorkers); // barrier
+        FillArray();
+        SyncArrays();
+        TestResults();  // test before doing a reduction
+        Contribute();
+    }
+
+    // Phase 2 (threaded EP, invoked from t3::done after the first
+    // reduction): verify the array once more, then contribute again.
+    void Kontinue()
+    {
+        TestResults();
+        Contribute();
+    }
+};
+
+#include "t3.def.h"
diff --git a/examples/multiphaseSharedArrays/simpletest/t3.ci b/examples/multiphaseSharedArrays/simpletest/t3.ci
new file mode 100644 (file)
index 0000000..5c3a06c
--- /dev/null
@@ -0,0 +1,21 @@
+// -*- mode: c++; tab-width: 4 -*-
+// Charm++ interface description for the simple MSA test.
+mainmodule t3
+{
+    // Main chare: constructed from the command-line message; done() is the
+    // reduction client registered in t3's constructor.
+    mainchare t3
+    {
+        entry void t3(CkArgMsg*);
+        entry void done(CkReductionMsg*);
+    };
+
+    // 1D worker array; Start and Kontinue are threaded entry methods.
+    array[1D] TestArray
+    {
+        entry void TestArray(MSA2DRM arr_, unsigned int numWorkers_);
+        entry [threaded] void Start();
+        entry [threaded] void Kontinue();
+    };
+    
+    /* Currently, you must explicitly instantiate any
+       MSA templates that you use. */
+    // Element type and entry-ops template arguments match the MSA2DRM typedef in t3.C.
+    group MSA_CacheGroup<double, DefaultEntry<double>, MSA_DEFAULT_ENTRIES_PER_PAGE>;
+    array [1D] MSA_PageArray<double, DefaultEntry<double>, MSA_DEFAULT_ENTRIES_PER_PAGE>;
+};