Bug #2046: TRAM higher-dimensional chare array bugfixes 29/4929/9
author Venkatasubrahmanian Narayanan <vn7@illinois.edu>
Mon, 28 Jan 2019 20:22:20 +0000 (14:22 -0600)
committer Evan Ramos <evan@hpccharm.com>
Fri, 5 Apr 2019 03:28:33 +0000 (22:28 -0500)
Fixed the implementation of TRAM to properly support
higher-dimensional chare arrays. Also made a minor change to the
interface file parser to emit code compatible with this implementation.

Change-Id: Ica0d87aa65f827ba9d47fa9e9053defa71a0146a

examples/charm++/TRAM/randomAccessArray/randomAccess.C
examples/charm++/TRAM/randomAccessArray/randomAccess.ci
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.C
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.h
src/xlat-i/xi-Entry.C
tests/charm++/randomTRAM3D/Makefile [new file with mode: 0644]
tests/charm++/randomTRAM3D/tram3d.C [new file with mode: 0644]
tests/charm++/randomTRAM3D/tram3d.ci [new file with mode: 0644]

index 45dfc2b8f2d4f2404b75e6dc758b5fd53c02d84c..149474a8a387c30cb9878a9f88b091fbba6a27e5 100644 (file)
@@ -14,7 +14,7 @@ CmiInt8 localTableSize;
 // Handle to the test driver (chare)
 CProxy_TestDriver driverProxy;
 // Handle to the communication library (group)
-CProxy_ArrayMeshStreamer<dtype, int, Updater,
+CProxy_ArrayMeshStreamer<dtype, CkArrayIndex1D, Updater,
                          SimpleMeshRouter> aggregator;
 // Number of chares per PE
 int numElementsPerPe;
@@ -49,7 +49,7 @@ public:
 
     // Instantiate communication library group with a handle to the client
     aggregator =
-      CProxy_ArrayMeshStreamer<dtype, int, Updater, SimpleMeshRouter>
+      CProxy_ArrayMeshStreamer<dtype, CkArrayIndex1D, Updater, SimpleMeshRouter>
       ::ckNew(numMsgsBuffered, 2, dims, updater_array, 1);
 
     delete args;
@@ -139,7 +139,7 @@ public:
     CmiUInt8 key = HPCC_starts(4 * globalStartmyProc);
     // Get a pointer to the local communication library object
     //  from its proxy handle
-    ArrayMeshStreamer<dtype, int, Updater, SimpleMeshRouter>
+    ArrayMeshStreamer<dtype, CkArrayIndex1D, Updater, SimpleMeshRouter>
       * localAggregator = aggregator.ckLocalBranch();
 
     // Generate this chare's share of global updates
index 5d3d3552c5ddf95c13aba5338464dc03d0ead8fa..bdca1e52f5fbb342c15c897a78e015f344fa8f95 100644 (file)
@@ -7,7 +7,7 @@ mainmodule randomAccess {
   // Handle to the test driver (chare)
   readonly CProxy_TestDriver               driverProxy;
   // Handle to the communication library (group)
-  readonly CProxy_ArrayMeshStreamer<dtype, int, Updater,
+  readonly CProxy_ArrayMeshStreamer<dtype, CkArrayIndex1D, Updater,
                                     SimpleMeshRouter> aggregator;
   // Number of chares per PE
   readonly int            numElementsPerPe;
@@ -29,8 +29,8 @@ mainmodule randomAccess {
   };
 
   // Setup required for the communication library
-  message MeshStreamerMessage<ArrayDataItem<dtype, int> >;
-  group ArrayMeshStreamer<dtype, int, Updater, SimpleMeshRouter>;
-  group MeshStreamer<ArrayDataItem<dtype, int>, SimpleMeshRouter>;
+  message MeshStreamerMessage<ArrayDataItem<dtype, CkArrayIndex1D> >;
+  group ArrayMeshStreamer<dtype, CkArrayIndex1D, Updater, SimpleMeshRouter>;
+  group MeshStreamer<ArrayDataItem<dtype, CkArrayIndex1D>, SimpleMeshRouter>;
 
 };
index 278e0066278fd7d5b41804ea00513abf48375924..2537f954d7731625afac1429e868842f2427431f 100644 (file)
@@ -2,3 +2,29 @@
 
 #include "NDMeshStreamer.h"
 #include "NDMeshStreamer.def.h"
+
+//Definitions of the static sentinel indices declared by the TramBroadcastInstance specializations in NDMeshStreamer.h: one per index dimensionality, with every coordinate set to TRAM_BROADCAST
+CkArrayIndex1D TramBroadcastInstance<CkArrayIndex1D>::value=TRAM_BROADCAST;
+
+CkArrayIndex2D TramBroadcastInstance<CkArrayIndex2D>::value=CkArrayIndex2D(TRAM_BROADCAST,TRAM_BROADCAST);
+
+CkArrayIndex3D TramBroadcastInstance<CkArrayIndex3D>::value=CkArrayIndex3D(TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST);
+
+CkArrayIndex4D TramBroadcastInstance<CkArrayIndex4D>::value=CkArrayIndex4D(TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST);
+
+CkArrayIndex5D TramBroadcastInstance<CkArrayIndex5D>::value=CkArrayIndex5D(TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST);
+
+CkArrayIndex6D TramBroadcastInstance<CkArrayIndex6D>::value=CkArrayIndex6D(TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST,TRAM_BROADCAST);
+
+//Returns the TRAM_BROADCAST sentinel index for an array with `dims` dimensions (1-6); any other value aborts through CmiAbort
+CkArrayIndex& TramBroadcastInstance<CkArrayIndex>::value(int dims) {
+  switch(dims) {
+    case 1: return TramBroadcastInstance<CkArrayIndex1D>::value;
+    case 2: return TramBroadcastInstance<CkArrayIndex2D>::value;
+    case 3: return TramBroadcastInstance<CkArrayIndex3D>::value;
+    case 4: return TramBroadcastInstance<CkArrayIndex4D>::value;
+    case 5: return TramBroadcastInstance<CkArrayIndex5D>::value;
+    case 6: return TramBroadcastInstance<CkArrayIndex6D>::value;
+    default: CmiAbort("TRAM only supports 1-6D arrays\n");
+  }
+}
index 560e0cfa7a687b756371c467740e28433134d6b2..3de3c6c2f8f60e93da0daf999c3082374adb6e6b 100644 (file)
 
 extern void QdCreate(int n);
 extern void QdProcess(int n);
+//TramBroadcastInstance<itype> supplies the array index value that marks a TRAM broadcast for index type itype; only the specializations below are declared
+template<class itype>
+struct TramBroadcastInstance;
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex1D>{
+  static CkArrayIndex1D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex2D>{
+  static CkArrayIndex2D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex3D>{
+  static CkArrayIndex3D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex4D>{
+  static CkArrayIndex4D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex5D>{
+  static CkArrayIndex5D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex6D>{
+  static CkArrayIndex6D value;
+};
+
+template<>
+struct TramBroadcastInstance<CkArrayIndex>{
+  static CkArrayIndex& value(int);  //argument: number of array dimensions of the index being tested
+};
 
 template<class dtype>
 class MeshStreamerMessage : public CMessage_MeshStreamerMessage<dtype> {
@@ -1002,7 +1040,7 @@ private:
   void localDeliver(const ArrayDataItem<dtype, itype>& packedDataItem) {
 
     itype arrayId = packedDataItem.arrayIndex;
-    if (arrayId == itype(TRAM_BROADCAST)) {
+    if (arrayId == TramBroadcastInstance<CkArrayIndex>::value(arrayId.dimension)) {
       localBroadcast(packedDataItem);
       return;
     }
@@ -1211,12 +1249,12 @@ public:
   void processLocationRequest(itype arrayId, int deliveredToPe, int sourcePe) {
     int ownerPe = clientArrayMgr_->lastKnown((CkArrayIndex)arrayId);
     this->thisProxy[deliveredToPe].resendMisdeliveredItems(arrayId, ownerPe);
-    this->thisProxy[sourcePe].updateLocationAtSource(arrayId, sourcePe);
+    this->thisProxy[sourcePe].updateLocationAtSource(arrayId, ownerPe);  // send the source the owner PE looked up above, not its own PE number
   }
 
   void resendMisdeliveredItems(itype arrayId, int destinationPe) {
 
-    clientLocMgr_->updateLocation(arrayId, destinationPe);
+    clientLocMgr_->updateLocation(arrayId, clientLocMgr_->lookupID(arrayId),destinationPe);
 
     std::vector<ArrayDataItem<dtype, itype> > &bufferedItems
       = misdeliveredItems[arrayId];
@@ -1235,7 +1273,7 @@ public:
     int prevOwner = clientArrayMgr_->lastKnown((CkArrayIndex)arrayId);
 
     if (prevOwner != destinationPe) {
-      clientLocMgr_->updateLocation(arrayId, destinationPe);
+      clientLocMgr_->updateLocation(arrayId,clientLocMgr_->lookupID(arrayId), destinationPe);
 
       // it is possible to also fix destinations of items buffered for arrayId,
       // but the search could be expensive; instead, with the current code
index 3e741034ddb5be71d08c97c3a1d55f421eab480a..44a3f7cae3ebf3f5b6740c1db16bdb167075b7f6 100644 (file)
@@ -981,11 +981,7 @@ XStr Entry::aggregatorIndexType() {
   } else if (container->isArray()) {
     XStr dim, arrayIndexType;
     dim << ((Array*)container)->dim();
-    if (dim == "1D") {
-      indexType << "int";
-    } else {
-      indexType << "CkArrayIndex";
-    }
+     indexType << "CkArrayIndex";
   }
   return indexType;
 }
@@ -1088,13 +1084,7 @@ void Entry::genTramDefs(XStr& str) {
     str << "  const CkArrayIndex &myIndex = ckGetIndex();\n"
         << "  " << aggregatorName() << "->insertData<" << (isInline() ? "true" : "false")
         << ">(" << param->param->name;
-    if (dim == (const char*)"1D") {
-      str << ", "
-          << "myIndex.data()[0]);\n}\n";
-    } else {
-      str << ", "
-          << "myIndex);\n}\n";
-    }
+    str << ", " << "myIndex);\n}\n";
   }
 }
 
diff --git a/tests/charm++/randomTRAM3D/Makefile b/tests/charm++/randomTRAM3D/Makefile
new file mode 100644 (file)
index 0000000..37632e0
--- /dev/null
@@ -0,0 +1,22 @@
+-include ../../common.mk
+-include ../../../include/conv-mach-opt.mak
+CHARMC = ../../../bin/charmc $(OPTS)
+
+OBJS = tram3d.o
+.PHONY: all test clean
+all: tram3d  # default goal: build the test binary
+
+tram3d: $(OBJS)
+       $(CHARMC) $(CHARMCFLAGS) -language charm++ -o tram3d $(OBJS) -module NDMeshStreamer
+
+tram3d.def.h: tram3d.ci
+       $(CHARMC) $(CHARMCFLAGS) tram3d.ci
+
+tram3d.o: tram3d.C tram3d.def.h
+       $(CHARMC) $(CHARMCFLAGS) -c tram3d.C
+
+test: tram3d  # runs the binary with +p4; `run` is presumably defined by common.mk -- verify
+       $(call run, ./tram3d +p4 )
+
+clean:
+       rm -f *.o *.decl.h *.def.h tram3d charmrun*
diff --git a/tests/charm++/randomTRAM3D/tram3d.C b/tests/charm++/randomTRAM3D/tram3d.C
new file mode 100644 (file)
index 0000000..24cf059
--- /dev/null
@@ -0,0 +1,95 @@
+#include "tram3d.decl.h"
+#include <algorithm>
+#include <climits>
+#include <random>
+#include <vector>
+CProxy_main master; //readonly
+class main : public CBase_main  // mainchare: builds the N*N*N Test array, starts the run and checks the reduced result
+{
+  CProxy_Test blocks;
+  int N;
+
+ public:
+  main(CkArgMsg* args)
+  {
+    N = 2;
+    CkArrayOptions opts;
+    opts.setBounds(N, N, N);
+    blocks = CProxy_Test::ckNew(opts);
+    std::mt19937 engine(37);  // arbitrarily selected constant seed for reproducibility
+    std::uniform_int_distribution<> distro(INT_MIN, INT_MAX);
+    master = thisProxy;
+    for (int i = 0; i != N; ++i)
+    {
+      for (int j = 0; j != N; ++j)
+      {
+        for (int k = 0; k != N; ++k)
+        {
+          blocks(i, j, k).insert(distro(engine), N);
+        }
+      }
+    }
+    blocks.doneInserting();
+    blocks.run();
+    delete args;
+  }
+  void endexec(int val)  // reduction target: val is the sum of the per-element minima
+  {
+    /* The test has 3 phases: initialization, distribution, and
+    reduction.
+
+    In phase 1, the main chare generates a starter value for each
+    element of the array, using a well-defined constant as the
+    initial seed. Each element stores a series of N*N*N values
+    generated by incrementing the starter value it
+    received.
+
+    In phase 2, all the elements redistribute their values by sending
+    one value to every element of the array, chosen by index (including
+    themselves).
+
+    In phase 3, after an element has received all of its new values,
+    it contributes the minimum of the values it received into
+    a sum-reduction.
+
+    The value below is obtained by running the test code without
+    TRAM enabled, with the same deterministic seed.
+    */
+    if (val != 488803188)
+    {
+      CkAbort("Messages not delivered correctly!");
+    }
+    else
+    {
+      CkPrintf("The sum of minimal values across chares is %d\n", val);
+      CkExit();
+    }
+  }
+};
+class Test : public CBase_Test
+{
+  Test_SDAG_CODE
+  std::vector<int> values;
+  std::vector<int> recvd;
+  int N;
+  int count1, count2, count3;
+
+ public:
+  Test() {}
+  Test(int seed, int N)
+      : values([this, seed, N]() mutable {
+          std::vector<int> temp;
+          temp.reserve(N * N * N);
+          std::generate_n(std::back_inserter(temp), N * N * N,
+                          [seed]() mutable { return seed++; });
+          return temp;
+        }()),
+        N(N),
+        count1(0),
+        count2(0),
+        count3(0)
+  {
+    recvd.reserve(N * N * N);
+  }
+};
+#include "tram3d.def.h"
diff --git a/tests/charm++/randomTRAM3D/tram3d.ci b/tests/charm++/randomTRAM3D/tram3d.ci
new file mode 100644 (file)
index 0000000..5cb92cc
--- /dev/null
@@ -0,0 +1,32 @@
+mainmodule tram3d {
+  mainchare main {
+    entry main(CkArgMsg*);
+    entry [reductiontarget] void endexec(int);
+  };
+  readonly CProxy_main master;
+  array [3D] Test {
+    entry Test();
+    entry Test(int,int);
+    entry void run() {
+      for (count1=0;count1!=N;++count1) {
+        for (count2=0;count2!=N;++count2) {
+          for (count3=0;count3!=N;++count3) serial {
+            thisProxy(count1,count2,count3).clock(values[count1*N*N+count2*N+count3]);
+          }
+        }
+      }
+      for (count1=0;count1!=N*N*N;++count1) {
+        when clock(int j) serial {
+          recvd.emplace_back(j);
+        }
+      }
+      serial {
+        auto min_iter=std::min_element(recvd.begin(),recvd.end());
+        CkCallback cb(CkReductionTarget(main,endexec),master);
+        contribute(sizeof(int),&(*min_iter),CkReduction::sum_int,cb);
+        //&(*min_iter): dereference, then take the address, because the iterator itself is not a raw pointer to the element
+      }
+    }
+    entry [aggregate] void clock(int);
+  };
+};