New files and modifications to LBDatabase to include Team Load Balancer, a two stage...
authorEsteban Meneses <emenese2@illinois.edu>
Wed, 24 Nov 2010 17:44:18 +0000 (11:44 -0600)
committerEsteban Meneses <emenese2@illinois.edu>
Wed, 24 Nov 2010 17:44:18 +0000 (11:44 -0600)
src/ck-ldb/EveryLB.ci
src/ck-ldb/LBDatabase.C
src/ck-ldb/LBDatabase.h
src/ck-ldb/Make.lb
src/ck-ldb/TeamLB.C [new file with mode: 0644]
src/ck-ldb/TeamLB.ci [new file with mode: 0644]
src/ck-ldb/TeamLB.h [new file with mode: 0644]
src/ck-ldb/libmoduleTeamLB.dep [new file with mode: 0644]
src/scripts/Make.cidepends
src/scripts/Make.depends

index ea6959c49e3a53e1d56b682f8ee561d2801d76a6..c8d0df5afb58f0ff67de5a8c68a5f3d22200a47b 100644 (file)
@@ -15,6 +15,7 @@ module EveryLB {
   extern module GridHybridLB;
   extern module GridHybridSeedLB;
   extern module GridMetisLB;
+  extern module TeamLB;
   extern module HbmLB;
   extern module HybridLB;
   extern module MetisLB;
index ff2e0cae533dfb1ab48b4bbe4ac2ed6b0bede10b..a3e0f028dbf16fcdd228b2e09fb33613f59b4715 100644 (file)
@@ -244,6 +244,11 @@ void _loadbalancerInit()
     _lb_args.debug() = CmiGetArgFlagDesc(argv, "+LBDebug", 
                                             "Turn on LB debugging printouts");
 
+  // getting the size of the team with +teamSize
+  if (!CmiGetArgIntDesc(argv, "+teamSize", &_lb_args.teamSize(), 
+                                          "Team size"))
+    _lb_args.teamSize() = 1;
+
   // ask to print summary/quality of load balancer
   _lb_args.printSummary() = CmiGetArgFlagDesc(argv, "+LBPrintSummary",
                "Print load balancing result summary");
index 1d6bd8ce9977134c3699b178484a49f7e59b70d5..335ce1427628f60d3fcf09a6ca7cff3ca0eb4799 100644 (file)
@@ -41,6 +41,7 @@ private:
   int _lb_traceComm;           // stats collection for comm
   int _lb_central_pe;           // processor number for centralized startegy
   int _lb_percentMovesAllowed; //Specifies restriction on num of chares to be moved(as a percentage of total number of chares). Used by RefineKLB
+  int _lb_teamSize;            // specifies the team size for TeamLB
 public:
   CkLBArgs() {
 #if CMK_BLUEGENE_CHARM
@@ -54,9 +55,11 @@ public:
     _lb_percentMovesAllowed=100;
     _lb_loop = 0;
     _lb_central_pe = 0;
+       _lb_teamSize = 1;
   }
   inline double & lbperiod() { return _autoLbPeriod; }
   inline int & debug() { return _lb_debug; }
+  inline int & teamSize() {return _lb_teamSize; }
   inline int & printSummary() { return _lb_printsumamry; }
   inline int & lbversion() { return _lb_version; }
   inline int & loop() { return _lb_loop; }
index 9e75180e2712a81a4593866201e9745215544ccd..baa5ae6cfa8847e80389ec9bd946b694a55f78e3 100644 (file)
@@ -17,6 +17,7 @@ ALL_LDBS=\
    $(L)/libmoduleGridHybridLB.a \
    $(L)/libmoduleGridHybridSeedLB.a \
    $(L)/libmoduleGridMetisLB.a \
+   $(L)/libmoduleTeamLB.a \
    $(L)/libmoduleHbmLB.a \
    $(L)/libmoduleHybridLB.a \
    $(L)/libmoduleMetisLB.a \
@@ -120,6 +121,12 @@ $(L)/libmoduleGridMetisLB.a: GridMetisLB.o
 LBHEADERS += GridMetisLB.h GridMetisLB.decl.h
 
 
+$(L)/libmoduleTeamLB.a: TeamLB.o 
+       $(CHARMC) -o $(L)/libmoduleTeamLB.a TeamLB.o 
+       cp -f libmoduleTeamLB.dep $(L)/
+LBHEADERS += TeamLB.h TeamLB.decl.h
+
+
 $(L)/libmoduleHbmLB.a: HbmLB.o 
        $(CHARMC) -o $(L)/libmoduleHbmLB.a HbmLB.o 
        
@@ -232,6 +239,7 @@ ALL_LB_OBJS=EveryLB.o \
     GridHybridLB.o \
     GridHybridSeedLB.o \
     GridMetisLB.o \
+    TeamLB.o \
     HbmLB.o \
     HybridLB.o \
     MetisLB.o \
@@ -265,6 +273,7 @@ EVERYLB_DEPS=EveryLB.o \
     GridHybridLB.o \
     GridHybridSeedLB.o \
     GridMetisLB.o \
+    TeamLB.o \
     HbmLB.o \
     HybridLB.o \
     MetisLB.o \
diff --git a/src/ck-ldb/TeamLB.C b/src/ck-ldb/TeamLB.C
new file mode 100644 (file)
index 0000000..e3ac2b0
--- /dev/null
@@ -0,0 +1,319 @@
+/*****************************************************************************
+ * $Source$
+ * $Author$
+ * $Date$
+ * $Revision$
+ *****************************************************************************/
+
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+#include <charm++.h>
+
+#include "cklists.h"
+
+#include "TeamLB.h"
+
+CreateLBFunc_Def(TeamLB, "Use Metis(tm) to partition object graph at two levels: team level and processor level")
+
+TeamLB::TeamLB(const CkLBOptions &opt): CentralLB(opt)
+{
+  lbname = "TeamLB";
+  if (CkMyPe() == 0)
+    CkPrintf("[%d] TeamLB created\n",CkMyPe());
+
+       // setting number of teams and team size
+       teamSize = _lb_args.teamSize();
+       numberTeams = CkNumPes() / teamSize;
+}
+
+static void printStats(int count, int numobjs, double *cputimes, 
+                       int **comm, int *map)
+{
+  int i, j;
+  double *petimes = new double[count];
+  for(i=0;i<count;i++) {
+    petimes[i] = 0.0;
+  }
+  for(i=0;i<numobjs;i++) {
+    petimes[map[i]] += cputimes[i];
+  }
+  double maxpe = petimes[0], minpe = petimes[0];
+  CkPrintf("\tPE\tTimexSpeed\n");
+  for(i=0;i<count;i++) {
+    CkPrintf("\t%d\t%lf\n",i,petimes[i]);
+    if(maxpe < petimes[i])
+      maxpe = petimes[i];
+    if(minpe > petimes[i])
+      minpe = petimes[i];
+  }
+  delete[] petimes;
+  CkPrintf("\tLoad Imbalance=%lf seconds\n", maxpe-minpe);
+  int ncomm = 0;
+  for(i=0;i<numobjs;i++) {
+    for(j=0;j<numobjs;j++) {
+      if(map[i] != map[j])
+        ncomm += comm[i][j];
+    }
+  }
+  CkPrintf("\tCommunication (off proc msgs) = %d\n", ncomm/2);
+}
+
+/**
+ * @brief METIS function that performs a balanced k-way partitioning of the graph, considering
+ * the communication volume (hence the "V" in the name of the function).
+ */
+extern "C" void METIS_PartGraphVKway(int*, int*, int*, int*, int*,
+                                    int*, int*, int*, int*,
+                                    int*, int*);
+
+
+/**
+ * @brief Load balancing function. It uses METIS in a two step approach. The first step consists in
+ * splitting the objects into teams. METIS is able to minimize the communication volume across the teams
+ * while balancing the load among the different teams. The second step goes deep in each team to balance
+ * the load in the processors belonging to that particular team.
+ */
+void TeamLB::work(LDStats* stats)
+{
+       int i, j, m, k, node;
+       int option = 0;
+
+       if (_lb_args.debug() >= 2) {
+               CkPrintf("[%d] In TeamLB Strategy...\n", CkMyPe());
+       }
+
+       // making a communication hash
+       stats->makeCommHash();
+
+       int n_pes = stats->nprocs();
+       int numobjs = stats->n_objs;
+
+       // removing non-migratable objects
+       removeNonMigratable(stats, n_pes);
+
+       // allocate space for the computing data
+       double *objtime = new double[numobjs];
+       int *objwt = new int[numobjs];
+       int *teamObjwt = new int[numobjs];
+       int *origmap = new int[numobjs];
+       LDObjHandle *handles = new LDObjHandle[numobjs];
+
+       for(i=0;i<numobjs;i++) {
+               objtime[i] = 0.0;
+               objwt[i] = 0;
+               origmap[i] = 0;
+       }
+
+       for (i=0; i<stats->n_objs; i++) {
+               LDObjData &odata = stats->objData[i];
+               if (!odata.migratable) 
+                       CmiAbort("TeamLB does not dupport nonmigratable object.\n");
+               int frompe = stats->from_proc[i];
+               origmap[i] = frompe;
+               objtime[i] = odata.wallTime*stats->procs[frompe].pe_speed;
+               handles[i] = odata.handle;
+       }
+
+       // to convert the weights on vertices to integers
+       double max_objtime = objtime[0];
+       for(i=0; i<numobjs; i++) {
+               if(max_objtime < objtime[i])
+                       max_objtime = objtime[i];
+       }
+       double ratio = 1000.0/max_objtime;
+       for(i=0; i<numobjs; i++) {
+               objwt[i] = (int)(objtime[i]*ratio);
+       }
+       int **comm = new int*[numobjs];
+       for (i=0; i<numobjs; i++) {
+               comm[i] = new int[numobjs];
+               for (j=0; j<numobjs; j++)  {
+                       comm[i][j] = 0;
+               }
+       }
+
+       const int csz = stats->n_comm;
+       for(i=0; i<csz; i++) {
+               LDCommData &cdata = stats->commData[i];
+               if(!cdata.from_proc() && cdata.receiver.get_type() == LD_OBJ_MSG){
+                       int senderID = stats->getHash(cdata.sender);
+                       int recverID = stats->getHash(cdata.receiver.get_destObj());
+                       if (stats->complete_flag == 0 && recverID == -1) continue;
+                       CmiAssert(senderID < numobjs && senderID >= 0);
+                       CmiAssert(recverID < numobjs && recverID >= 0);
+                       comm[senderID][recverID] += cdata.messages;
+                       comm[recverID][senderID] += cdata.messages;
+               } else if (cdata.receiver.get_type() == LD_OBJLIST_MSG) {
+                       int nobjs;
+                       LDObjKey *objs = cdata.receiver.get_destObjs(nobjs);
+                       int senderID = stats->getHash(cdata.sender);
+                       for (j=0; j<nobjs; j++) {
+                               int recverID = stats->getHash(objs[j]);
+                               if((senderID == -1)||(recverID == -1))
+                                       if (_lb_args.migObjOnly()) continue;
+                                       else CkAbort("Error in search\n");
+                               comm[senderID][recverID] += cdata.messages;
+                               comm[recverID][senderID] += cdata.messages;
+                       }
+               }
+       }
+
+       // ignore messages sent from an object to itself
+       for (i=0; i<numobjs; i++)
+               comm[i][i] = 0;
+
+       // construct the graph in CSR format
+       int *xadj = new int[numobjs+1];
+       int *teamXadj = new int[numobjs+1];
+       int numedges = 0;
+       for(i=0;i<numobjs;i++) {
+               for(j=0;j<numobjs;j++) {
+                       if(comm[i][j] != 0)
+                               numedges++;
+               }
+       }
+       int *adjncy = new int[numedges];
+       int *teamAdjncy = new int[numedges];
+       int *edgewt = new int[numedges];
+       int *teamEdgewt = new int[numedges];
+       xadj[0] = 0;
+       int count4all = 0;
+       for (i=0; i<numobjs; i++) {
+               for (j=0; j<numobjs; j++) { 
+                       if (comm[i][j] != 0) { 
+                               adjncy[count4all] = j;
+                               edgewt[count4all++] = comm[i][j];
+                       }
+               }
+               xadj[i+1] = count4all;
+       }
+
+       if (_lb_args.debug() >= 2) {
+               CkPrintf("Pre-LDB Statistics step %d\n", step());
+               printStats(n_pes, numobjs, objtime, comm, origmap);
+       }
+
+       int wgtflag = 3; // Weights both on vertices and edges
+       int numflag = 0; // C Style numbering
+       int options[5];
+       options[0] = 0;
+       int edgecut, teamEdgecut;
+       int *newmap, *teamMap, *mapping, *invMapping, *procMapping;
+       int sameMapFlag = 1;
+
+       if (n_pes < 1) {
+               CkPrintf("error: Number of Pe less than 1!");
+       } else if (n_pes == 1) {
+               newmap = origmap;
+               sameMapFlag = 1;
+       } else {
+               sameMapFlag = 0;
+               newmap = new int[numobjs];
+               teamMap = new int[numobjs];
+               mapping = new int[numobjs];
+               invMapping = new int[numobjs];
+               procMapping = new int[numobjs];
+               if (_lb_args.debug() >= 1)
+                       CkPrintf("[%d] calling METIS_PartGraphVKway.\n", CkMyPe());
+               METIS_PartGraphVKway(&numobjs, xadj, adjncy, objwt, edgewt,
+                       &wgtflag, &numflag, &numberTeams, options,
+                       &edgecut, teamMap);
+               if (_lb_args.debug() >= 1)
+                       CkPrintf("[%d] after calling METIS_PartGraphVKway.\n", CkMyPe());
+       }
+
+       // partitioning each team
+       if(teamSize > 1){
+
+               // traversing the list of teams and load balancing each one
+               for(i=0; i<numberTeams; i++){
+                       int teamMembers = 0;
+
+                       // collecting all the elements of a particular team
+                       // mapping stores the association of local to global index
+                       // invMapping stores the inverse association
+                       for(j=0; j<numobjs; j++){
+                               if(teamMap[j] == i){
+                                       mapping[teamMembers] = j;
+                                       invMapping[j] = teamMembers;
+                                       teamObjwt[teamMembers] = objwt[j];
+                                       teamMembers++;
+                               }
+                       }
+
+                       // building up the adjacency data structures
+                       int teamIndex = 0;
+                       for(j=0; j<teamMembers; j++){
+                               teamXadj[j] = teamIndex;
+                               for(k=xadj[mapping[j]]; k<xadj[mapping[j]+1]; k++){
+                                       node = adjncy[k];
+                                       if(teamMap[node] == i){
+                                               teamAdjncy[teamIndex] = invMapping[node];
+                                               teamEdgewt[teamIndex] = edgewt[k];
+                                               teamIndex++;
+                                       }
+                               } 
+                       }
+                       teamXadj[teamMembers] = teamIndex;
+
+                       // calling METIS library
+                       METIS_PartGraphVKway(&teamMembers, teamXadj, teamAdjncy, teamObjwt, teamEdgewt, 
+                                                               &wgtflag, &numflag, &teamSize, options, &teamEdgecut, procMapping);
+
+                       // converting local mapping into global mapping
+                       for(j=0; j<teamMembers; j++){
+                               newmap[mapping[j]] = i*teamSize + procMapping[j];
+                       }
+                       
+
+               }
+
+               delete[] mapping;
+               delete[] invMapping;
+               delete[] procMapping;
+               delete[] teamMap;
+
+       } else {
+               delete[] newmap;
+               newmap = teamMap;
+       }
+       
+       if (_lb_args.debug() >= 2) {
+               CkPrintf("Post-LDB Statistics step %d\n", step());
+               printStats(n_pes, numobjs, objtime, comm, newmap);
+       }
+
+       for(i=0;i<numobjs;i++)
+               delete[] comm[i];
+       delete[] comm;
+       delete[] objtime;
+       delete[] xadj;
+       delete[] adjncy;
+       if(objwt) delete[] objwt;
+       if(edgewt) delete[] edgewt;
+       
+       if(!sameMapFlag) {
+               for(i=0; i<numobjs; i++) {
+                       if(origmap[i] != newmap[i]) {
+                               CmiAssert(stats->from_proc[i] == origmap[i]);
+                               stats->to_proc[i] =  newmap[i];
+                               if (_lb_args.debug() >= 3)
+                                       CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(),i,stats->from_proc[i],stats->to_proc[i]);
+                       }
+               }
+       }
+       
+       delete[] origmap;
+       if(newmap != origmap)
+               delete[] newmap;
+       if (_lb_args.debug() >= 1) {
+               CkPrintf("[%d] TeamLB done! \n", CkMyPe());
+       }
+}
+
+#include "TeamLB.def.h"
+
+/*@}*/
diff --git a/src/ck-ldb/TeamLB.ci b/src/ck-ldb/TeamLB.ci
new file mode 100644 (file)
index 0000000..03cdf1c
--- /dev/null
@@ -0,0 +1,9 @@
+module TeamLB {
+
+extern module CentralLB;
+initnode void lbinit(void);
+group [migratable] TeamLB : CentralLB {
+  entry void TeamLB(const CkLBOptions &);  
+};
+
+};
diff --git a/src/ck-ldb/TeamLB.h b/src/ck-ldb/TeamLB.h
new file mode 100644 (file)
index 0000000..80ffb45
--- /dev/null
@@ -0,0 +1,38 @@
+/*****************************************************************************
+ * $Source$
+ * $Author$
+ * $Date$
+ * $Revision$
+ *****************************************************************************/
+
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+#ifndef _TEAMLB_H_
+#define _TEAMLB_H_
+
+#include "CentralLB.h"
+#include "TeamLB.decl.h"
+
+void CreateTeamLB();
+BaseLB * AllocateTeamLB();
+
+class TeamLB : public CentralLB {
+public:
+  TeamLB(const CkLBOptions &);
+  TeamLB(CkMigrateMessage *m):CentralLB(m) { lbname = "TeamLB"; }
+private:
+       int teamSize;
+       int numberTeams;
+  CmiBool QueryBalanceNow(int step) { return CmiTrue; }
+  void work(LDStats* stats);
+};
+
+#define WEIGHTED 1
+#define MULTI_CONSTRAINT 2
+
+#endif /* _TEAMLB_H_ */
+
+/*@}*/
diff --git a/src/ck-ldb/libmoduleTeamLB.dep b/src/ck-ldb/libmoduleTeamLB.dep
new file mode 100644 (file)
index 0000000..4558eef
--- /dev/null
@@ -0,0 +1 @@
+-lmetis
index 003497bd2add39ab590e5e27d84b1a5e979978c1..5f64b6b828428221f92053050e2924c345d6ac95 100644 (file)
@@ -32,6 +32,7 @@ GridCommRefineLB.decl.h GridCommRefineLB.def.h: GridCommRefineLB.ci.stamp
 GridHybridLB.decl.h GridHybridLB.def.h: GridHybridLB.ci.stamp
 GridHybridSeedLB.decl.h GridHybridSeedLB.def.h: GridHybridSeedLB.ci.stamp
 GridMetisLB.decl.h GridMetisLB.def.h: GridMetisLB.ci.stamp
+TeamLB.decl.h TeamLB.def.h: TeamLB.ci.stamp
 HbmLB.decl.h HbmLB.def.h: HbmLB.ci.stamp
 HybridBaseLB.decl.h HybridBaseLB.def.h: HybridBaseLB.ci.stamp
 HybridLB.decl.h HybridLB.def.h: HybridLB.ci.stamp
index d9d448c885924063c7f97d2d1fa207b7d90609d5..125df0628f5c6ef0fc3a59f0ba04745eec18085c 100644 (file)
@@ -1439,7 +1439,7 @@ EveryLB.o: EveryLB.C charm++.h charm.h converse.h conv-config.h \
  GraphPartLB.decl.h GraphBFTLB.decl.h GreedyAgentLB.decl.h \
  GreedyCommLB.decl.h GreedyLB.decl.h GridCommLB.decl.h \
  GridCommRefineLB.decl.h GridHybridLB.decl.h GridHybridSeedLB.decl.h \
- GridMetisLB.decl.h HbmLB.decl.h NeighborLBMsg.h HybridLBMsg.h \
+ GridMetisLB.decl.h TeamLB.decl.h HbmLB.decl.h NeighborLBMsg.h HybridLBMsg.h \
  HybridLB.decl.h HybridBaseLB.decl.h MetisLB.decl.h NeighborCommLB.decl.h \
  NborBaseLB.decl.h NeighborLB.decl.h OrbLB.decl.h PhasebyArrayLB.decl.h \
  RandCentLB.decl.h RefineLB.decl.h RefineCommLB.decl.h RefineKLB.decl.h \
@@ -1738,6 +1738,25 @@ GridMetisLB.o: GridMetisLB.C GridMetisLB.decl.h charm++.h charm.h \
  manager.h GridMetisLB.def.h
        $(CHARMC) -c -I. GridMetisLB.C
 
+TeamLB.o: TeamLB.C TeamLB.decl.h charm++.h charm.h \
+ converse.h conv-config.h conv-autoconfig.h conv-common.h conv-mach.h \
+ conv-mach-opt.h pup_c.h queueing.h conv-cpm.h conv-cpath.h conv-qd.h \
+ conv-random.h conv-lists.h conv-trace.h persistent.h debug-conv.h pup.h \
+ middle.h middle-conv.h cklists.h ckbitvector.h ckstream.h init.h \
+ ckhashtable.h debug-charm.h simd.h CkMarshall.decl.h cksection.h \
+ ckcallback.h conv-ccs.h sockRoutines.h ccs-server.h ckobjQ.h \
+ ckreduction.h CkReduction.decl.h cknodegroupreduction.h \
+ CkArrayReductionMgr.decl.h ckmemcheckpoint.h CkMemCheckpoint.decl.h \
+ readonly.h ckarray.h cklocation.h LBDatabase.h lbdb.h LBDBManager.h \
+ LBObj.h LBOM.h LBComm.h LBMachineUtil.h lbdb++.h LBDatabase.decl.h \
+ NullLB.decl.h BaseLB.decl.h CkLocation.decl.h CkArray.decl.h \
+ CkFutures.decl.h charisma.h charisma.decl.h tempo.h tempo.decl.h \
+ waitqd.h waitqd.decl.h sdag.h ckcheckpoint.h CkCheckpoint.decl.h \
+ ckevacuation.h ckarrayreductionmgr.h trace.h trace-bluegene.h envelope.h \
+ CentralLB.decl.h CentralLBMsg.h TeamLB.h CentralLB.h BaseLB.h \
+ manager.h TeamLB.def.h
+       $(CHARMC) -c -I. TeamLB.C
+
 HbmLB.o: HbmLB.C charm++.h charm.h converse.h conv-config.h \
  conv-autoconfig.h conv-common.h conv-mach.h conv-mach-opt.h pup_c.h \
  queueing.h conv-cpm.h conv-cpath.h conv-qd.h conv-random.h conv-lists.h \