Do not detect a failure on node 0 while load balancing is in progress
author Xiang Ni <xiangni2@illinois.edu>
Fri, 19 Oct 2012 00:51:44 +0000 (19:51 -0500)
committer Xiang Ni <xiangni2@illinois.edu>
Fri, 19 Oct 2012 00:51:44 +0000 (19:51 -0500)
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.h
src/ck-ldb/CentralLB.C

index ee7881eda5fbb6bf210f51362715f4832c89eda9..173f9653c02b5c13b5bd41fc26d79981c73009b6 100644 (file)
@@ -71,6 +71,7 @@ CpvDeclare(int, _crashedNode);
 
 // static, so that it is accessible from Converse part
 int CkMemCheckPT::inRestarting = 0;
+int CkMemCheckPT::inLoadbalancing = 0;
 double CkMemCheckPT::startTime;
 char *CkMemCheckPT::stage;
 CkCallback CkMemCheckPT::cpCallback;
@@ -1349,6 +1350,27 @@ int CkInRestarting()
 #endif
 }
 
+// Raise the load-balancing flag (the one CkInLdb() reports).
+// Compiles to an empty function unless CMK_MEM_CHECKPOINT is enabled.
+extern "C" void CkSetInLdb()
+{
+#if CMK_MEM_CHECKPOINT
+  CkMemCheckPT::inLoadbalancing = 1;
+#endif
+}
+
+// Report whether the load-balancing flag is currently set.
+// Returns CkMemCheckPT::inLoadbalancing when CMK_MEM_CHECKPOINT is enabled.
+// Otherwise returns 0, so the function never falls off its end without a
+// value (which would be undefined behaviour for a non-void function).
+extern "C"
+int CkInLdb(){
+#if CMK_MEM_CHECKPOINT
+       return CkMemCheckPT::inLoadbalancing;
+#else
+       return 0;
+#endif
+}
+
+// Clear the load-balancing flag (the one CkInLdb() reports).
+// Compiles to an empty function unless CMK_MEM_CHECKPOINT is enabled.
+extern "C" void CkResetInLdb()
+{
+#if CMK_MEM_CHECKPOINT
+  CkMemCheckPT::inLoadbalancing = 0;
+#endif
+}
+
 /*****************************************************************************
                 module initialization
 *****************************************************************************/
@@ -1457,7 +1479,7 @@ void pingCheckHandler()
 {
 #if CMK_MEM_CHECKPOINT
   double now = CmiWallTimer();
-  if (lastPingTime > 0 && now - lastPingTime > 4) {
+  if (lastPingTime > 0 && now - lastPingTime > 4 && (!CkInLdb()||buddy!=0)) {
     int i, pe, buddy;
     // tell everyone the buddy dies
     CkMemCheckPT *obj = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch();
index 3829aab5a8aae791e91c6233a6c549ff4a37f668..901634ef06765f91e1d97aebfe5396f69a216bec 100644 (file)
@@ -97,6 +97,7 @@ public:
   static CkCallback  cpCallback;
 
   static int inRestarting;
+  static int inLoadbalancing;
   static double startTime;
   static char*  stage;
 private:
@@ -130,6 +131,10 @@ void CkStartMemCheckpoint(CkCallback &cb);
 
 // true if inside a restarting phase
 extern "C" int CkInRestarting(); 
+extern "C" int CkInLdb(); 
+extern "C" void CkSetInLdb(); 
+extern "C" void CkResetInLdb();
+
 extern "C" int CkHasCheckpoints();
 
 extern "C" void CkDieNow();
index 3c216822fa78a2329ddc8d5db4e52613497fee52..e0ec375a7e7e1247f94ab86986094c6c7b191a4d 100644 (file)
@@ -163,7 +163,9 @@ void CentralLB::AtSync()
 {
 #if CMK_LBDB_ON
   DEBUGF(("[%d] CentralLB AtSync step %d!!!!!\n",CkMyPe(),step()));
-
+#if CMK_MEM_CHECKPOINT 
+  CkSetInLdb();
+#endif
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
        CpvAccess(_currentObj)=this;
 #endif
@@ -764,6 +766,9 @@ extern int restarted;
 void CentralLB::ReceiveMigration(LBMigrateMsg *m)
 {
   storedMigrateMsg = m;
+#if CMK_MEM_CHECKPOINT
+  CkResetInLdb();
+#endif
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
        restoreParallelRecovery(&resumeAfterRestoreParallelRecovery,(void *)this);
 #else