fix for migration
[charm.git] / src / ck-core / ckmemcheckpoint.C
index 26da67f9ed6878182f316eb20c62cd6926f8375f..c4dc028c8dd537c689026fdc02c8c05a34bcbdc7 100644 (file)
@@ -391,8 +391,8 @@ CkMemCheckPT::CkMemCheckPT(int w)
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
-  //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-  //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+  CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+  CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
   chkpTable[0] = NULL;
   chkpTable[1] = NULL;
@@ -424,8 +424,8 @@ void CkMemCheckPT::pup(PUP::er& p)
 #if CMK_CONVERSE_MPI
     void pingBuddy();
     void pingCheckHandler();
-    //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-    //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+    CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+    CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
     maxIter = -1;
     recvIterCount = 0;
@@ -661,7 +661,7 @@ void pupAllElements(PUP::er &p){
   p | numElements;
   if(!p.isUnpacking()){
     MemElementPacker packer(p);
-    CKLOCMGR_LOOP(mgr->iterate(packer););
+    CKLOCMGR_LOOP(mgr->iterateLocal(packer););
     packer.writeCheckpoint();
   }
 #endif
@@ -1188,22 +1188,22 @@ void CkMemCheckPT::RollBack(){
     void CkMemCheckPT::restart(int diePe)
     {
 #if CMK_MEM_CHECKPOINT
+      thisFailedPe = diePe;
       double curTime = CmiWallTimer();
-      if (CkMyPe() == diePe){
+      if (CkMyPe() == thisFailedPe){
         restartT = CmiWallTimer();
         CkPrintf("[%d] Process data restored in %f seconds\n", CkMyPe(), curTime - startTime);
       }
       stage = (char*)"resetLB";
       startTime = curTime;
-      if (CkMyPe() == diePe)
+      if (CkMyPe() == thisFailedPe)
         CkPrintf("[%d] CkMemCheckPT ----- restart.\n",CkMyPe());
 
-#if CK_NO_PROC_POOL
-      failed(diePe);   // add into the list of failed pes
-#endif
-      thisFailedPe = diePe;
+//#if CK_NO_PROC_POOL
+//      failed(diePe); // add into the list of failed pes
+//#endif
 
-      if (CkMyPe() == diePe) CmiAssert(ckTable.length() == 0);
+//      if (CkMyPe() == diePe) CmiAssert(ckTable.length() == 0);
 
       inRestarting = 1;
 
@@ -1253,7 +1253,8 @@ void CkMemCheckPT::RollBack(){
     void CkMemCheckPT::resetReductionMgr()
     {
       if (CkMyPe() == thisFailedPe) 
-        CkPrintf("[%d] CkMemCheckPT ----- resetReductionMgr\n",CkMyPe());
+        CkPrintf("[%d] CkMemCheckPT ----- resetReductionMgr at %lf\n",CkMyPe(),CmiWallTimer());
+      stage = (char *)"resetReductionMgr";
       int numGroups = CkpvAccess(_groupIDTable)->size();
       for(int i=0;i<numGroups;i++) {
         CkGroupID gID = (*CkpvAccess(_groupIDTable))[i];
@@ -1266,8 +1267,11 @@ void CkMemCheckPT::RollBack(){
       if(CmiNumPartition()==1){
         barrier(CkCallback(CkIndex_CkMemCheckPT::recoverBuddies(), thisProxy));
       }
-      else
+      else{
+       if (CkMyPe() == thisFailedPe) 
+         CkPrintf("[%d] CkMemCheckPT ----- resetReductionMgr ends at %lf\n",CkMyPe(),CmiWallTimer());
         barrier(CkCallback(CkReductionTarget(CkMemCheckPT, recoverArrayElements), thisProxy));
+      }
     }
 
     // recover the lost buddies
@@ -1356,10 +1360,8 @@ void CkMemCheckPT::RollBack(){
     {
       double curTime = CmiWallTimer();
       int len = ckTable.length();
-      //CkPrintf("[%d] CkMemCheckPT ----- %s len: %d in %f seconds \n",CkMyPe(), stage, len, curTime-startTime);
+      CkPrintf("[%d] CkMemCheckPT ----- %s len: %d in %f seconds \n",CkMyPe(), stage, len, curTime-startTime);
       stage = (char *)"recoverArrayElements";
-      if (CkMyPe() == thisFailedPe)
-        CmiPrintf("[%d] CkMemCheckPT ----- %s starts at %f \n",CkMyPe(), stage, curTime);
       startTime = curTime;
       int flag = 0;
       // recover all array elements
@@ -1429,6 +1431,8 @@ void CkMemCheckPT::RollBack(){
       delete [] gmap;
 #endif
       DEBUGF("[%d] recoverArrayElements restore %d objects\n", CkMyPe(), count);
+      CkPrintf("[%d] recoverArrayElements restore %d objects\n", CkMyPe(), count);
+      //if (CkMyPe() == thisFailedPe)
 
       CKLOCMGR_LOOP(mgr->doneInserting(););
 
@@ -1518,6 +1522,7 @@ void CkMemCheckPT::RollBack(){
         }
       }
       if (CmiMyPe() == BuddyPE(thisFailedPe)) {
+        lastPingTime = CmiWallTimer();
         CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
       }
 #endif
@@ -1635,7 +1640,7 @@ void CkMemCheckPT::RollBack(){
 #if CMK_MEM_CHECKPOINT
 #if CMK_USE_BARRIER
       if(CkMyPe()!=_diePE){
-        printf("restar begin on %d\n",CkMyPe());
+        printf("restart begin on %d at %lf\n",CkMyPe(),CmiWallTimer());
         char *restartmsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
         CmiSetHandler(restartmsg, restartBeginHandlerIdx);
         CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
@@ -1871,7 +1876,17 @@ void CkMemCheckPT::RollBack(){
         _diePE = CpvAccess(_crashedNode);
         char *restartmsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
         CmiSetHandler(restartmsg, restartBeginHandlerIdx);
-        CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+        //CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+#if CMK_USE_BARRIER
+      //CmiPrintf("before reduce\n");  
+       if(CpvAccess(resilience)==1){
+         CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+       }else
+         CmiReduce(restartmsg,CmiMsgHeaderSizeBytes,doNothingMsg);
+      //CmiPrintf("after reduce\n");   
+#else
+       CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+#endif 
       }
     }
 
@@ -1893,7 +1908,17 @@ void CkMemCheckPT::RollBack(){
         //CmiPrintf("[%d] send to die pe %d\n",CkMyPe(),_diePE);
         char *restartmsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
         CmiSetHandler(restartmsg, restartBeginHandlerIdx);
-        CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+        //CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+#if CMK_USE_BARRIER
+      //CmiPrintf("before reduce\n");  
+       if(CpvAccess(resilience)==1){
+         CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+       }else
+         CmiReduce(restartmsg,CmiMsgHeaderSizeBytes,doNothingMsg);
+      //CmiPrintf("after reduce\n");   
+#else
+       CmiSyncSendAndFree(_diePE, CmiMsgHeaderSizeBytes, (char *)restartmsg);
+#endif 
       }
     }
 
@@ -2265,7 +2290,7 @@ void CkMemCheckPT::RollBack(){
     {
 #if CMK_MEM_CHECKPOINT
       double now = CmiWallTimer();
-      if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
+      if (lastPingTime > 0 && now - lastPingTime > 4 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
         //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
         int i, pe, buddy;
         // tell everyone the buddy dies