fix a bug: notify the replica that the checkpoint is done only once per checkpoint
author Nikhil Jain <nikhil@illinois.edu>
Sun, 30 Dec 2012 18:37:44 +0000 (18:37 +0000)
committer Nikhil Jain <nikhil@illinois.edu>
Sun, 30 Dec 2012 18:37:44 +0000 (18:37 +0000)
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h

index 8fd099222988f0e5bde419ddb4e41449249736bd..96351c9e0ec8568f30a0bf9e35cda6945a93255b 100644 (file)
@@ -373,6 +373,7 @@ CkMemCheckPT::CkMemCheckPT(int w)
   expectCount = -1;
   where = w;
   replicaAlive = 1;
+  notifyReplica = 0;
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
@@ -763,12 +764,15 @@ void CkMemCheckPT::doneRComparison(int ret){
   if(ret==CkNumPes()){
     CpvAccess(localChkpDone) = 1;
     if(CpvAccess(remoteChkpDone) ==1){
-      thisProxy.resumeFromChkp();
+      thisProxy.doneBothComparison();
+    }
+    if(notifyReplica == 0){
+      //notify the replica am done
+      char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
+      CmiSetHandler(msg,replicaChkpDoneHandlerIdx);
+      CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,(char *)msg);
+      notifyReplica = 1;
     }
-    //notify the replica am done
-    char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
-    CmiSetHandler(msg,replicaChkpDoneHandlerIdx);
-    CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,(char *)msg);
   }
   else{
     CkPrintf("[%d][%d] going to RollBack %d \n", CmiMyPartition(),CkMyPe(),ret);
@@ -776,13 +780,14 @@ void CkMemCheckPT::doneRComparison(int ret){
   }
 }
 
-void CkMemCheckPT::resumeFromChkp(){
+void CkMemCheckPT::doneBothComparison(){
   CpvAccess(recvdRemote) = 0;
   CpvAccess(recvdLocal) = 0;
   CpvAccess(localChkpDone) = 0;
   CpvAccess(remoteChkpDone) = 0;
   CpvAccess(curPointer)^=1;
   inCheckpointing = 0;
+  notifyReplica = 0;
   if(CkMyPe() == 0){
     CmiPrintf("[%d][%d] Checkpoint finished in %f seconds, sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime);
   }
@@ -1606,8 +1611,8 @@ void CkMemCheckPT::RollBack(){
       CkLocMgr * mgr = (CkLocMgr *)CkpvAccess(_groupTable)->find(gID).getObj();
       mgr->resume(idx,pchecker,CmiFalse,CmiFalse,CmiFalse);
       }
-      return pchecker.getResult();
-      //return true;
+      //return pchecker.getResult();
+      return true;
     }
 
     static void recvRemoteChkpHandler(char *msg){
index 6b111e9ed7732e3938fbc5382f96f582e95976f3..8e1ca759a88787673a65c08ca0b1308907f6b567 100644 (file)
@@ -37,7 +37,7 @@ module CkMemCheckpoint {
        entry [reductiontarget] void finishUp();
        entry [reductiontarget] void doneRComparison(int);
        entry [reductiontarget] void recoverFromSoftFailure();
-       entry void resumeFromChkp();
+       entry void doneBothComparison();
        entry [reductiontarget] void recvMaxIter(int iter);
        entry [reductiontarget] void startChkp();
        entry void recvIter(int iter);
index 21f128909a5cc73ff272b8d66f2d109147769921..c34d7a54898e57df0857a117b233c82c860e2a3e 100644 (file)
@@ -98,7 +98,7 @@ public:
   void startCheckpoint();
   void doneComparison(bool);
   void doneRComparison(int);
-  void resumeFromChkp();
+  void doneBothComparison();
   void RollBack();
   void recoverFromSoftFailure();
   void notifyReplicaDie(int diePe);
@@ -122,6 +122,8 @@ public:
   bool inProgress;
   bool localDecided;
   int localMaxIter;
+
+  int notifyReplica;
 private:
   CkVec<CkCheckPTInfo *> ckTable;
   CkArrayCheckPTMessage * chkpTable[2];