optimize for recovery
author Xiang Ni <xiangni@hopper06.(none)>
Wed, 9 Jan 2013 20:27:58 +0000 (12:27 -0800)
committer Xiang Ni <xiangni@hopper06.(none)>
Wed, 9 Jan 2013 20:27:58 +0000 (12:27 -0800)
src/ck-core/ckmemcheckpoint.C
src/util/pup_util.C

index 1506822240cd66af53b397382715203e34fcd186..204cbe3aed5520d86c9b1d574a6d2762e9172649 100644 (file)
@@ -709,7 +709,7 @@ void CkMemCheckPT::startCheckpoint(){
   int checksum;
   {
 //#if CMK_USE_CHECKSUM
   int checksum;
   {
 //#if CMK_USE_CHECKSUM
-    if(CpvAccess(use_checksum)){
+    if(CpvAccess(use_checksum)&&CkReplicaAlive()==1){
       PUP::checker p(msg->packData);
       pupAllElements(p);
       checksum = p.getChecksum();
       PUP::checker p(msg->packData);
       pupAllElements(p);
       checksum = p.getChecksum();
@@ -776,6 +776,7 @@ void CkMemCheckPT::startCheckpoint(){
   }
   else{
     if(CkReplicaAlive()==0){//TODO add flag if sent already but the replica hasn't recovered when the next checkpoint
   }
   else{
     if(CkReplicaAlive()==0){//TODO add flag if sent already but the replica hasn't recovered when the next checkpoint
+      if(CkMyPe() == CpvAccess(_remoteCrashedNode))
       {        
         int pointer = CpvAccess(curPointer);
         //send the proc data
       {        
         int pointer = CpvAccess(curPointer);
         //send the proc data
@@ -807,7 +808,7 @@ void CkMemCheckPT::startCheckpoint(){
 void CkMemCheckPT::doneComparison(bool ret){
   int _ret = 1;
   if(!ret){
 void CkMemCheckPT::doneComparison(bool ret){
   int _ret = 1;
   if(!ret){
-    CkPrintf("[%d][%d] fail in doneComparison \n", CmiMyPartition(),CkMyPe());
+    //CkPrintf("[%d][%d] fail in doneComparison \n", CmiMyPartition(),CkMyPe());
     _ret = 0;
   }else{
     _ret = 1;
     _ret = 0;
   }else{
     _ret = 1;
@@ -1662,11 +1663,11 @@ void CkMemCheckPT::RollBack(){
       }
       if(CkMyPe()==0)
         CmiPrintf("[%d][%d]local comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
       }
       if(CkMyPe()==0)
         CmiPrintf("[%d][%d]local comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
-      int fault_num = pchecker.getFaultNum();
+      //int fault_num = pchecker.getFaultNum();
       bool result = pchecker.getResult();
       bool result = pchecker.getResult();
-      if(!result){
+      /*if(!result){
         CmiPrintf("[%d][%d]fault region %d\n",CmiMyPartition(),CkMyPe(),fault_num);
         CmiPrintf("[%d][%d]fault region %d\n",CmiMyPartition(),CkMyPe(),fault_num);
-      }
+      }*/
       return result;
     }
     int getChecksum(char * buf){
       return result;
     }
     int getChecksum(char * buf){
@@ -1818,7 +1819,7 @@ void CkMemCheckPT::RollBack(){
       CpvAccess(chkpBuf)[pointer] = chkpMsg;
       CpvAccess(recvdArrayChkp) =1;
       CkMemCheckPT::inRestarting = 1;
       CpvAccess(chkpBuf)[pointer] = chkpMsg;
       CpvAccess(recvdArrayChkp) =1;
       CkMemCheckPT::inRestarting = 1;
-      if(CpvAccess(recvdProcChkp) == 1){
+      if(CpvAccess(recvdProcChkp) == 1||CkMyPe()!= CpvAccess(_crashedNode)){
         _resume_charm_message();
         _diePE = CpvAccess(_crashedNode);
         //CmiPrintf("[%d] send to die pe %d\n",CkMyPe(),_diePE);
         _resume_charm_message();
         _diePE = CpvAccess(_crashedNode);
         //CmiPrintf("[%d] send to die pe %d\n",CkMyPe(),_diePE);
index 0839d2c1c4525ee415688ba227de65e2399edaf4..d493ac1ab3230639a86712c4bdcfe0200bb3dd88 100644 (file)
@@ -159,7 +159,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                   printf("found incorrect double %e %e diff %e\n",p1[i],p2[i],(p1[i]-p2[i]));
                 }       
                 result = result && false;
                   printf("found incorrect double %e %e diff %e\n",p1[i],p2[i],(p1[i]-p2[i]));
                 }       
                 result = result && false;
-                fault_bytes++;
+           //     fault_bytes++;
               }
             }
           }    
               }
             }
           }    
@@ -176,7 +176,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                 if(result)
                   printf("found incorrect int %d %d at %d total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
                 if(result)
                   printf("found incorrect int %d %d at %d total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
-                fault_bytes++;
+         //       fault_bytes++;
               }
             }
           }
               }
             }
           }
@@ -193,7 +193,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                 if(result)
                   printf("found incorrect char %d %d at %d, total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
                 if(result)
                   printf("found incorrect char %d %d at %d, total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
-                fault_bytes++;
+        //        fault_bytes++;
               }
             }
           }
               }
             }
           }