resolve some memory leak issue, inject failure to certain pe for better control
authorXiang Ni <xiangni2@illinois.edu>
Fri, 5 Apr 2013 17:50:54 +0000 (12:50 -0500)
committerXiang Ni <xiangni2@illinois.edu>
Fri, 5 Apr 2013 17:50:54 +0000 (12:50 -0500)
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h
src/util/pup_util.C

index 5e7774bf8deb3154701e534dd00c79619519292b..e9f5ba766228248a15ada0804cbba07bcaa6aefa 100644 (file)
@@ -413,10 +413,12 @@ CkMemCheckPT::CkMemCheckPT(int w)
   softFailureInjected = false;
   if(killFlag == 2){
     localSeed = failureSeed;
-    thisProxy[CkMyPe()].generateFailure();
+    softLocalSeed = failureSeed*2;
+    //only inject failure to pe [0][1]
+    if(CkMyPe()==0 && CmiMyPartition()==0)
+      thisProxy[CkMyPe()].generateFailure();
     
     if(SMTBF!=-1 && CmiMyPartition()==1 && CkMyPe()==0){
-      softLocalSeed = failureSeed*2;
       thisProxy[CkMyPe()].generateSoftFailure();
     }
 
@@ -430,24 +432,20 @@ void CkMemCheckPT::replicaInjectFailure(){
 }
 
 void CkMemCheckPT::generateFailure(){
-  int rand1 = rand_r(&localSeed);
-  int rand2 = rand_r(&localSeed);
   int rand3 = rand_r(&localSeed);
-  int next_pe = (rand1)%CkNumPes();
-  if(next_pe == 0){
-    next_pe = 1;
-  }
-  int next_partition = (rand2)%2;
   double sec;
   if(strcmp(failureDist,"E")==0)
     sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
   else if(strcmp(failureDist,"W")==0)
     sec = alpha*pow(-log(1.0f - ((double)rand3)/(long long int)(RAND_MAX)),1/beta);
-  if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
-    killTime = CmiWallTimer()+sec;
-    printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
-    CcdCallFnAfter(killLocal,NULL,sec*1000);
-  }
+  thisProxy[1].killAfter(sec);
+}
+
+void CkMemCheckPT::killAfter(double sec){
+  killTime = CmiWallTimer()+sec;
+  printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
+  CcdCallFnAfter(killLocal,NULL,sec*1000);
 }
 
 void CkMemCheckPT::generateSoftFailure(){
@@ -768,8 +766,6 @@ void CkMemCheckPT::startArrayCheckpoint(){
 
 void CkMemCheckPT::startCheckpoint(){
 #if CMK_CONVERSE_MPI
-  if(CkMyPe() == CpvAccess(_remoteCrashedNode))
-    CkPrintf("in start checkpointing!!!!\n");
   int size;
   {
     PUP::sizer p;
@@ -818,17 +814,17 @@ void CkMemCheckPT::startCheckpoint(){
     CpvAccess(recvdLocal) = 1;
     if(CpvAccess(use_checksum)){
       CpvAccess(localChecksum) = checksum;
-      char *chkpMsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
-      *(int *)(chkpMsg+CmiMsgHeaderSizeBytes) = CpvAccess(localChecksum);
       //only one reaplica will send
       if(CmiMyPartition()==0){
+        char *chkpMsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
+        *(int *)(chkpMsg+CmiMsgHeaderSizeBytes) = CpvAccess(localChecksum);
         CmiSetHandler(chkpMsg,recvRemoteChkpHandlerIdx);
         CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),chkpMsg);
       }
     }else{
-      envelope * env = (envelope *)(UsrToEnv((CkCheckPTMessage *)CkCopyMsg((void **)&msg)));
-      CkPackMessage(&env);
       if(CmiMyPartition()==0){
+        envelope * env = (envelope *)(UsrToEnv((CkCheckPTMessage *)CkCopyMsg((void **)&msg)));
+        CkPackMessage(&env);
         CmiSetHandler(env,recvRemoteChkpHandlerIdx);
         CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
       }
@@ -859,6 +855,8 @@ void CkMemCheckPT::startCheckpoint(){
         thisProxy[CkMyPe()].doneComparison(true);
         //thisProxy[CkMyPe()].doneComparison(false);
       }
+      
+      if(CpvAccess(buddyBuf)) delete CpvAccess(buddyBuf);
     }
     if(CkMyPe()==0)
       CmiPrintf("[%d][%d] comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
@@ -1490,17 +1488,24 @@ void CkMemCheckPT::RollBack(){
       if(CmiNumPartition()==1){
         CkArrayCheckPTMessage * msg = (CkArrayCheckPTMessage *)CkCopyMsg((void **)&chkpTable[0]);
         packData = (char *)msg->packData;
+#if STREAMING_INFORMHOME && CK_NO_PROC_POOL
+        recoverAll(packData,gmap,imap);
+#else
+        recoverAll(packData);
+#endif
+        CkFreeMsg(msg);
       }
       else{
         int pointer = CpvAccess(curPointer);
         CkCheckPTMessage * msg = (CkCheckPTMessage *)CkCopyMsg((void **)&CpvAccess(chkpBuf)[pointer]);
         packData = msg->packData;
-      }
 #if STREAMING_INFORMHOME && CK_NO_PROC_POOL
-      recoverAll(packData,gmap,imap);
+        recoverAll(packData,gmap,imap);
 #else
-      recoverAll(packData);
+        recoverAll(packData);
 #endif
+        CkFreeMsg(msg);
+      }
 #endif
 #if STREAMING_INFORMHOME && CK_NO_PROC_POOL
       for (int i=0; i<CkNumPes(); i++) {
@@ -1611,10 +1616,8 @@ void CkMemCheckPT::RollBack(){
       }
       //inject next failure
       if(killFlag==2){
-        if(CkMyPe()==0){
-          replicaInjectFailure();
-        }
-        thisProxy[CkMyPe()].generateFailure();
+        if(CkMyPe()==0&&CmiMyPartition()==0)
+          thisProxy[CkMyPe()].generateFailure();
       }
 #endif
 
@@ -1831,6 +1834,7 @@ void CkMemCheckPT::RollBack(){
       }*/
       return result;
     }
+
     int getChecksum(char * buf){
       PUP::checker pchecker(buf);
       pchecker.skip();
@@ -1866,6 +1870,7 @@ void CkMemCheckPT::RollBack(){
             //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
         }
+        CmiFree(msg);
       }else{
         envelope *env = (envelope *)msg;
         CkUnpackMessage(&env);
@@ -1880,7 +1885,7 @@ void CkMemCheckPT::RollBack(){
             CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
             //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
-          delete chkpMsg;
+          CmiFree(msg);
           if(CkMyPe()==0)
             CmiPrintf("[%d][%d] comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
         }else{
@@ -1993,6 +1998,8 @@ void CkMemCheckPT::RollBack(){
         CkPrintf("[%d] receive recover proc data at %lf\n", CkMyPe(), CmiWallTimer());
       }
       else{
+        //shud never happen
+        CmiAbort("non crashed pe received proc data");
         if(CpvAccess(localProcChkpBuf)[pointer]) delete CpvAccess(localProcChkpBuf)[pointer];
         CpvAccess(localProcChkpBuf)[pointer] = procMsg; 
         //_handleProcData(p,CmiFalse);
@@ -2078,6 +2085,7 @@ void CkMemCheckPT::RollBack(){
     }
     
     static void askRecoverDataHandler(char * msg){
+      CmiFree(msg);
       if(CmiMyPe() == CpvAccess(_remoteCrashedNode))
         CmiPrintf("[%d][%d] receive replica phase change at %lf\n",CmiMyPartition(),CmiMyPe(),CmiWallTimer());
       if(CpvAccess(resilience)!=1){
@@ -2579,11 +2587,7 @@ void CkMemCheckPT::RollBack(){
             printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
             CkDieNow();
           }else{
-            //next failure
             if(killFlag == 2){
-              //CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
-              //checkptMgr.generateFailure();
-              //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
               //delay for 2s 
               CcdCallFnAfter(killLocal,NULL,2*1000);        
             }
@@ -2604,7 +2608,7 @@ void CkMemCheckPT::RollBack(){
           CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->softFailureInjected = true;
         }else{
           //next failure
-          CcdCallFnAfter(killLocal,NULL,2*1000);
+          CcdCallFnAfter(injectSoftFailure,NULL,2*1000);
         }
       }
 #endif
index 35f32faa8582fd8917a5a5a3b7017c3f2bef4a60..fa8a63b7cd0338ba16f3460f6f983612cdf8b28e 100644 (file)
@@ -30,6 +30,7 @@ module CkMemCheckpoint {
        entry void report();
        // restart
         entry [expedited] void restart(int);
+        entry [expedited] void killAfter(double);
        entry [reductiontarget] void resetReductionMgr();
        entry [reductiontarget] void removeArrayElements();
        entry [reductiontarget] void recoverBuddies();
index 2b0147ff9e025b2ecb34a5f4da6b94bee698ca50..b2e6346f8211fea404be251990059f0b7ee126a9 100644 (file)
@@ -72,6 +72,7 @@ public:
   inline int BuddyPE(int pe);
   //void doItNow(int sp, CkCallback &);
   void doItNow(int sp);
+  void killAfter(double);
   void chkpLocalStart();
   void restart(int diePe);
   void removeArrayElements();
index d493ac1ab3230639a86712c4bdcfe0200bb3dd88..2bd7290d058d233767837d3cb75504fe68ddefae 100644 (file)
@@ -162,6 +162,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
            //     fault_bytes++;
               }
             }
+            delete p2;
           }    
           break;       
         case Tint:
@@ -179,6 +180,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
          //       fault_bytes++;
               }
             }
+            delete p2;
           }
           break;
         case Tchar:
@@ -196,6 +198,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
         //        fault_bytes++;
               }
             }
+            delete p2;
           }
           break;
         default: