inject soft and hard failures according to exponential and weibull distribution
[charm.git] / src / ck-core / ckmemcheckpoint.C
index 1d3138b30f7bc9a7f5a1a11a640782f706cf7a0b..12c85402ab3d6276ecd6b48e56f61eaec88d2021 100644 (file)
@@ -104,9 +104,15 @@ int killFlag=0;
 char * failureDist;
 // Meantimr between failure
 int MTBF;
+int SMTBF; // scales the exponential soft-failure delay; the constructor skips soft failures when -1
+// Weibull parameters: beta is the shape exponent, alpha and s_alpha are the scale factors
+double beta;    // used as the 1/beta exponent for both hard and soft failure delays
+double alpha;   // scale applied to the sampled hard-failure delay
+double s_alpha; // scale applied to the sampled soft-failure delay
 // variable for storing the killing time
 double killTime=0.0;
 extern void killLocal(void *_dummy,double curWallTime);
+extern void injectSoftFailure(void *_dummy,double curWallTime);
 #endif
 
 #ifdef CKLOCMGR_LOOP
@@ -404,9 +410,16 @@ CkMemCheckPT::CkMemCheckPT(int w)
   maxIter = -1;
   recvIterCount = 0;
   localDecided = false;
+  softFailureInjected = false;
   if(killFlag == 2){
     localSeed = failureSeed;
     thisProxy[CkMyPe()].generateFailure();
+    
+    if(SMTBF!=-1 && CmiMyPartition()==1 && CkMyPe()==0){
+      softLocalSeed = failureSeed*2;
+      thisProxy[CkMyPe()].generateSoftFailure();
+    }
+
   }
 }
 
@@ -417,20 +430,37 @@ void CkMemCheckPT::replicaInjectFailure(){
 }
 
 void CkMemCheckPT::generateFailure(){
+  int rand1 = rand_r(&localSeed);
+  int rand2 = rand_r(&localSeed);
+  int rand3 = rand_r(&localSeed);
+  int next_pe = (rand1)%CkNumPes();
+  if(next_pe == 0){
+    next_pe = 1;
+  }
+  int next_partition = (rand2)%2;
+  double sec;
+  if(strcmp(failureDist,"E")==0)
+    sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
+  else if(strcmp(failureDist,"W")==0)
+    sec = alpha*pow(-log(1.0f - ((double)rand3)/(long long int)(RAND_MAX)),1/beta);
+  if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
+    killTime = CmiWallTimer()+sec;
+    printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
+    CcdCallFnAfter(killLocal,NULL,sec*1000);
+  }
+}
+
+void CkMemCheckPT::generateSoftFailure(){ // draws a delay from softLocalSeed and schedules injectSoftFailure
+  int rand = rand_r(&softLocalSeed); // NOTE(review): this local hides ::rand inside the function
+  double sec;
   if(strcmp(failureDist,"E")==0){
-    int rand1 = rand_r(&localSeed);
-    int rand2 = rand_r(&localSeed);
-    int rand3 = rand_r(&localSeed);
-    CkPrintf("randome %d %d %d\n", rand1, rand2, rand3);
-    int next_pe = (rand1)%CkNumPes();
-    int next_partition = (rand2)%2;
-    double sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
-    if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
-      killTime = CmiWallTimer()+sec;
-      printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
-      CcdCallFnAfter(killLocal,NULL,sec*1000);
-    }
+    sec = -log(1.0f - ((double)rand)/(long long int)(RAND_MAX))*SMTBF; // exponential delay scaled by SMTBF
+  }
+  else{ // NOTE(review): any failureDist other than "E" takes the Weibull branch, not only "W"
+    sec = s_alpha*pow(-log(1.0f - ((double)rand)/(long long int)(RAND_MAX)),1/beta); // scale s_alpha, exponent 1/beta
   }
+  printf("[%d][%d] inject soft failure after %.6lf s (MEMCKPT)\n",CmiMyPartition(),CkMyPe(),sec);
+  CcdCallFnAfter(injectSoftFailure,NULL,sec*1000); // sec is printed above in seconds; presumably ms here - confirm
 }
 
 CkMemCheckPT::~CkMemCheckPT()
@@ -452,6 +482,7 @@ void CkMemCheckPT::pup(PUP::er& p)
   p|where;                     // where to checkpoint
   p|peCount;
   p|localSeed;
+  p|softLocalSeed;
   if (p.isUnpacking()) {
     recvCount = 0;
 #if CMK_CONVERSE_MPI
@@ -779,8 +810,10 @@ void CkMemCheckPT::startCheckpoint(){
   pointer = CpvAccess(curPointer);
   if(CpvAccess(chkpBuf)[pointer]) delete CpvAccess(chkpBuf)[pointer];
   CpvAccess(chkpBuf)[pointer] = msg;
+  
   if(CkMyPe()==0)
     CmiPrintf("[%d][%d] local checkpoint done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
+  
   if(CkReplicaAlive()==1){
     CpvAccess(recvdLocal) = 1;
     if(CpvAccess(use_checksum)){
@@ -814,7 +847,8 @@ void CkMemCheckPT::startCheckpoint(){
         thisProxy[CkMyPe()].doneComparison(true);
       }
       else{
-        thisProxy[CkMyPe()].doneComparison(false);
+        thisProxy[CkMyPe()].doneComparison(true);
+        //thisProxy[CkMyPe()].doneComparison(false);
       }
     }else{
       if(CpvAccess(buddyBuf)->len == size && compare((char *)(CpvAccess(chkpBuf)[pointer]->packData),(char *)(CpvAccess(buddyBuf)->packData))){
@@ -822,7 +856,8 @@ void CkMemCheckPT::startCheckpoint(){
       }
       else{
         //CkPrintf("[%d][%d] failed the test pointer %d \n",CmiMyPartition(),CkMyPe(),pointer);
-        thisProxy[CkMyPe()].doneComparison(false);
+        thisProxy[CkMyPe()].doneComparison(true);
+        //thisProxy[CkMyPe()].doneComparison(false);
       }
     }
     if(CkMyPe()==0)
@@ -888,20 +923,20 @@ void CkMemCheckPT::doneComparison(bool ret){
 
 void CkMemCheckPT::doneRComparison(int ret){
   CkPrintf("[%d][%d] doneRComparison\n", CmiMyPartition(), CkMyPe());
-  //   if(CpvAccess(curPointer) == 0){
-  //if(ret==CkNumPes()){
+  if(ret==CkNumPes()&&!softFailureInjected){
     CpvAccess(localChkpDone) = 1;
     if(CpvAccess(remoteChkpDone) ==1){
       thisProxy.doneBothComparison();
     }else{
       CmiPrintf("[%d][%d] Local checkpoint finished in %f seconds at %lf, waiting for replica ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer());
     }
-  //}
-  /*else{
+  }
+  else{
+    ret = 0;
     CkPrintf("[%d][%d] going to RollBack %d at %lf checkpoint in %lf\n", CmiMyPartition(),CkMyPe(),ret,CmiWallTimer(), CmiWallTimer()-startTime);
     startTime = CmiWallTimer();
     thisProxy.RollBack();
-  }*/
+  }
   if(notifyReplica == 0){
     //notify the replica am done
     char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
@@ -965,10 +1000,12 @@ void CkMemCheckPT::RollBack(){
       CmiAssert(mgr);
       mgr->resume(idx,p,CmiFalse,CmiTrue,CmiTrue);
     }
-    }
-    CkCallback cb(CkReductionTarget(CkMemCheckPT,recoverFromSoftFailure),thisProxy);
-    contribute(cb);
   }
+  
+  softFailureInjected = false;
+  CkCallback cb(CkReductionTarget(CkMemCheckPT,recoverFromSoftFailure),thisProxy);
+  contribute(cb);
+}
 
   void CkMemCheckPT::notifyReplicaDie(int pe){
     replicaAlive = 0;
@@ -1616,6 +1653,9 @@ void CkMemCheckPT::RollBack(){
         CmiPrintf("[%d][%d] Recover From soft failures in %lf, sending callback ... \n", CmiMyPartition(),CkMyPe(),CmiWallTimer()-startTime);
       }
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
+      if(CmiMyPartition()==1 && CkMyPe()==0){
+        thisProxy[CkMyPe()].generateSoftFailure();
+      }
     }
     // called only on 0
     void CkMemCheckPT::quiescence(CkCallback &cb)
@@ -1822,7 +1862,8 @@ void CkMemCheckPT::RollBack(){
             CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
           }
           else{
-            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
+            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
+            //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
         }
       }else{
@@ -1836,7 +1877,8 @@ void CkMemCheckPT::RollBack(){
             CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
           }else
           {
-            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
+            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
+            //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
           delete chkpMsg;
           if(CkMyPe()==0)
@@ -2528,19 +2570,22 @@ void CkMemCheckPT::RollBack(){
 #ifdef CMK_MEM_CHECKPOINT
 #if CMK_HAS_GETPID
       void killLocal(void *_dummy,double curWallTime){
-        printf("[%d] KillLocal called at %.6lf \n",CkMyPe(),CmiWallTimer());          
         if(CmiWallTimer()<killTime-1){
+          printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
           CcdCallFnAfter(killLocal,NULL,(killTime-CmiWallTimer())*1000);        
         }else{ 
 #if CMK_CONVERSE_MPI
-          if(CkHasCheckpoints()&&!CkInCheckpointing()&&!CkInRestarting()){
+          if(!CkInCheckpointing()&&!CkInRestarting()){
+            printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
             CkDieNow();
           }else{
             //next failure
             if(killFlag == 2){
-              CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
-              checkptMgr.generateFailure();
-              CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
+              //CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
+              //checkptMgr.generateFailure();
+              //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
+              //delay for 2s 
+              CcdCallFnAfter(killLocal,NULL,2*1000);        
             }
           }
 #else 
@@ -2553,6 +2598,15 @@ void CkMemCheckPT::RollBack(){
         CmiAbort("kill() not supported!");
       }
 #endif
+      void injectSoftFailure(void *_dummy,double curWallTime){ // timer callback scheduled by generateSoftFailure
+        if(!CkInCheckpointing()&&!CkInRestarting()){
+          CkPrintf("soft failure injected\n");
+          CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->softFailureInjected = true;
+        }else{
+          // Busy checkpointing/restarting: retry in 2s. Re-arm this callback; killLocal can end in CkDieNow().
+          CcdCallFnAfter(injectSoftFailure,NULL,2*1000);
+        }
+      }
 #endif
 
 #ifdef CMK_MEM_CHECKPOINT