inject soft and hard failures according to exponential and weibull distribution
author Xiang Ni <xiangni2@illinois.edu>
Fri, 5 Apr 2013 05:45:57 +0000 (00:45 -0500)
committer Xiang Ni <xiangni2@illinois.edu>
Fri, 5 Apr 2013 05:45:57 +0000 (00:45 -0500)
src/arch/mpi/machine.c
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h
src/ck-core/init.C

index a2f0f1ab83dfb15bfb57bb8d3b3db93c45d8c850..8fdae56915609dffd5e06fcf4710c7ef2643d7e4 100644 (file)
@@ -2001,14 +2001,13 @@ int isRankDie(int rank){
 void CkDieNow()
 {
 #ifdef CMK_MEM_CHECKPOINT || (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)
-    CmiPrintf("[%d] die now.\n", CmiMyPe());
+    CmiPrintf("[%d][%d] die now.\n",CmiMyPartition(), CmiMyPe());
     fflush(stdout);
       /* release old messages */
     while (!CmiAllAsyncMsgsSent()) {
         PumpMsgs();
         CmiReleaseSentMessages();
     }
-    CmiPrintf("[%d] die now before barrier\n", CmiMyPe());
     MPI_Barrier(charmComm);
     MPI_Finalize();
     exit(0);
index 1d3138b30f7bc9a7f5a1a11a640782f706cf7a0b..12c85402ab3d6276ecd6b48e56f61eaec88d2021 100644 (file)
@@ -104,9 +104,15 @@ int killFlag=0;
 char * failureDist;
 // Meantimr between failure
 int MTBF;
+int SMTBF;
+//shape (beta) and scale (alpha, s_alpha) parameters for Weibull distribution
+double beta;
+double alpha;
+double s_alpha;
 // variable for storing the killing time
 double killTime=0.0;
 extern void killLocal(void *_dummy,double curWallTime);
+extern void injectSoftFailure(void *_dummy,double curWallTime);
 #endif
 
 #ifdef CKLOCMGR_LOOP
@@ -404,9 +410,16 @@ CkMemCheckPT::CkMemCheckPT(int w)
   maxIter = -1;
   recvIterCount = 0;
   localDecided = false;
+  softFailureInjected = false;
   if(killFlag == 2){
     localSeed = failureSeed;
     thisProxy[CkMyPe()].generateFailure();
+    
+    if(SMTBF!=-1 && CmiMyPartition()==1 && CkMyPe()==0){
+      softLocalSeed = failureSeed*2;
+      thisProxy[CkMyPe()].generateSoftFailure();
+    }
+
   }
 }
 
@@ -417,20 +430,37 @@ void CkMemCheckPT::replicaInjectFailure(){
 }
 
 void CkMemCheckPT::generateFailure(){
+  int rand1 = rand_r(&localSeed);
+  int rand2 = rand_r(&localSeed);
+  int rand3 = rand_r(&localSeed);
+  int next_pe = (rand1)%CkNumPes();
+  if(next_pe == 0){
+    next_pe = 1;
+  }
+  int next_partition = (rand2)%2;
+  double sec;
+  if(strcmp(failureDist,"E")==0)
+    sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
+  else if(strcmp(failureDist,"W")==0)
+    sec = alpha*pow(-log(1.0f - ((double)rand3)/(long long int)(RAND_MAX)),1/beta);
+  if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
+    killTime = CmiWallTimer()+sec;
+    printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
+    CcdCallFnAfter(killLocal,NULL,sec*1000);
+  }
+}
+
+void CkMemCheckPT::generateSoftFailure(){
+  // Arms the injectSoftFailure timer after a delay drawn from softLocalSeed.
+  double u = ((double)rand_r(&softLocalSeed))/((double)RAND_MAX + 1.0);  // [0,1): log(1-u) stays finite
+  double sec;
   if(strcmp(failureDist,"E")==0){
-    int rand1 = rand_r(&localSeed);
-    int rand2 = rand_r(&localSeed);
-    int rand3 = rand_r(&localSeed);
-    CkPrintf("randome %d %d %d\n", rand1, rand2, rand3);
-    int next_pe = (rand1)%CkNumPes();
-    int next_partition = (rand2)%2;
-    double sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
-    if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
-      killTime = CmiWallTimer()+sec;
-      printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
-      CcdCallFnAfter(killLocal,NULL,sec*1000);
-    }
+    sec = -log(1.0 - u)*SMTBF;  // exponential, mean SMTBF
+  }else{  // any distribution other than "E" is treated as Weibull (scale s_alpha, shape beta)
+    sec = s_alpha*pow(-log(1.0 - u),1/beta);
   }
+  printf("[%d][%d] inject soft failure after %.6lf s (MEMCKPT)\n",CmiMyPartition(),CkMyPe(),sec);
+  CcdCallFnAfter(injectSoftFailure,NULL,sec*1000);
 }
 
 CkMemCheckPT::~CkMemCheckPT()
@@ -452,6 +482,7 @@ void CkMemCheckPT::pup(PUP::er& p)
   p|where;                     // where to checkpoint
   p|peCount;
   p|localSeed;
+  p|softLocalSeed;
   if (p.isUnpacking()) {
     recvCount = 0;
 #if CMK_CONVERSE_MPI
@@ -779,8 +810,10 @@ void CkMemCheckPT::startCheckpoint(){
   pointer = CpvAccess(curPointer);
   if(CpvAccess(chkpBuf)[pointer]) delete CpvAccess(chkpBuf)[pointer];
   CpvAccess(chkpBuf)[pointer] = msg;
+  
   if(CkMyPe()==0)
     CmiPrintf("[%d][%d] local checkpoint done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
+  
   if(CkReplicaAlive()==1){
     CpvAccess(recvdLocal) = 1;
     if(CpvAccess(use_checksum)){
@@ -814,7 +847,8 @@ void CkMemCheckPT::startCheckpoint(){
         thisProxy[CkMyPe()].doneComparison(true);
       }
       else{
-        thisProxy[CkMyPe()].doneComparison(false);
+        thisProxy[CkMyPe()].doneComparison(true);
+        //thisProxy[CkMyPe()].doneComparison(false);
       }
     }else{
       if(CpvAccess(buddyBuf)->len == size && compare((char *)(CpvAccess(chkpBuf)[pointer]->packData),(char *)(CpvAccess(buddyBuf)->packData))){
@@ -822,7 +856,8 @@ void CkMemCheckPT::startCheckpoint(){
       }
       else{
         //CkPrintf("[%d][%d] failed the test pointer %d \n",CmiMyPartition(),CkMyPe(),pointer);
-        thisProxy[CkMyPe()].doneComparison(false);
+        thisProxy[CkMyPe()].doneComparison(true);
+        //thisProxy[CkMyPe()].doneComparison(false);
       }
     }
     if(CkMyPe()==0)
@@ -888,20 +923,20 @@ void CkMemCheckPT::doneComparison(bool ret){
 
 void CkMemCheckPT::doneRComparison(int ret){
   CkPrintf("[%d][%d] doneRComparison\n", CmiMyPartition(), CkMyPe());
-  //   if(CpvAccess(curPointer) == 0){
-  //if(ret==CkNumPes()){
+  if(ret==CkNumPes()&&!softFailureInjected){
     CpvAccess(localChkpDone) = 1;
     if(CpvAccess(remoteChkpDone) ==1){
       thisProxy.doneBothComparison();
     }else{
       CmiPrintf("[%d][%d] Local checkpoint finished in %f seconds at %lf, waiting for replica ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer());
     }
-  //}
-  /*else{
+  }
+  else{
+    ret = 0;
     CkPrintf("[%d][%d] going to RollBack %d at %lf checkpoint in %lf\n", CmiMyPartition(),CkMyPe(),ret,CmiWallTimer(), CmiWallTimer()-startTime);
     startTime = CmiWallTimer();
     thisProxy.RollBack();
-  }*/
+  }
   if(notifyReplica == 0){
     //notify the replica am done
     char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
@@ -965,10 +1000,12 @@ void CkMemCheckPT::RollBack(){
       CmiAssert(mgr);
       mgr->resume(idx,p,CmiFalse,CmiTrue,CmiTrue);
     }
-    }
-    CkCallback cb(CkReductionTarget(CkMemCheckPT,recoverFromSoftFailure),thisProxy);
-    contribute(cb);
   }
+  
+  softFailureInjected = false;
+  CkCallback cb(CkReductionTarget(CkMemCheckPT,recoverFromSoftFailure),thisProxy);
+  contribute(cb);
+}
 
   void CkMemCheckPT::notifyReplicaDie(int pe){
     replicaAlive = 0;
@@ -1616,6 +1653,9 @@ void CkMemCheckPT::RollBack(){
         CmiPrintf("[%d][%d] Recover From soft failures in %lf, sending callback ... \n", CmiMyPartition(),CkMyPe(),CmiWallTimer()-startTime);
       }
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
+      if(CmiMyPartition()==1 && CkMyPe()==0){
+        thisProxy[CkMyPe()].generateSoftFailure();
+      }
     }
     // called only on 0
     void CkMemCheckPT::quiescence(CkCallback &cb)
@@ -1822,7 +1862,8 @@ void CkMemCheckPT::RollBack(){
             CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
           }
           else{
-            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
+            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
+            //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
         }
       }else{
@@ -1836,7 +1877,8 @@ void CkMemCheckPT::RollBack(){
             CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
           }else
           {
-            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
+            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
+            //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
           }
           delete chkpMsg;
           if(CkMyPe()==0)
@@ -2528,19 +2570,22 @@ void CkMemCheckPT::RollBack(){
 #ifdef CMK_MEM_CHECKPOINT
 #if CMK_HAS_GETPID
       void killLocal(void *_dummy,double curWallTime){
-        printf("[%d] KillLocal called at %.6lf \n",CkMyPe(),CmiWallTimer());          
         if(CmiWallTimer()<killTime-1){
+          printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
           CcdCallFnAfter(killLocal,NULL,(killTime-CmiWallTimer())*1000);        
         }else{ 
 #if CMK_CONVERSE_MPI
-          if(CkHasCheckpoints()&&!CkInCheckpointing()&&!CkInRestarting()){
+          if(!CkInCheckpointing()&&!CkInRestarting()){
+            printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
             CkDieNow();
           }else{
             //next failure
             if(killFlag == 2){
-              CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
-              checkptMgr.generateFailure();
-              CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
+              //CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
+              //checkptMgr.generateFailure();
+              //CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
+              //delay for 2s 
+              CcdCallFnAfter(killLocal,NULL,2*1000);        
             }
           }
 #else 
@@ -2553,6 +2598,15 @@ void CkMemCheckPT::RollBack(){
         CmiAbort("kill() not supported!");
       }
 #endif
+      // Timer callback (arguments unused): flags a soft failure on the local CkMemCheckPT branch.
+      void injectSoftFailure(void *_dummy,double curWallTime){
+        if(CkInCheckpointing()||CkInRestarting()){  // busy: postpone the injection by 2 s
+          CcdCallFnAfter(injectSoftFailure,NULL,2*1000);  // re-arm this callback, not the hard-failure killLocal
+          return;
+        }
+        CkPrintf("soft failure injected\n");
+        CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->softFailureInjected = true;
+      }
 #endif
 
 #ifdef CMK_MEM_CHECKPOINT
index 9182083a03b1868e22a3effe9a813e547e38b1ff..35f32faa8582fd8917a5a5a3b7017c3f2bef4a60 100644 (file)
@@ -54,6 +54,7 @@ module CkMemCheckpoint {
         entry void inmem_restore(CkArrayCheckPTMessage *m);
        entry void updateLocations(int n, CkGroupID g[n], CkArrayIndex idx[n],int nowOnPe);
         entry void generateFailure();
+        entry void generateSoftFailure();
   };
 
   initproc void CkRegisterRestartHandler();
index d0ecbc19090fb0c082188737fc8d4e2bdebf2bd0..2b0147ff9e025b2ecb34a5f4da6b94bee698ca50 100644 (file)
@@ -92,6 +92,7 @@ public:
   void inmem_restore(CkArrayCheckPTMessage *m);
   void updateLocations(int n, CkGroupID *g, CkArrayIndex *idx,int nowOnPe);
   void generateFailure();
+  void generateSoftFailure();
   void replicaInjectFailure();
   
   void resetLB(int diepe);
@@ -129,6 +130,7 @@ public:
   int localMaxIter;
 
   int notifyReplica;
+  bool softFailureInjected;
 private:
   CkVec<CkCheckPTInfo *> ckTable;
   CkArrayCheckPTMessage * chkpTable[2];
@@ -147,6 +149,7 @@ private:
   int maxIter;
 
   unsigned int localSeed;
+  unsigned int softLocalSeed;
     /// to use memory or disk checkpointing
   int    where;
 private:
index aa21e78eb1c1a220b8ebf98bfc228e7ad3a8acec..fa7fc13d5fbbc25314c856d565b6e08e0e4adcf8 100644 (file)
@@ -209,6 +209,11 @@ extern char *killFile;
 extern char * failureDist;
 //mean time between failure in seconds
 extern int MTBF;
+extern int SMTBF;
+//shape and scale parameter for Weibull distribution
+extern double beta;
+extern double alpha;
+extern double s_alpha;
 // function for reading the kill file
 void readKillFile();
 // function to genrate failures according to distribution
@@ -285,20 +290,45 @@ static inline void _parseCommandLineOpts(char **argv)
   //reading failure distribution
   if(CmiGetArgStringDesc(argv,"+failureInject", &failureDist,"Runtime system injects failure")){
     if (CmiGetArgIntDesc(argv,"+MTBF",&MTBF,"Mean time between failures")){
+      if(CmiMyPe()==0)
+        printf("Mean time between hard failures is %d\n",MTBF);
       killFlag = 2;
-      if(CmiMyPe()==0){
-        if(strcmp(failureDist, "E")==0){
+      if(strcmp(failureDist, "E")==0){
+        if(CmiMyPe()==0)
           printf("Runtime system generates faiures according to exponential distribution\n");
-        }else if(strcmp(failureDist, "W")==0){
+      }else if(strcmp(failureDist, "W")==0){
+        if(CmiMyPe()==0)
           printf("Runtime system generates faiures according to weibull distribution\n");
+        if(!CmiGetArgDoubleDesc(argv,"+shape",&beta,"shape parameter for Weibull distribution")){
+          if(CmiMyPe()==0)
+            CmiAbort("should provide shape parameter for Weibull distribution");
+        }else{
+          alpha = MTBF/tgammaf(1+1/beta);
         }
+      }else{
+        if(CmiMyPe()==0)
+          CmiAbort("unknown failure distribution");
       }
       int seed;
       if (!CmiGetArgIntDesc(argv,"+failureSeed",&seed,"seed to generate the random sequencea")){
-        CkAbort("Needs to provide failure seed");
+        if(CmiMyPe()==0)
+          CmiAbort("Needs to provide failure seed");
       }else{
         failureSeed = seed;
       }
+      if (CmiGetArgIntDesc(argv,"+SMTBF",&SMTBF,"Mean time between failures")){
+        if(CmiMyPe()==0)
+          printf("Mean time between soft failures is %d\n",SMTBF);
+        if(strcmp(failureDist,"W")==0){
+          s_alpha = SMTBF/tgammaf(1+1/beta);
+        }
+      }else{
+        SMTBF = -1;
+      }
+    }
+    else{
+      if(CmiMyPe()==0)
+        CmiAbort("should specify the MTBF for failure injections");
     }
   }