inject failure according to exponential distribution
authorXiang Ni <xiangni2@illinois.edu>
Sun, 31 Mar 2013 03:36:46 +0000 (22:36 -0500)
committerXiang Ni <xiangni2@illinois.edu>
Sun, 31 Mar 2013 03:36:46 +0000 (22:36 -0500)
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h
src/ck-core/init.C

index 2ab62cb990b2ce487e4aa40c490350f58fbad29f..b3aef384dadd3aa5d359f90f18fa4e355a9a98b5 100644 (file)
@@ -89,7 +89,7 @@ CkCallback CkMemCheckPT::cpCallback;
 int _memChkptOn = 1;                   // checkpoint is on or off
 
 CkGroupID ckCheckPTGroupID;            // readonly
-
+unsigned int failureSeed;
 static int checkpointed = 0;
 
 /// @todo the following declarations should be moved into a separate file for all 
@@ -100,8 +100,13 @@ static int checkpointed = 0;
 char *killFile;                                               
 // flag for the kill file         
 int killFlag=0;
+// name of the failure distribution to inject ("E" = exponential, "W" = Weibull)
+char * failureDist;
+// Mean time between failures (MTBF), in seconds
+int MTBF;
 // variable for storing the killing time
 double killTime=0.0;
+extern void killLocal(void *_dummy,double curWallTime);
 #endif
 
 #ifdef CKLOCMGR_LOOP
@@ -160,6 +165,7 @@ static int replicaChkpStartHandlerIdx;
 static int replicaDieBcastHandlerIdx;
 static int replicaRecoverHandlerIdx;
 static int replicaChkpDoneHandlerIdx;
+static int replicaBeginFailureInjectionHandlerIdx;
 static int recoverRemoteProcDataHandlerIdx;
 static int recoverRemoteArrayDataHandlerIdx;
 static int notifyHandlerIdx;
@@ -398,6 +404,33 @@ CkMemCheckPT::CkMemCheckPT(int w)
   maxIter = -1;
   recvIterCount = 0;
   localDecided = false;
+  if(killFlag == 2){
+    localSeed = failureSeed;
+    thisProxy[CkMyPe()].generateFailure();
+  }
+}
+
+/// Tell the other replica to generate its next failure.
+///
+/// Sends a header-only Converse message, tagged with
+/// replicaBeginFailureInjectionHandlerIdx, to destination 0 of partition
+/// CmiMyPartition()^1 (presumably PE 0 of the buddy replica -- confirm
+/// against the CmiRemoteSyncSendAndFree argument order).
+void CkMemCheckPT::replicaInjectFailure(){
+  char * msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
+  CmiSetHandler(msg, replicaBeginFailureInjectionHandlerIdx);
+  // The message has no payload: the size sent must match the size allocated,
+  // otherwise the send reads past the end of the buffer.
+  CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,msg);
+}
+
+/// Draw the next failure from this branch's random stream (localSeed) and,
+/// if this processor is the chosen victim, schedule killLocal after the
+/// drawn delay.
+///
+/// Only the exponential distribution (failureDist == "E") is handled; for
+/// any other value, or when no distribution was configured, nothing happens.
+void CkMemCheckPT::generateFailure(){
+  // failureDist is a global that is only assigned when the failure-injection
+  // option is parsed, but this method is also invoked from the restart path,
+  // so it may still be NULL here.
+  if(failureDist == NULL) return;
+  if(strcmp(failureDist,"E")==0){
+    // Three draws per call: victim PE, victim partition, and delay.
+    int rand1 = rand_r(&localSeed);
+    int rand2 = rand_r(&localSeed);
+    int rand3 = rand_r(&localSeed);
+    CkPrintf("randome %d %d %d\n", rand1, rand2, rand3);
+    int next_pe = (rand1)%CkNumPes();
+    int next_partition = (rand2)%2;
+    // Map rand3 (0..RAND_MAX) to u in [0,1). Dividing by RAND_MAX+1 keeps u
+    // strictly below 1, so log(1-u) is always finite.
+    double u = ((double)rand3)/((double)RAND_MAX + 1.0);
+    // Inverse-CDF sample of an exponential distribution with mean MTBF.
+    double sec = -log(1.0 - u)*MTBF;
+    if(next_pe == CmiMyPe()&& next_partition == CmiMyPartition()){
+      killTime = CmiWallTimer()+sec;
+      printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
+      // sec is in seconds; the timer delay is passed as sec*1000.
+      CcdCallFnAfter(killLocal,NULL,sec*1000);
+    }
+  }
 }
 
 CkMemCheckPT::~CkMemCheckPT()
@@ -418,6 +451,7 @@ void CkMemCheckPT::pup(PUP::er& p)
   p|cpCallback;                        // store callback
   p|where;                     // where to checkpoint
   p|peCount;
+  p|localSeed;
   if (p.isUnpacking()) {
     recvCount = 0;
 #if CMK_CONVERSE_MPI
@@ -1516,6 +1550,11 @@ void CkMemCheckPT::RollBack(){
         CmiPrintf("[%d] Restart finished in %f seconds at %f.\n", CkMyPe(), CkWallTimer()-restartT, CkWallTimer());
         fflush(stdout);
       }
+
+      if(CkMyPe()==0){
+        replicaInjectFailure();
+      }
+
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
       inRestarting = 0;
       maxIter = -1;
@@ -1536,6 +1575,8 @@ void CkMemCheckPT::RollBack(){
         lastPingTime = CmiWallTimer();
         CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
       }
+      //inject next failure
+      thisProxy[CkMyPe()].generateFailure();
 #endif
 
 #if CK_NO_PROC_POOL
@@ -1819,6 +1860,13 @@ void CkMemCheckPT::RollBack(){
       CmiFree(msg);
     }
 
+    
+    /// Converse handler for the message sent by
+    /// CkMemCheckPT::replicaInjectFailure(): invokes generateFailure()
+    /// through the CkMemCheckPT group proxy, then releases the message.
+    static void replicaBeginFailureInjectionHandler(char * msg){
+      CProxy_CkMemCheckPT(ckCheckPTGroupID).generateFailure();
+      CmiFree(msg);
+    }
+    
     static void replicaChkpDoneHandler(char *msg){
       CmiPrintf("[%d][%d]receive replica checkpoint done\n",CmiMyPartition(),CmiMyPe());
       CpvAccess(remoteChkpDone) = 1;
@@ -2257,6 +2305,7 @@ void CkMemCheckPT::RollBack(){
           if (arg_where == CkCheckPoint_inDISK) {
             CkPrintf("Charm++> Double-disk Checkpointing. \n");
           }
+
           ckCheckPTGroupID = CProxy_CkMemCheckPT::ckNew(arg_where);
           CkPrintf("Charm++> CkMemCheckPTInit mainchare is created!\n");
 #endif
@@ -2394,6 +2443,7 @@ void CkMemCheckPT::RollBack(){
         replicaDieBcastHandlerIdx = CkRegisterHandler((CmiHandler)replicaDieBcastHandler);
         replicaRecoverHandlerIdx = CkRegisterHandler((CmiHandler)replicaRecoverHandler);
         replicaChkpDoneHandlerIdx = CkRegisterHandler((CmiHandler)replicaChkpDoneHandler);
+        replicaBeginFailureInjectionHandlerIdx = CkRegisterHandler((CmiHandler)replicaBeginFailureInjectionHandler);
         askPhaseHandlerIdx = CkRegisterHandler((CmiHandler)askPhaseHandler);
         recvPhaseHandlerIdx = CkRegisterHandler((CmiHandler)recvPhaseHandler);
         recoverRemoteProcDataHandlerIdx = CkRegisterHandler((CmiHandler)recoverRemoteProcDataHandler);
@@ -2481,7 +2531,14 @@ void CkMemCheckPT::RollBack(){
           CcdCallFnAfter(killLocal,NULL,(killTime-CmiWallTimer())*1000);        
         }else{ 
 #if CMK_CONVERSE_MPI
-          CkDieNow();
+          if(CkHasCheckpoints()&&!CkInCheckpointing()&&!CkInRestarting()){
+            CkDieNow();
+          }else{
+            //next failure
+            CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
+            checkptMgr.generateFailure();
+            CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->replicaInjectFailure();
+          }
 #else 
           kill(getpid(),SIGKILL);                                               
 #endif
@@ -2516,6 +2573,7 @@ void CkMemCheckPT::RollBack(){
         fclose(fp);
       }
 
+
 #if ! CMK_CONVERSE_MPI
       void CkDieNow()
       {
index 3ab7922cdc9a4836ead90a1fca95c0be4ff7fbb9..9182083a03b1868e22a3effe9a813e547e38b1ff 100644 (file)
@@ -3,7 +3,6 @@
 module CkMemCheckpoint {
 
   readonly CkGroupID ckCheckPTGroupID;
-
   message CkArrayCheckPTMessage {
         double packData[];
   };    
@@ -54,6 +53,7 @@ module CkMemCheckpoint {
        entry void quiescence(CkCallback&);
         entry void inmem_restore(CkArrayCheckPTMessage *m);
        entry void updateLocations(int n, CkGroupID g[n], CkArrayIndex idx[n],int nowOnPe);
+        entry void generateFailure();
   };
 
   initproc void CkRegisterRestartHandler();
index 09cdc5e4b3940c4cf47e98d5e3d8db235331f536..d0ecbc19090fb0c082188737fc8d4e2bdebf2bd0 100644 (file)
@@ -91,6 +91,9 @@ public:
   void gotReply();
   void inmem_restore(CkArrayCheckPTMessage *m);
   void updateLocations(int n, CkGroupID *g, CkArrayIndex *idx,int nowOnPe);
+  void generateFailure();
+  void replicaInjectFailure();
+  
   void resetLB(int diepe);
   int  isFailed(int pe);
   //void pupAllElements(PUP::er &p);
@@ -110,7 +113,7 @@ public:
   void recvMaxIter(int);
   void reachChkpIter();
   void startChkp();
-
+  
 public:
   static CkCallback  cpCallback;
 
@@ -142,6 +145,8 @@ private:
   int recvIterCount;
 
   int maxIter;
+
+  unsigned int localSeed;
     /// to use memory or disk checkpointing
   int    where;
 private:
index 2f6d86c9972176435d212684f4cac22c4426b179..aa21e78eb1c1a220b8ebf98bfc228e7ad3a8acec 100644 (file)
@@ -202,10 +202,17 @@ extern int BUFFER_TIME; //time spent waiting for buffered messages
 
 // flag for killing processes 
 extern int killFlag;
+extern unsigned int failureSeed;
 // file specifying the processes to be killed
 extern char *killFile;
+//failure distribution
+extern char * failureDist;
+//mean time between failure in seconds
+extern int MTBF;
 // function for reading the kill file
 void readKillFile();
+// function to generate failures according to the configured distribution
+void generateFailure();
 
 
 int _defaultObjectQ = 0;            // for obejct queue
@@ -274,6 +281,27 @@ static inline void _parseCommandLineOpts(char **argv)
       }
     }
   }
+  
+  //reading failure distribution
+  if(CmiGetArgStringDesc(argv,"+failureInject", &failureDist,"Runtime system injects failure")){
+    if (CmiGetArgIntDesc(argv,"+MTBF",&MTBF,"Mean time between failures")){
+      killFlag = 2;
+      if(CmiMyPe()==0){
+        if(strcmp(failureDist, "E")==0){
+          printf("Runtime system generates faiures according to exponential distribution\n");
+        }else if(strcmp(failureDist, "W")==0){
+          printf("Runtime system generates faiures according to weibull distribution\n");
+        }
+      }
+      int seed;
+      if (!CmiGetArgIntDesc(argv,"+failureSeed",&seed,"seed to generate the random sequencea")){
+        CkAbort("Needs to provide failure seed");
+      }else{
+        failureSeed = seed;
+      }
+    }
+  }
+
 #endif
 
   // shut down program in ring fashion to allow projections output w/o IO error
@@ -1399,9 +1427,11 @@ void _initCharm(int unused_argc, char **argv)
 
 
 #if __FAULT__
-       if(killFlag){                                                  
+       if(killFlag == 1){ 
                 readKillFile();                                        
         }
+
+
 #endif
 
 }