refine failure generator
[charm.git] / src / ck-core / ckmemcheckpoint.C
index 3be7319ec988bb918c8936015c2eee6dd106b55b..0157a4424f94fd590477a71294296fedfdb54345 100644 (file)
@@ -80,7 +80,6 @@ CpvDeclare(int, _remoteCrashedNode);
 // static, so that it is accessible from Converse part
 int CkMemCheckPT::inRestarting = 0;
 int CkMemCheckPT::inCheckpointing = 0;
-int CkMemCheckPT::aboutToDie = 0;
 int CkMemCheckPT::replicaAlive = 1;
 int CkMemCheckPT::inLoadbalancing = 0;
 double CkMemCheckPT::startTime;
@@ -114,7 +113,6 @@ double s_alpha;
 double killTime=0.0;
 extern void killLocal(void *_dummy,double curWallTime);
 extern void sendKillNotify(void *_dummy,double curWallTime);
-extern void verifyDeadth(void *_dummy,double curWallTime);
 extern void injectSoftFailure(void *_dummy,double curWallTime);
 #endif
 
@@ -179,8 +177,6 @@ static int recoverRemoteProcDataHandlerIdx;
 static int recoverRemoteArrayDataHandlerIdx;
 static int notifyHandlerIdx;
 static int replicaDyingNotifyHandlerIdx;
-static int replicaDeadNotifyHandlerIdx;
-static int replicaDyingBroadcastHandlerIdx;
 // compute the backup processor
 // FIXME: avoid crashed processors
 #if CMK_CONVERSE_MPI
@@ -408,8 +404,8 @@ CkMemCheckPT::CkMemCheckPT(int w)
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
-  //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-  //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+  CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+  CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
   chkpTable[0] = NULL;
   chkpTable[1] = NULL;
@@ -417,7 +413,6 @@ CkMemCheckPT::CkMemCheckPT(int w)
   recvIterCount = 0;
   localDecided = false;
   softFailureInjected = false;
-  chkpCount=0;
   if(killFlag == 2){
     localSeed = failureSeed;
     softLocalSeed = failureSeed*2;
@@ -431,70 +426,75 @@ CkMemCheckPT::CkMemCheckPT(int w)
   }
 }
 
+// Sends a message tagged with replicaBeginFailureInjectionHandlerIdx to the
+// other partition (CmiMyPartition()^1); the first send argument is 0.
+// NOTE(review): another definition of replicaInjectFailure() appears later in
+// this hunk -- confirm that only one of them is compiled.
+void CkMemCheckPT::replicaInjectFailure(){
+  // Allocate exactly as many bytes as are sent. The buffer used to hold only
+  // the header while the send length also covered an unsigned int payload,
+  // so the send read past the end of the allocation.
+  const int msgSize = CmiMsgHeaderSizeBytes+sizeof(unsigned int);
+  char * msg = (char*)CmiAlloc(msgSize);
+  // NOTE(review): the receiving handler is not visible here; the payload is
+  // zeroed so that it is well defined -- confirm which value it expects.
+  *(unsigned int *)(msg+CmiMsgHeaderSizeBytes) = 0;
+  CmiSetHandler(msg, replicaBeginFailureInjectionHandlerIdx);
+  CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,msgSize,msg);
+}
+
+// Draws the delay until the next hard failure from the configured
+// distribution ("E": exponential with mean MTBF, "W": Weibull with scale
+// alpha and shape beta) and asks element 1 of thisProxy to kill itself after
+// that many seconds.
+// NOTE(review): another definition of generateFailure() appears later in this
+// hunk -- confirm that only one of them is compiled.
+void CkMemCheckPT::generateFailure(){
+  int rand3 = rand_r(&localSeed);
+  // Scale the sample into [0,1). Dividing by RAND_MAX itself allowed u == 1,
+  // which turned log(1-u) into log(0) and the delay into infinity.
+  double u = ((double)rand3)/((double)RAND_MAX+1.0);
+  double sec;
+  if(strcmp(failureDist,"E")==0)
+    sec = -log(1.0 - u)*MTBF;
+  else if(strcmp(failureDist,"W")==0)
+    sec = alpha*pow(-log(1.0 - u),1/beta);
+  else{
+    // Without this branch sec was read uninitialized for any other setting.
+    CkPrintf("[%d][%d] unknown failure distribution \"%s\", no hard failure injected\n",CmiMyPartition(),CkMyPe(),failureDist);
+    return;
+  }
+  thisProxy[1].killAfter(sec);
+}
+
+
+// Sends a message tagged with replicaBeginFailureInjectionHandlerIdx to the
+// other partition (CmiMyPartition()^1); the first send argument is 0.
+// NOTE(review): another definition of replicaInjectFailure() appears earlier
+// in this hunk -- confirm that only one of them is compiled.
+void CkMemCheckPT::replicaInjectFailure(){
+  // Allocate exactly as many bytes as are sent. The buffer used to hold only
+  // the header while the send length also covered an unsigned int payload,
+  // so the send read past the end of the allocation.
+  const int msgSize = CmiMsgHeaderSizeBytes+sizeof(unsigned int);
+  char * msg = (char*)CmiAlloc(msgSize);
+  // NOTE(review): the receiving handler is not visible here; the payload is
+  // zeroed so that it is well defined -- confirm which value it expects.
+  *(unsigned int *)(msg+CmiMsgHeaderSizeBytes) = 0;
+  CmiSetHandler(msg, replicaBeginFailureInjectionHandlerIdx);
+  CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,msgSize,msg);
+}
+
+// Draws the delay until the next hard failure from the configured
+// distribution ("E": exponential with mean MTBF, "W": Weibull with scale
+// alpha and shape beta), records the planned kill time in killTime, and
+// schedules sendKillNotify one second before that time.
+// NOTE(review): another definition of generateFailure() appears earlier in
+// this hunk -- confirm that only one of them is compiled.
+void CkMemCheckPT::generateFailure(){
+  int rand3 = rand_r(&localSeed);
+  // Scale the sample into [0,1). Dividing by RAND_MAX itself allowed u == 1,
+  // which turned log(1-u) into log(0) and the delay into infinity.
+  double u = ((double)rand3)/((double)RAND_MAX+1.0);
+  double sec;
+  if(strcmp(failureDist,"E")==0)
+    sec = -log(1.0 - u)*MTBF;
+  else if(strcmp(failureDist,"W")==0)
+    sec = alpha*pow(-log(1.0 - u),1/beta);
+  else{
+    // Without this branch sec was read uninitialized for any other setting.
+    printf("[%d][%d] unknown failure distribution \"%s\", no hard failure injected\n",CmiMyPartition(),CkMyPe(),failureDist);
+    return;
+  }
+  killTime = CmiWallTimer()+sec;
+  printf("[%d][%d] inject hard failure after %.6lf s (MEMCKPT)\n",CmiMyPartition(),CkMyPe(),sec);
+  // CcdCallFnAfter takes milliseconds; the notify fires 1 s ahead of killTime.
+  CcdCallFnAfter(sendKillNotify,NULL,(sec-1)*1000);
+}
+
+// Timer callback (scheduled through CcdCallFnAfter) that tells the other
+// partition this process is about to die and then schedules the kill itself.
+// While a checkpoint or restart is in progress, or localStarted is 1, it only
+// re-arms itself to run again after 500 ms.
+// Neither _dummy nor curWallTime is used by the body.
 void sendKillNotify(void *_dummy,double curWallTime){
   if(CkInCheckpointing()||CpvAccess(localStarted)==1||CkInRestarting()){
     //in checkpointing or restart, delaying sending the notify
+    CkPrintf("[%d][%d]in checkpointing, recheck after 0.5s at %lf\n", CmiMyPartition(), CkMyPe(), CmiWallTimer());
     CcdCallFnAfter(sendKillNotify,NULL,500);
   }else{
     CkMemCheckPT::aboutToDie =  1;
-    //send the notify to my replica, so my replica won't communicate with me until the phase is clear
-    //char * msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
-    //CmiSetHandler(msg, replicaDyingNotifyHandlerIdx);
-    //CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,msg);
+    // Header-only message for the other partition; its handler
+    // (replicaDyingNotify) sets aboutToDie on the receiving side.
     char * msg1 = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
     CmiSetHandler(msg1, replicaDyingNotifyHandlerIdx);
     CmiRemoteSyncSendAndFree(1,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,msg1);
     
     //now it can die
-    double sec = 0.001;
+    // Fallback delay in seconds, used when killTime has already passed.
+    double sec = 0.01;
     if(CmiWallTimer()<killTime){
       sec=killTime-CmiWallTimer();  
     }
     CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
     checkptMgr[1].killAfter(sec);
     
-    sec +=1;
-    CcdCallFnAfter(verifyDeadth,NULL,sec*1000); 
   }
 }
 
-void replicaDyingNotify(char * msg){
-  CkMemCheckPT::aboutToDie =  1;
-  CmiFree(msg);
-  //char * rmsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
-  //CmiSetHandler(rmsg, replicaDyingBroadcastHandlerIdx);
-  //CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes, (char *)rmsg);
-  //then norify everyone
+// Records the planned kill time (now + sec seconds) in killTime, logs it, and
+// schedules killLocal to run after sec seconds (CcdCallFnAfter takes
+// milliseconds, hence the factor of 1000).
+void CkMemCheckPT::killAfter(double sec){
+  killTime = CmiWallTimer()+sec;
+  printf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
+  CcdCallFnAfter(killLocal,NULL,sec*1000);
 }
 
-void replicaDyingBroadcast(char * msg){
+// Message handler registered as replicaDyingNotifyHandlerIdx; sendKillNotify
+// sends it from the other partition. Sets aboutToDie and frees the message.
+void replicaDyingNotify(char * msg){
   CkMemCheckPT::aboutToDie =  1;
   CmiFree(msg);
 }
 
-void replicaDeadNotify(char * msg){
-  CkMemCheckPT::aboutToDie =  0;
-  CmiFree(msg);
-}
-
-void CkMemCheckPT::generateFailure(){
-  int rand3 = rand_r(&localSeed);
-  double sec;
-  if(strcmp(failureDist,"E")==0)
-    sec = -log(1.0f - ((double)rand3)/(long long int)(RAND_MAX))*MTBF;
-  else if(strcmp(failureDist,"W")==0)
-    sec = alpha*pow(-log(1.0f - ((double)rand3)/(long long int)(RAND_MAX)),1/beta);
-  killTime = CmiWallTimer()+sec;
-  printf("[%d][%d] inject hard failure after %.6lf s (MEMCKPT)\n",CmiMyPartition(),CkMyPe(),sec);
-  CcdCallFnAfter(sendKillNotify,NULL,(sec-1)*1000);
-}
-
-void CkMemCheckPT::killAfter(double sec){
-  killTime = CmiWallTimer()+sec;
-  CkPrintf("[%d][%d] To be killed after %.6lf s (MEMCKPT) %lf\n",CmiMyPartition(),CkMyPe(),sec, killTime);
-  CcdCallFnAfter(killLocal,NULL,sec*1000);
-}
-
 void CkMemCheckPT::generateSoftFailure(){
   int rand = rand_r(&softLocalSeed);
   double sec;
@@ -528,16 +528,13 @@ void CkMemCheckPT::pup(PUP::er& p)
   p|peCount;
   p|localSeed;
   p|softLocalSeed;
-  p|chkpCount;
-  p|lastChkpTime;
-  p|chkpPeriod;
   if (p.isUnpacking()) {
     recvCount = 0;
 #if CMK_CONVERSE_MPI
     void pingBuddy();
     void pingCheckHandler();
//   CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
//   CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+    CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+    CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
     maxIter = -1;
     recvIterCount = 0;
@@ -589,7 +586,7 @@ void CkMemCheckPT::startChkp(){
   if(CkInCheckpointing()){
     return;
   }
-  CkPrintf("[%d][%d]start checkpoint at %lf in %lf\n",CmiMyPartition(), CkMyPe(),CmiWallTimer(),CmiWallTimer()-startTime);
+  CkPrintf("start checkpoint at %lf in %lf\n",CmiWallTimer(),CmiWallTimer()-startTime);
   CkStartMemCheckpoint(cpCallback);
 }
 
@@ -1011,13 +1008,9 @@ void CkMemCheckPT::doneBothComparison(){
   inCheckpointing = 0;
   notifyReplica = 0;
   if(CkMyPe() == 0){
-    CmiPrintf("[%d][%d] Checkpoint finished in %f seconds at %lf, checkpoint size %d, memory usage %lf, sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer(),size, CmiMemoryUsage()/1048576.0);
+    CmiPrintf("[%d][%d] Checkpoint finished in %f seconds at %lf, checkpoint size %d, sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer(),size);
   }
   CKLOCMGR_LOOP(mgr->resumeFromChkp(););//TODO wait until the replica finish the checkpoint
-  if(chkpCount!=0)
-    chkpPeriod = CmiWallTimer()-lastChkpTime;
-  chkpCount++;
-  lastChkpTime = CmiWallTimer();
 }
 
 void CkMemCheckPT::RollBack(){
@@ -1666,7 +1659,7 @@ void CkMemCheckPT::RollBack(){
       }
       if (CmiMyPe() == BuddyPE(thisFailedPe)) {
         lastPingTime = CmiWallTimer();
-//        CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+        CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
       }
       //inject next failure
       if(killFlag==2){
@@ -1732,7 +1725,6 @@ void CkMemCheckPT::RollBack(){
     void CkStartMemCheckpoint(CkCallback &cb)
     {
 #if CMK_MEM_CHECKPOINT
-      //only not letting the dying partition continue checkpoint
       if(CkMemCheckPT::aboutToDie&&CmiMyPartition()==0)
        return;
       CkPrintf("partition %d start checkpoint\n",CmiMyPartition());
@@ -1751,9 +1743,9 @@ void CkMemCheckPT::RollBack(){
       // store user callback and user data
       CkMemCheckPT::cpCallback = cb;
 
+
       //send to my replica that checkpoint begins 
       if(CkReplicaAlive()==1){
-        CkPrintf("[%d][%d]send checkpoint start notification to my partition\n",CmiMyPartition(), CkMyPe());
         char * msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
         CmiSetHandler(msg, replicaChkpStartHandlerIdx);
         CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,msg);
@@ -1762,7 +1754,7 @@ void CkMemCheckPT::RollBack(){
       CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
       checkptMgr.chkpLocalStart();
       // broadcast to start check pointing
-      if(CmiNumPartition()==1||(CmiNumPartition()==2&&CpvAccess(remoteStarted)==1)||(CkReplicaAlive()==0)){
+      if(CmiNumPartition()==1||(CmiNumPartition()==2&&CpvAccess(remoteStarted)==1)||CkReplicaAlive()==0){
         CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
         checkptMgr.doItNow(CkMyPe());
       }
@@ -2187,13 +2179,8 @@ void CkMemCheckPT::RollBack(){
          char * msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
          CmiSetHandler(msg, replicaChkpStartHandlerIdx);
          CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes,msg);
-       }
-       {
           CkMemCheckPT::aboutToDie = 0;
-          //char * msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
-          //CmiSetHandler(msg, replicaDeadNotifyHandlerIdx);
-          //CmiRemoteSyncSendAndFree(0,CmiMyPartition(),CmiMsgHeaderSizeBytes,msg);
-        }
+       }
       }
     }
     // called on crashed processor
@@ -2477,7 +2464,6 @@ void CkMemCheckPT::RollBack(){
 #if CMK_MEM_CHECKPOINT
       // notify
       CkMemCheckPT::inRestarting = 1;
-      CkMemCheckPT::aboutToDie =  0;
       int diepe = *(int *)(msg+CmiMsgHeaderSizeBytes);
       notify_crash(diepe);
       // send message to crash pe to let it restart
@@ -2561,9 +2547,6 @@ void CkMemCheckPT::RollBack(){
         //for replica
         recvRemoteChkpHandlerIdx = CkRegisterHandler((CmiHandler)recvRemoteChkpHandler);
         replicaDieHandlerIdx = CkRegisterHandler((CmiHandler)replicaDieHandler);
-        replicaDyingNotifyHandlerIdx = CkRegisterHandler((CmiHandler)replicaDyingNotify);
-        replicaDeadNotifyHandlerIdx = CkRegisterHandler((CmiHandler)replicaDeadNotify);
-        replicaDyingBroadcastHandlerIdx = CkRegisterHandler((CmiHandler)replicaDyingBroadcast);
         replicaChkpStartHandlerIdx = CkRegisterHandler((CmiHandler)replicaChkpStartHandler);
         replicaDieBcastHandlerIdx = CkRegisterHandler((CmiHandler)replicaDieBcastHandler);
         replicaRecoverHandlerIdx = CkRegisterHandler((CmiHandler)replicaRecoverHandler);
@@ -2573,6 +2556,7 @@ void CkMemCheckPT::RollBack(){
         recvPhaseHandlerIdx = CkRegisterHandler((CmiHandler)recvPhaseHandler);
         recoverRemoteProcDataHandlerIdx = CkRegisterHandler((CmiHandler)recoverRemoteProcDataHandler);
         recoverRemoteArrayDataHandlerIdx = CkRegisterHandler((CmiHandler)recoverRemoteArrayDataHandler);
+        replicaDyingNotifyHandlerIdx = CkRegisterHandler((CmiHandler)replicaDyingNotify);
 
 #if CMK_CONVERSE_MPI
         pingHandlerIdx = CkRegisterHandler((CmiHandler)pingHandler);
@@ -2657,7 +2641,7 @@ void CkMemCheckPT::RollBack(){
         }else{ 
 #if CMK_CONVERSE_MPI
           printf("[%d][%d] KillLocal called at %.6lf \n",CmiMyPartition(),CkMyPe(),CmiWallTimer());          
-         CkDieNow();
+          CkDieNow();
 #else 
           kill(getpid(),SIGKILL);                                               
 #endif
@@ -2668,21 +2652,6 @@ void CkMemCheckPT::RollBack(){
         CmiAbort("kill() not supported!");
       }
 #endif
-
-      void verifyDeadth(void * _dummy, double curWallTime){
-        char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
-        *(int *)(msg+CmiMsgHeaderSizeBytes) = 1;
-        CmiSetHandler(msg, buddyDieHandlerIdx);
-        CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);
-        //send to everyone in the other world
-        if(CmiNumPartition()!=1){
-          char * rMsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
-          *(int *)(rMsg+CmiMsgHeaderSizeBytes) = 1;
-          CmiSetHandler(rMsg, replicaDieHandlerIdx);
-          CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)rMsg);
-       }
-      }
-
       void injectSoftFailure(void *_dummy,double curWallTime){
         if(!CkInCheckpointing()&&!CkInRestarting()){
           CkPrintf("soft failure injected\n");