working version on BGP
author Xiang Ni <xini@login4.intrepid.alcf.anl.gov>
Sat, 19 Jan 2013 20:21:04 +0000 (20:21 +0000)
committer Xiang Ni <xini@login4.intrepid.alcf.anl.gov>
Sat, 19 Jan 2013 20:21:04 +0000 (20:21 +0000)
src/ck-core/ckfaultinjector.C
src/ck-core/cklocation.C
src/ck-core/ckmemcheckpoint.C
src/conv-core/convcore.c

index 3fbe2795e96c3871988f4ca439e75f837ec55e7a..349d34142dd23c123e46bd0b7b36e1dbb511c41c 100644 (file)
@@ -53,10 +53,10 @@ extern "C" {
 
   void FI_injectFault() {
     CpvAccess(faultInjector)->injected = 0;
 
   void FI_injectFault() {
     CpvAccess(faultInjector)->injected = 0;
-    int rand_num = rand_r(&CpvAccess(faultInjector)->globalSeed);
-    if((rand_num%CkNumPes() == CkMyPe()) && (rand_num%CmiNumPartition() == CmiMyPartition())) {
+   // int rand_num = rand_r(&CpvAccess(faultInjector)->globalSeed);
+   // if((rand_num%CkNumPes() == CkMyPe()) && (rand_num%CmiNumPartition() == CmiMyPartition())) {
       CpvAccess(faultInjector)->injectFault();
       CpvAccess(faultInjector)->injectFault();
-    }
+   // }
 //     FI_stopInjection();
  /*   if(CpvAccess(faultInjector)->inject) {
       CcdCallFnAfter((CcdVoidFn)FI_injectFault, NULL, CpvAccess(faultInjector)->fault_period*1000);
 //     FI_stopInjection();
  /*   if(CpvAccess(faultInjector)->inject) {
       CcdCallFnAfter((CcdVoidFn)FI_injectFault, NULL, CpvAccess(faultInjector)->fault_period*1000);
@@ -76,12 +76,12 @@ FaultInjector::FaultInjector(char **argv) : fault_period(DEFAULT_FAULT_PERIOD),
     //CcdCallFnAfter((CcdVoidFn)FI_injectFault, NULL, fault_period*1000);
     injected = 1;
     int seed = 3877;
     //CcdCallFnAfter((CcdVoidFn)FI_injectFault, NULL, fault_period*1000);
     injected = 1;
     int seed = 3877;
-    if(!CmiGetArgInt(argv, "+inj_seed",&seed)){
+    /*if(!CmiGetArgInt(argv, "+inj_seed",&seed)){
       timeval tm;
       gettimeofday(&tm,NULL);
       seed = tm.tv_sec;
       timeval tm;
       gettimeofday(&tm,NULL);
       seed = tm.tv_sec;
-    }
-    globalSeed = seed%1000;
+    }*/
+    globalSeed = seed;
     localSeed = CkMyPe()*globalSeed;
   }
 }
     localSeed = CkMyPe()*globalSeed;
   }
 }
@@ -134,22 +134,24 @@ void FaultInjector::injectFault() {
   //in the target select a memory space
   CmiUInt8 memory = rand_r(&seed) % userZones[target].length;
 
   //in the target select a memory space
   CmiUInt8 memory = rand_r(&seed) % userZones[target].length;
 
-  char *location = (char*)userZones[target].start + memory;
+  //char *location = (char*)userZones[target].start + memory;
+  long long int *location = (long long int*)userZones[target].start;
   
 //  seed = seed* CmiWallTimer();
   int offset = rand_r(&seed) % 8;
 
   
 //  seed = seed* CmiWallTimer();
   int offset = rand_r(&seed) % 8;
 
-  unsigned char bugger = 0xFF;
-  bugger = bugger ^ ((unsigned char)1<<offset);
+  long long int  bugger = 0xFFFFFFFFFFFFFFFFLL;
+//  bugger = bugger ^ ((unsigned char)1<<offset);
 
 #if FI_DEBUG
   printf("[%d-%d] Injecting Fault in zone %d at memory %p offset %d, value %X size %d\n",CmiMyPartition(),CmiMyPe(),target,location,offset,(int)*location,userZones.size());
 #endif
 
 #if FI_DEBUG
   printf("[%d-%d] Injecting Fault in zone %d at memory %p offset %d, value %X size %d\n",CmiMyPartition(),CmiMyPe(),target,location,offset,(int)*location,userZones.size());
 #endif
-  if(*location & ((unsigned char)1<<offset)) {
+  /*if(*location & ((unsigned char)1<<offset)) {
     *location = *location & bugger;
   }else{
     *location = *location | ((unsigned char)1<<offset);
     *location = *location & bugger;
   }else{
     *location = *location | ((unsigned char)1<<offset);
-  }
+  }*/
+  *location = *location ^ bugger;
 #if FI_DEBUG
   printf("[%d-%d] Injected Fault in zone %d at memory %p offset %d, value %X size %d\n",CmiMyPartition(),CmiMyPe(),target,location,offset,(int)*location,userZones.size());
 #endif
 #if FI_DEBUG
   printf("[%d-%d] Injected Fault in zone %d at memory %p offset %d, value %X size %d\n",CmiMyPartition(),CmiMyPe(),target,location,offset,(int)*location,userZones.size());
 #endif
index 8d1e102c25796c19e4c96e95ca0ae245f4b74bb4..5fb4c435114696061eee925ca4383816bbd2eb50 100644 (file)
@@ -1100,7 +1100,9 @@ void CkMigratable::pup(PUP::er &p) {
        p | asyncEvacuate;
        if(p.isUnpacking()){myRec->AsyncEvacuate(asyncEvacuate);}
        
        p | asyncEvacuate;
        if(p.isUnpacking()){myRec->AsyncEvacuate(asyncEvacuate);}
        
-       p | atsync_chkp_iter;   
+       if(p.isUnpacking()){
+         atsync_chkp_iter = -1;
+       }
        if(p.isUnpacking()){
                resetForChkp();
        }
        if(p.isUnpacking()){
                resetForChkp();
        }
@@ -1245,6 +1247,9 @@ void CkMigratable::setChkpResumeClient(CkCallback & _cb)
 void CkMigratable::AtChkpSync()
 {
        if(usesChkpAtSync){
 void CkMigratable::AtChkpSync()
 {
        if(usesChkpAtSync){
+         if(CkMyPe()==0){
+//         CkPrintf("at chkp sync\n");
+         }
                if(CmiNumPartition()==1){
                        chkp_cb.send();
                        return;
                if(CmiNumPartition()==1){
                        chkp_cb.send();
                        return;
@@ -1297,6 +1302,9 @@ void CkMigratable::recvChkpIter(void * _iter){
                int iter = *(int *)_iter;
                nextChkpIter = iter;
                nextChkpDecided = true;
                int iter = *(int *)_iter;
                nextChkpIter = iter;
                nextChkpDecided = true;
+               if(CkMyPe()==0){
+                 CkPrintf("receive chkp iter %d %d\n",atsync_chkp_iter, nextChkpIter);
+               }
                if(atsync_chkp_iter>nextChkpIter){
                        CkAbort("impossible state in notify\n");
                }
                if(atsync_chkp_iter>nextChkpIter){
                        CkAbort("impossible state in notify\n");
                }
index bd30895ed2ae1d9413cd48d83a58bfe7c82b3c5b..26da67f9ed6878182f316eb20c62cd6926f8375f 100644 (file)
@@ -391,8 +391,8 @@ CkMemCheckPT::CkMemCheckPT(int w)
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
-  CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-  CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+  //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+  //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
   chkpTable[0] = NULL;
   chkpTable[1] = NULL;
 #endif
   chkpTable[0] = NULL;
   chkpTable[1] = NULL;
@@ -424,8 +424,8 @@ void CkMemCheckPT::pup(PUP::er& p)
 #if CMK_CONVERSE_MPI
     void pingBuddy();
     void pingCheckHandler();
 #if CMK_CONVERSE_MPI
     void pingBuddy();
     void pingCheckHandler();
-    CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-    CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+    //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+    //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
     maxIter = -1;
     recvIterCount = 0;
 #endif
     maxIter = -1;
     recvIterCount = 0;
@@ -436,6 +436,9 @@ void CkMemCheckPT::pup(PUP::er& p)
 void CkMemCheckPT::getIter(){
   localDecided = true;
   localMaxIter = maxIter+1;
 void CkMemCheckPT::getIter(){
   localDecided = true;
   localMaxIter = maxIter+1;
+  if(CkMyPe()==0){
+    CkPrintf("local max iter is %d\n",localMaxIter);
+  }
   contribute(sizeof(int),&localMaxIter,CkReduction::max_int,CkCallback(CkReductionTarget(CkMemCheckPT,recvMaxIter),thisProxy));
   int elemCount = CkCountChkpSyncElements();
   if(CkMyPe()==0)
   contribute(sizeof(int),&localMaxIter,CkReduction::max_int,CkCallback(CkReductionTarget(CkMemCheckPT,recvMaxIter),thisProxy));
   int elemCount = CkCountChkpSyncElements();
   if(CkMyPe()==0)
@@ -455,12 +458,15 @@ void CkMemCheckPT::recvIter(int iter){
 
 void CkMemCheckPT::recvMaxIter(int iter){
   localDecided = false;
 
 void CkMemCheckPT::recvMaxIter(int iter){
   localDecided = false;
+  if(CkMyPe()==0)
+    CkPrintf("checkpoint iteration is %d\n",iter);
   CKLOCMGR_LOOP(mgr->recvChkpIter(iter););
 }
 
 void CkMemCheckPT::reachChkpIter(){
   recvIterCount++;
   elemCount = CkCountChkpSyncElements();
   CKLOCMGR_LOOP(mgr->recvChkpIter(iter););
 }
 
 void CkMemCheckPT::reachChkpIter(){
   recvIterCount++;
   elemCount = CkCountChkpSyncElements();
+  //CkPrintf("[%d] received %d local %d\n",CkMyPe(),recvIterCount, elemCount);
   if(recvIterCount == elemCount){
     recvIterCount = 0;
     contribute(CkCallback(CkReductionTarget(CkMemCheckPT,startChkp),thisProxy[0]));
   if(recvIterCount == elemCount){
     recvIterCount = 0;
     contribute(CkCallback(CkReductionTarget(CkMemCheckPT,startChkp),thisProxy[0]));
@@ -797,7 +803,7 @@ void CkMemCheckPT::startCheckpoint(){
           CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
           CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
           if(CkMyPe() == CpvAccess(_remoteCrashedNode)){
           CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
           CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
           if(CkMyPe() == CpvAccess(_remoteCrashedNode)){
-            CkPrintf("[%d] sendProcdata\n",CkMyPe());
+            CkPrintf("[%d] sendProcdata at %lf\n",CkMyPe(),CmiWallTimer());
           }
         }
         //send the array checkpoint data
           }
         }
         //send the array checkpoint data
@@ -808,7 +814,7 @@ void CkMemCheckPT::startCheckpoint(){
         CmiSetHandler(env,recoverRemoteArrayDataHandlerIdx);
         CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
         if(CkMyPe() == CpvAccess(_remoteCrashedNode))
         CmiSetHandler(env,recoverRemoteArrayDataHandlerIdx);
         CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
         if(CkMyPe() == CpvAccess(_remoteCrashedNode))
-          CkPrintf("[%d] sendArraydata\n",CkMyPe());
+          CkPrintf("[%d] sendArraydata at %lf\n",CkMyPe(),CmiWallTimer());
       }else{
         if(CkMyPe() == CpvAccess(_remoteCrashedNode)){
           int pointer = CpvAccess(curPointer);
       }else{
         if(CkMyPe() == CpvAccess(_remoteCrashedNode)){
           int pointer = CpvAccess(curPointer);
@@ -850,20 +856,20 @@ void CkMemCheckPT::doneRComparison(int ret){
     }else{
       CmiPrintf("[%d][%d] Local checkpoint finished in %f seconds at %lf, waiting for replica ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer());
     }
     }else{
       CmiPrintf("[%d][%d] Local checkpoint finished in %f seconds at %lf, waiting for replica ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer());
     }
-    if(notifyReplica == 0){
-      //notify the replica am done
-      char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
-      *(int *)(msg+CmiMsgHeaderSizeBytes) = ret;
-      CmiSetHandler(msg,replicaChkpDoneHandlerIdx);
-      CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)msg);
-      notifyReplica = 1;
-    }
- // }
- /* else{
+  //}
+  /*else{
     CkPrintf("[%d][%d] going to RollBack %d at %lf checkpoint in %lf\n", CmiMyPartition(),CkMyPe(),ret,CmiWallTimer(), CmiWallTimer()-startTime);
     startTime = CmiWallTimer();
     thisProxy.RollBack();
   }*/
     CkPrintf("[%d][%d] going to RollBack %d at %lf checkpoint in %lf\n", CmiMyPartition(),CkMyPe(),ret,CmiWallTimer(), CmiWallTimer()-startTime);
     startTime = CmiWallTimer();
     thisProxy.RollBack();
   }*/
+  if(notifyReplica == 0){
+    //notify the replica am done
+    char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
+    *(int *)(msg+CmiMsgHeaderSizeBytes) = ret;
+    CmiSetHandler(msg,replicaChkpDoneHandlerIdx);
+    CmiRemoteSyncSendAndFree(0,CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)msg);
+    notifyReplica = 1;
+  }
 }
 
 void CkMemCheckPT::doneBothComparison(){
 }
 
 void CkMemCheckPT::doneBothComparison(){
@@ -890,7 +896,7 @@ void CkMemCheckPT::doneBothComparison(){
 void CkMemCheckPT::RollBack(){
   //restore group data
   checkpointed = 0;
 void CkMemCheckPT::RollBack(){
   //restore group data
   checkpointed = 0;
-  CkMemCheckPT::inRestarting = 1;
+  inRestarting = 1;
   int pointer = CpvAccess(curPointer)^1;//use the previous one
   CkCheckPTMessage* chkpMsg = CpvAccess(chkpBuf)[pointer];
   PUP::fromMem p(chkpMsg->packData);   
   int pointer = CpvAccess(curPointer)^1;//use the previous one
   CkCheckPTMessage* chkpMsg = CpvAccess(chkpBuf)[pointer];
   PUP::fromMem p(chkpMsg->packData);   
@@ -1498,7 +1504,7 @@ void CkMemCheckPT::RollBack(){
       }
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
       inRestarting = 0;
       }
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
       inRestarting = 0;
-
+      maxIter = -1;
 #if CMK_CONVERSE_MPI   
       if(CmiNumPartition()!=1){
         CpvAccess(recvdProcChkp) = 0;
 #if CMK_CONVERSE_MPI   
       if(CmiNumPartition()!=1){
         CpvAccess(recvdProcChkp) = 0;
@@ -1532,10 +1538,19 @@ void CkMemCheckPT::RollBack(){
     void CkMemCheckPT::recoverFromSoftFailure()
     {
       inRestarting = 0;
     void CkMemCheckPT::recoverFromSoftFailure()
     {
       inRestarting = 0;
+      maxIter = -1;
       CpvAccess(recvdRemote) = 0;
       CpvAccess(recvdLocal) = 0;
       CpvAccess(localChkpDone) = 0;
       CpvAccess(remoteChkpDone) = 0;
       CpvAccess(recvdRemote) = 0;
       CpvAccess(recvdLocal) = 0;
       CpvAccess(localChkpDone) = 0;
       CpvAccess(remoteChkpDone) = 0;
+      CpvAccess(remoteReady) = 0;
+      CpvAccess(localReady) = 0;
+      inCheckpointing = 0;
+      notifyReplica = 0;
+      CpvAccess(remoteStarted) = 0;
+      CpvAccess(localStarted) = 0;
+      CpvAccess(_remoteCrashedNode) = -1;
+      CkMemCheckPT::replicaAlive = 1;
       inCheckpointing = 0;
       notifyReplica = 0;
       if(CkMyPe() == 0){
       inCheckpointing = 0;
       notifyReplica = 0;
       if(CkMyPe() == 0){
@@ -1633,7 +1648,7 @@ void CkMemCheckPT::RollBack(){
       CmiAssert(CkMyPe() == _diePE);
       count ++;
       if (count == CkNumPes()||(CpvAccess(resilience)==1&&count==1)) {
       CmiAssert(CkMyPe() == _diePE);
       count ++;
       if (count == CkNumPes()||(CpvAccess(resilience)==1&&count==1)) {
-        printf("restart begin on %d\n",CkMyPe());
+        printf("restart begin on %d at %lf\n",CkMyPe(),CmiWallTimer());
         CkRestartCheckPointCallback(NULL, NULL);
         count = 0;
       }
         CkRestartCheckPointCallback(NULL, NULL);
         count = 0;
       }
@@ -1906,10 +1921,10 @@ void CkMemCheckPT::RollBack(){
     }
     
     static void askRecoverDataHandler(char * msg){
     }
     
     static void askRecoverDataHandler(char * msg){
+      if(CmiMyPe() == CpvAccess(_remoteCrashedNode))
+       CmiPrintf("[%d][%d] receive replica phase change at %lf\n",CmiMyPartition(),CmiMyPe(),CmiWallTimer());
       if(CpvAccess(resilience)!=1){
         CpvAccess(remoteReady)=1;
       if(CpvAccess(resilience)!=1){
         CpvAccess(remoteReady)=1;
-        if(CmiMyPe() == CpvAccess(_remoteCrashedNode))
-          CmiPrintf("[%d][%d] receive replica phase change at %lf\n",CmiMyPartition(),CmiMyPe(),CmiWallTimer());
         if(CpvAccess(localReady)==1){
           if(CmiMyPe() == CpvAccess(_remoteCrashedNode))
           {    
         if(CpvAccess(localReady)==1){
           if(CmiMyPe() == CpvAccess(_remoteCrashedNode))
           {    
@@ -1917,7 +1932,7 @@ void CkMemCheckPT::RollBack(){
             CkPackMessage(&env);
             CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
             CmiRemoteSyncSendAndFree(CmiMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
             CkPackMessage(&env);
             CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
             CmiRemoteSyncSendAndFree(CmiMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
-            CmiPrintf("[%d] sendProcdata after request\n",CmiMyPe());
+            CmiPrintf("[%d] sendProcdata after request at \n",CmiMyPe(),CmiWallTimer());
           }
           //send the array checkpoint data
           envelope * env = (envelope *)(UsrToEnv(CpvAccess(recoverArrayBuf)));
           }
           //send the array checkpoint data
           envelope * env = (envelope *)(UsrToEnv(CpvAccess(recoverArrayBuf)));
@@ -1936,7 +1951,7 @@ void CkMemCheckPT::RollBack(){
         CkPackMessage(&env);
         CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
         CmiRemoteSyncSendAndFree(CmiMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
         CkPackMessage(&env);
         CmiSetHandler(env,recoverRemoteProcDataHandlerIdx);
         CmiRemoteSyncSendAndFree(CmiMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
-        CmiPrintf("[%d] sendProcdata after request\n",CmiMyPe());
+        CmiPrintf("[%d] sendProcdata after request at %lf\n",CmiMyPe(),CmiWallTimer());
         
         CkCheckPTMessage * arrayMsg = (CkCheckPTMessage *)CkCopyMsg((void **)&CpvAccess(chkpBuf)[pointer]);
         arrayMsg->pointer = pointer;
         
         CkCheckPTMessage * arrayMsg = (CkCheckPTMessage *)CkCopyMsg((void **)&CpvAccess(chkpBuf)[pointer]);
         arrayMsg->pointer = pointer;
@@ -2250,7 +2265,7 @@ void CkMemCheckPT::RollBack(){
     {
 #if CMK_MEM_CHECKPOINT
       double now = CmiWallTimer();
     {
 #if CMK_MEM_CHECKPOINT
       double now = CmiWallTimer();
-      if (lastPingTime > 0 && now - lastPingTime > 4 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
+      if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
         //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
         int i, pe, buddy;
         // tell everyone the buddy dies
         //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
         int i, pe, buddy;
         // tell everyone the buddy dies
index b392f33cac9258efeed28a3b13653c40f4b9ec13..7db2ccca91fcc49d33aea6f5545aee0de03d47ee 100644 (file)
@@ -1158,7 +1158,8 @@ double CmiWallTimer()
 {
   unsigned long long currenttime;
   currenttime = rts_get_timebase();
 {
   unsigned long long currenttime;
   currenttime = rts_get_timebase();
-  return CpvAccess(clocktick)*(currenttime-inittime_wallclock);
+  //return CpvAccess(clocktick)*(currenttime-inittime_wallclock);
+  return CpvAccess(clocktick)*(currenttime);
 }
 
 double CmiCpuTimer()
 }
 
 double CmiCpuTimer()