detect failure after the first checkpoint, put vector into array first when pup if... replica_ft
author Xiang Ni <xiangni2@illinois.edu>
Sat, 13 Apr 2013 02:59:55 +0000 (02:59 +0000)
committer Xiang Ni <xiangni2@illinois.edu>
Sat, 13 Apr 2013 02:59:55 +0000 (02:59 +0000)
src/arch/mpi/machine.c
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.h
src/util/pup_stl.h
src/util/pup_util.C

index 6947317797c781a4f8964082e03a2526517c9a6a..0090fe3da9f6bf54e2a05b3ccf381fdda44db8a1 100644 (file)
@@ -1728,6 +1728,7 @@ void LrtsAbort(const char *message) {
 #endif
     CmiError("------------- Processor %d Exiting: Called CmiAbort ------------\n"
              "Reason: %s\n",CmiMyPe(),message);
+    exit(1);
     /*  CmiError(message); */
     CmiPrintStackTrace(0);
     m = CmiAlloc(CmiMsgHeaderSizeBytes);
index 715bdbe758c1851264ac08650e03a4786d7a003a..296dcd206da0722ff412f6196f0c9fb26ad24611 100644 (file)
@@ -80,6 +80,7 @@ CpvDeclare(int, _remoteCrashedNode);
 // static, so that it is accessible from Converse part
 int CkMemCheckPT::inRestarting = 0;
 int CkMemCheckPT::inCheckpointing = 0;
+int CkMemCheckPT::detectionOn=-1;
 int CkMemCheckPT::aboutToDie = 0;
 int CkMemCheckPT::replicaAlive = 1;
 int CkMemCheckPT::inLoadbalancing = 0;
@@ -402,11 +403,12 @@ CkMemCheckPT::CkMemCheckPT(int w)
   where = w;
   replicaAlive = 1;
   notifyReplica = 0;
+  chkpCount = 0;
 #if CMK_CONVERSE_MPI
   void pingBuddy();
   void pingCheckHandler();
-  CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-  CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+  //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+  //CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
 #endif
   chkpTable[0] = NULL;
   chkpTable[1] = NULL;
@@ -511,6 +513,7 @@ void CkMemCheckPT::pup(PUP::er& p)
   p|peCount;
   p|localSeed;
   p|softLocalSeed;
+  p|chkpCount;
   if (p.isUnpacking()) {
     recvCount = 0;
 #if CMK_CONVERSE_MPI
@@ -990,6 +993,17 @@ void CkMemCheckPT::doneBothComparison(){
   CpvAccess(curPointer)^=1;
   inCheckpointing = 0;
   notifyReplica = 0;
+  
+  chkpCount++;  
+  
+  if(detectionOn == -1){
+    detectionOn = 1;
+    void pingBuddy();
+    void pingCheckHandler();
+    CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
+    CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+  }
+  
   if(CkMyPe() == 0){
     CmiPrintf("[%d][%d] Checkpoint finished in %f seconds at %lf, checkpoint size %d, memory usage %lf sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime,CmiWallTimer(),size, CmiMemoryUsage()/1048576.0);
   }
@@ -1640,6 +1654,9 @@ void CkMemCheckPT::RollBack(){
           CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,CmiMsgHeaderSizeBytes,(char *)msg);
         }
       }
+      
+      detectionOn = -1;
+      
       if (CmiMyPe() == BuddyPE(thisFailedPe)) {
         lastPingTime = CmiWallTimer();
         CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
@@ -2448,7 +2465,7 @@ void CkMemCheckPT::RollBack(){
 #if CMK_MEM_CHECKPOINT
       // notify
       CkMemCheckPT::inRestarting = 1;
-      CkMemCheckPT::aboutToDie =  1;
+      CkMemCheckPT::aboutToDie =  0;
       int diepe = *(int *)(msg+CmiMsgHeaderSizeBytes);
       notify_crash(diepe);
       // send message to crash pe to let it restart
@@ -2472,7 +2489,7 @@ void CkMemCheckPT::RollBack(){
     {
 #if CMK_MEM_CHECKPOINT
       double now = CmiWallTimer();
-      if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
+      if (lastPingTime > 0 && now - lastPingTime > 6 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
         //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
         int i, pe, buddy;
         // tell everyone the buddy dies
@@ -2496,8 +2513,10 @@ void CkMemCheckPT::RollBack(){
           CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)rMsg);
         }
       }
-        else 
-          CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+        else{ 
+          if(CkMemCheckPT::detectionOn == 1)
+           CcdCallOnCondition(CcdPERIODIC_1s,(CcdVoidFn)pingCheckHandler,NULL);
+       }
 #endif
       }
 
index 3724f0b015421ce5b7d30c34029419e762abb614..fb4d4ae2e443fecc93156ec8c66f9ff1d37dc655 100644 (file)
@@ -125,6 +125,7 @@ public:
   static int inLoadbalancing;
   static int replicaAlive;
   static double startTime;
+  static int detectionOn;
   static char*  stage;
   
   bool inProgress;
index e7874062a7ca17b756eab54b78ba0554d60ef62d..f1e0efb9e69f55b7cd29b9c9b05a0150e3782c5d 100644 (file)
@@ -97,6 +97,20 @@ inline int PUP_stl_container_size(PUP::er &p,container &c) {
   p|nElem;
   return nElem; 
 }
+//Impl. util: pup every current item of a container of doubles by first
+//  copying the items into one contiguous buffer and then handing that
+//  buffer to the PUP::er in a single call.
+//  p: the PUP::er to drive.  c: the container whose items are pupped.
+//  One sync_item comment is emitted per element before the buffer is
+//  pupped. The staging buffer is a heap-backed std::vector: a
+//  variable-length stack array is not standard C++ and is limited by the
+//  stack size. Items are converted to double by value instead of being
+//  reinterpreted through a pointer cast.
+template <class container>
+inline void PUP_stl_container_items(PUP::er &p,container &c) {
+  std::vector<double> staged;
+  staged.reserve(c.size());
+  for (typename container::iterator it=c.begin();
+       it!=c.end();
+       ++it) {
+    p.syncComment(PUP::sync_item);
+    staged.push_back(*it);
+  }
+  //An empty buffer has no first element to take the address of, so pass
+  //  a null pointer together with the zero count in that case.
+  double *raw=staged.empty() ? static_cast<double *>(0) : &staged[0];
+  p(raw,(int)staged.size());
+}
 
 //Impl. util: pup each current item of a container (no allocation)
 template <class container, class dtype>
@@ -110,6 +124,24 @@ inline void PUP_stl_container_items(PUP::er &p,container &c) {
   }
 }
 
+//Pup a whole container of doubles: the element count first, then the
+//  items. When unpacking, the container is emptied and refilled with the
+//  doubles read one at a time; otherwise the current items are passed to
+//  the single-parameter PUP_stl_container_items.
+template <class container>
+inline void PUP_stl_container(PUP::er &p,container &c) {
+  p.syncComment(PUP::sync_begin_array);
+  const int count=PUP_stl_container_size(p,c);
+  if (!p.isUnpacking()) {
+    PUP_stl_container_items<container>(p,c);
+  } else {
+    //Unpacking: drop the old contents, then append each value as it is read
+    c.resize(0);
+    for (int left=count;left>0;--left) {
+      p.syncComment(PUP::sync_item);
+      double value;
+      p|value;
+      c.push_back(value);
+    }
+  }
+  p.syncComment(PUP::sync_end_array);
+}
+
 template <class container,class dtype>
 inline void PUP_stl_container(PUP::er &p,container &c) {
   p.syncComment(PUP::sync_begin_array);
@@ -127,6 +159,7 @@ inline void PUP_stl_container(PUP::er &p,container &c) {
   else PUP_stl_container_items<container, dtype>(p,c);
   p.syncComment(PUP::sync_end_array);
 }
+
 //Map objects don't have a "push_back", while vector and list
 //  don't have an "insert", so PUP_stl_map isn't PUP_stl_container
 template <class container,class dtype>
@@ -145,9 +178,14 @@ inline void PUP_stl_map(PUP::er &p,container &c) {
   p.syncComment(PUP::sync_end_list);
 }
 
+//Pup a std::vector<double> through the double-specific, single-parameter
+//  PUP_stl_container. This is deliberately a plain (non-template)
+//  overload: with a "template <class T>" header whose T appears nowhere
+//  in the parameter list, T can never be deduced, so "p|v" would always
+//  pick the generic std::vector<T> operator and never reach this one.
+//  "> >" is spelled with a space so the line also parses before C++11.
+inline void operator|(PUP::er &p,std::vector<double> &v)
+  { PUP_stl_container<std::vector<double> >(p,v); }
+
 template <class T> 
 inline void operator|(PUP::er &p,typename std::vector<T> &v)
   { PUP_stl_container<std::vector<T>,T>(p,v); }
+
 template <class T> 
 inline void operator|(PUP::er &p,typename std::list<T> &v)
   { PUP_stl_container<std::list<T>,T>(p,v); }
index 39c8416f44c3e5952e77e0b25dc8a50d665f502f..571e440d3eb4a1e3ffcdd9952872b1303718ef50 100644 (file)
@@ -149,12 +149,13 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
         case Tdouble:
           {
             double * p1;
-            //double * p2;
+            double * p2;
             p1 = (double*)p;
-            //p2 = new double[n/itemSize];
-            //memcpy((char *)p2,(const void *)buf,n); 
+            p2 = new double[n/itemSize];
+            memcpy((char *)p2,(const void *)buf,n); 
             for(int i=0;i<n/itemSize;i++){
-              if(fabs(p1[i]-*(double *)((char *)buf+i*itemSize))>accuracy){
+              //if(fabs(p1[i]-*((double *)buf+i))>accuracy){
+              if(fabs(p1[i]-p2[i])>accuracy){
     //            if(result){
                   //printf("found incorrect double %e %e diff %e\n",p1[i],p2[i],(p1[i]-p2[i]));
     //            }     
@@ -162,43 +163,45 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
            //     fault_bytes++;
               }
             }
-            //delete p2;
+            delete p2;
           }    
           break;       
         case Tint:
           {
             int * p1;
-            //int * p2;
+            int * p2;
             p1 = (int *)p;
-            //p2 = new int[n/itemSize];
-           // memcpy((char *)p2,(const void *)buf,n); 
+            p2 = new int[n/itemSize];
+            memcpy((char *)p2,(const void *)buf,n); 
             for(int i=0;i<n/itemSize;i++){
-              if(fabs(p1[i]-*(int *)((char *)buf+i*itemSize))>accuracy){
+              //if(fabs(p1[i]-*(int *)((char *)buf+i*itemSize))>accuracy){
+              if(fabs(p1[i]-p2[i])>accuracy){
   //              if(result)
   //                printf("found incorrect int %d %d at %d total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
          //       fault_bytes++;
               }
             }
-            //delete p2;
+            delete p2;
           }
           break;
         case Tchar:
           {
             char * p1;
-            //char * p2;
+            char * p2;
             p1 = (char *)p;
-            //p2 = new char[n/itemSize];
-            //memcpy((char *)p2,(const void *)buf,n); 
+            p2 = new char[n/itemSize];
+            memcpy((char *)p2,(const void *)buf,n); 
             for(int i=0;i<n/itemSize;i++){
-              if(fabs(p1[i]-*(char *)((char *)buf+i*itemSize))>accuracy){
+              //if(fabs(p1[i]-*(char *)((char *)buf+i*itemSize))>accuracy){
+              if(fabs(p1[i]-p2[i])>accuracy){
    //             if(result)
    //               printf("found incorrect char %d %d at %d, total %d\n",p1[i],p2[i],i,n/itemSize);
                 result = result && false;
         //        fault_bytes++;
               }
             }
-            //delete p2;
+            delete p2;
           }
           break;
         default: