recover from soft failure
authorXiang Ni <xiangni2@illinois.edu>
Sun, 9 Dec 2012 05:16:28 +0000 (23:16 -0600)
committerXiang Ni <xiangni2@illinois.edu>
Sun, 9 Dec 2012 05:16:28 +0000 (23:16 -0600)
src/ck-core/ckarray.C
src/ck-core/ckcheckpoint.C
src/ck-core/ckcheckpoint.h
src/ck-core/cklocation.C
src/ck-core/cklocation.h
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h
src/ck-core/ckreduction.C

index 3d929e4db1faff9a9b6591956d26e4c5ce20d1da..a659bd102dbf23621dfe0b1272fed8b8c7cfb79e 100644 (file)
@@ -86,7 +86,7 @@ bool _isNotifyChildInRed;
 #   define DEBM(X) /*CkPrintf x*/
 #   define DEBL(X) /*CkPrintf x*/
 #   define DEBK(x) /*CkPrintf x*/
-#   define DEBB(x) /*CkPrintf x*/
+#   define DEBB(x) //CkPrintf x
 #   define str(x) /**/
 #   define DEBUG(x)
 #endif
@@ -1310,11 +1310,12 @@ void CProxy_ArrayBase::ckBroadcast(CkArrayMessage *msg, int ep, int opts) const
                 CProxy_CkArray ap(_aid);
                 ap[CpvAccess(serializer)].sendBroadcast(msg);
                 CkGroupID _id = _aid;
-//              printf("[%d] At ckBroadcast in CProxy_ArrayBase id %d epidx %d \n",CkMyPe(),_id.idx,ep);
+                      printf("[%d] At ckBroadcast in CProxy_ArrayBase id %d epidx %d \n",CkMyPe(),_id.idx,ep);
 #else
          if (CkMyPe()==CpvAccess(serializer))
          {
                DEBB((AA"Sending array broadcast\n"AB));
+               printf("Sending array broadcast\n");
                if (skipsched)
                        CProxy_CkArray(_aid).recvExpeditedBroadcast(msg);
                else
index 97be7d3b29910863308d592c381ba77184003270..0bc2844f9dce4644ff27bccb3a5c2d0a7183989e 100644 (file)
@@ -127,11 +127,7 @@ void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
        FILE* fGroups = CmiFopen(fileName,"wb");
        if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
        PUP::toDisk pGroups(fGroups);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
     CkPupGroupData(pGroups,CmiTrue);
-#else
-    CkPupGroupData(pGroups);
-#endif
        CmiFclose(fGroups);
 
        // save nodegroups into NodeGroups.dat
@@ -142,11 +138,7 @@ void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
          if(!fNodeGroups) 
            CkAbort("Failed to create checkpoint file for nodegroup table!");
          PUP::toDisk pNodeGroups(fNodeGroups);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
       CkPupNodeGroupData(pNodeGroups,CmiTrue);
-#else
-      CkPupNodeGroupData(pNodeGroups);
-#endif
          CmiFclose(fNodeGroups);
        }
 
@@ -275,8 +267,6 @@ void CkPupChareData(PUP::er &p)
 }
 #endif
 
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
-// handle GroupTable and data
 void CkPupGroupData(PUP::er &p, CmiBool create)
 {
        int numGroups, i;
@@ -301,7 +291,6 @@ void CkPupGroupData(PUP::er &p, CmiBool create)
                tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
                tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
                strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
-               //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
 
                if(tmpInfo[i].MigCtor==-1) {
                        char buf[512];
@@ -329,10 +318,11 @@ void CkPupGroupData(PUP::er &p, CmiBool create)
          }   // end of unPacking
          IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
          // if using migration constructor, you'd better have a pup
+#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
                if(!create)
                        gobj->mlogData->teamRecoveryFlag = 1;
+#endif
           gobj->pup(p);
-         // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
        }
        delete [] tmpInfo;
 }
@@ -383,116 +373,6 @@ void CkPupNodeGroupData(PUP::er &p, CmiBool create)
        }
        delete [] tmpInfo;
 }
-#else
-// handle GroupTable and data
-void CkPupGroupData(PUP::er &p)
-{
-       int numGroups, i;
-
-       if (!p.isUnpacking()) {
-         numGroups = CkpvAccess(_groupIDTable)->size();
-       }
-       p|numGroups;
-       if (p.isUnpacking()) {
-         if(CkMyPe()==0)  
-            CkpvAccess(_numGroups) = numGroups+1; 
-          else 
-           CkpvAccess(_numGroups) = 1;
-       }
-       DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
-
-       GroupInfo *tmpInfo = new GroupInfo [numGroups];
-       if (!p.isUnpacking()) {
-         for(i=0;i<numGroups;i++) {
-               tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
-               TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
-               tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
-               tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
-               strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
-               DEBCHK("[%d] CkPupGroupData: %s group %s \n",
-                       CkMyPe(), p.typeString(), tmpInfo[i].name);
-
-               if(tmpInfo[i].MigCtor==-1) {
-                       char buf[512];
-                       sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
-                       CkAbort(buf);
-               }
-         }
-       }
-       for (i=0; i<numGroups; i++) p|tmpInfo[i];
-
-       for(i=0;i<numGroups;i++) 
-       {
-         CkGroupID gID = tmpInfo[i].gID;
-         if (p.isUnpacking()) {
-           //CkpvAccess(_groupIDTable)->push_back(gID);
-           int eIdx = tmpInfo[i].MigCtor;
-           // error checking
-           if (eIdx == -1) {
-             CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
-           }
-           void *m = CkAllocSysMsg();
-           envelope* env = UsrToEnv((CkMessage *)m);
-           CkCreateLocalGroup(gID, eIdx, env);
-         }   // end of unPacking
-         IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
-         // if using migration constructor, you'd better have a pup
-          gobj->pup(p);
-          DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
-                       gobj->ckGetGroupID().idx, tmpInfo[i].name);
-       }
-       delete [] tmpInfo;
-}
-
-// handle NodeGroupTable and data
-void CkPupNodeGroupData(PUP::er &p)
-{
-       int numNodeGroups, i;
-       if (!p.isUnpacking()) {
-         numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
-       }
-       p|numNodeGroups;
-       if (p.isUnpacking()) {
-         if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
-         else { CksvAccess(_numNodeGroups) = 1; }
-       }
-       DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
-
-       GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
-       if (!p.isUnpacking()) {
-         for(i=0;i<numNodeGroups;i++) {
-               tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
-               TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
-               tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
-               if(tmpInfo[i].MigCtor==-1) {
-                       char buf[512];
-                       sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
-                                    declared as [migratable] in .ci to be able to checkpoint.",\
-                                    _chareTable[ent2.getcIdx()]->name);
-                       CkAbort(buf);
-               }
-         }
-       }
-       for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
-       for (i=0;i<numNodeGroups;i++) {
-               CkGroupID gID = tmpInfo[i].gID;
-               if (p.isUnpacking()) {
-                       //CksvAccess(_nodeGroupIDTable).push_back(gID);
-                       int eIdx = tmpInfo[i].MigCtor;
-                       void *m = CkAllocSysMsg();
-                       envelope* env = UsrToEnv((CkMessage *)m);
-                       CkCreateLocalNodeGroup(gID, eIdx, env);
-               }
-               TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
-               IrrGroup *obj = ent2.getObj();
-               obj->pup(p);
-               DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
-                       obj->ckGetGroupID().idx,
-                       _chareTable[ent2.getcIdx()]->name);
-       }
-       delete [] tmpInfo;
-}
-#endif
 
 // handle chare array elements for this processor
 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
@@ -569,19 +449,11 @@ void CkPupProcessorData(PUP::er &p)
     CkPupChareData(p);
 
     // save groups 
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
-    CkPupGroupData(p,CmiTrue);
-#else
-    CkPupGroupData(p);
-#endif
+       CkPupGroupData(p,CmiTrue);
 
     // save nodegroups
     if(CkMyRank()==0) {
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
         CkPupNodeGroupData(p,CmiTrue); 
-#else
-        CkPupNodeGroupData(p);
-#endif
     }
 
     // pup array elements
@@ -718,11 +590,7 @@ void CkRestartMain(const char* dirname, CkArgMsg *args){
        FILE* fGroups = CmiFopen(filename,"rb");
        if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
        PUP::fromDisk pGroups(fGroups);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
     CkPupGroupData(pGroups,CmiTrue);
-#else
-    CkPupGroupData(pGroups);
-#endif
        CmiFclose(fGroups);
 
        // restore nodegroups
@@ -735,11 +603,7 @@ void CkRestartMain(const char* dirname, CkArgMsg *args){
                FILE* fNodeGroups = CmiFopen(filename,"rb");
                if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
                PUP::fromDisk pNodeGroups(fNodeGroups);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
         CkPupNodeGroupData(pNodeGroups,CmiTrue);
-#else
-        CkPupNodeGroupData(pNodeGroups);
-#endif
                CmiFclose(fNodeGroups);
        }
 
index 6f210777d8738e227d5e735d6c353f45fac1e267..0af58de89b19af8b5aecd0b1c0d38da26a430f0b 100644 (file)
@@ -60,13 +60,8 @@ public:
 void CkPupROData(PUP::er &p);
 void CkPupMainChareData(PUP::er &p, CkArgMsg *args);
 void CkPupChareData(PUP::er &p);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
 void CkPupGroupData(PUP::er &p,CmiBool create=CmiTrue);
 void CkPupNodeGroupData(PUP::er &p,CmiBool create=CmiTrue);
-#else
-void CkPupGroupData(PUP::er &p);
-void CkPupNodeGroupData(PUP::er &p);
-#endif
 void CkPupArrayElementsData(PUP::er &p, int notifyListeners=1);
 void CkPupProcessorData(PUP::er &p);
 void CkRemoveArrayElements();
index b0211262f3e6e2ab1fab0e87d10738fac1e93631..06ad7c5aeea67ee995316d4f7750ad288aaa264b 100644 (file)
@@ -2325,12 +2325,12 @@ CkLocRec_local *CkLocMgr::createLocal(const CkArrayIndex &idx,
                CmiBool forMigration, CmiBool ignoreArrival,
                CmiBool notifyHome)
 {
+       //CkPrintf("Adding new record for element %s\n",idx2str(idx));
        int localIdx=nextFree();
        DEBC((AA"Adding new record for element %s at local index %d\n"AB,idx2str(idx),localIdx));
+       //CkPrintf("Adding new record for element %s at local index %d\n",idx2str(idx),localIdx);
        CkLocRec_local *rec=new CkLocRec_local(this,forMigration,ignoreArrival,idx,localIdx);
        insertRec(rec,idx); //Add to global hashtable
-
-
        if (notifyHome) informHome(idx,CkMyPe());
        return rec;
 }
@@ -2723,7 +2723,6 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
     register ManagerRec *m;
     int localIdx=rec->getLocalIndex();
     CkVec<CkMigratable *> dummyElts;
-
     for (m=firstManager;m!=NULL;m=m->next) {
         int elCType;
         if (!p.isUnpacking())
@@ -2736,8 +2735,8 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
         if (p.isUnpacking() && elCType!=-1) {
             CkMigratable *elt=m->mgr->allocateMigrated(elCType,rec->getIndex(),type);
             int migCtorIdx=_chareTable[elCType]->getMigCtor();
-                if(!dummy){
-                       if(create)
+                       if(!dummy){
+                               if(create)
                                if (!addElementToRec(rec,m,elt,migCtorIdx,NULL)) return;
                                }else{
                     CkMigratable_initInfo &i=CkpvAccess(mig_initInfo);
@@ -2747,7 +2746,7 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
                     if (!rec->invokeEntry(elt,NULL,migCtorIdx,CmiTrue)) return ;
                 }
         }
-    }
+       }
     if(!dummy){
         for (m=firstManager;m!=NULL;m=m->next) {
             CkMigratable *elt=m->element(localIdx);
@@ -2773,7 +2772,7 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
 }
 #else
 void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
-               CkElementCreation_t type,CmiBool rebuild)
+               CkElementCreation_t type,CmiBool rebuild,CmiBool create)
 {
        p.comment("-------- Array Location --------");
        register ManagerRec *m;
@@ -2795,7 +2794,8 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
                        CkMigratable *elt=m->mgr->allocateMigrated(elCType,rec->getIndex(),type);
                        int migCtorIdx=_chareTable[elCType]->getMigCtor();
                        //Insert into our tables and call migration constructor
-                       if (!addElementToRec(rec,m,elt,migCtorIdx,NULL)) return;
+                       if(create)
+                               if (!addElementToRec(rec,m,elt,migCtorIdx,NULL)) return;
                }
        }
        //Next pup the element data
@@ -2809,6 +2809,7 @@ void CkLocMgr::pupElementsFor(PUP::er &p,CkLocRec_local *rec,
 #endif
                 }
        }
+
 #if CMK_MEM_CHECKPOINT
        if(rebuild){
          ArrayElement *elt;
@@ -3089,13 +3090,23 @@ void CkLocMgr::resume(const CkArrayIndex &idx, PUP::er &p, CmiBool create, int d
     }
 }
 #else
-void CkLocMgr::resume(const CkArrayIndex &idx, PUP::er &p, CmiBool notify,CmiBool rebuild)
+void CkLocMgr::resume(const CkArrayIndex &idx, PUP::er &p, CmiBool notify,CmiBool rebuild,CmiBool create)
 {
-       CkLocRec_local *rec=createLocal(idx,CmiFalse,CmiFalse,notify /* home doesn't know yet */ );
-
-       //Create the new elements as we unpack the message
-       pupElementsFor(p,rec,CkElementCreation_resume,rebuild);
+       CkLocRec_local *rec;
+       CkLocRec *recGlobal;    
 
+       if(create){
+               //Create the new elements as we unpack the message
+               rec=createLocal(idx,CmiFalse,CmiFalse,notify /* home doesn't know yet */ );
+       }else{
+               recGlobal = elementNrec(idx);
+               if(recGlobal == NULL) 
+                       CmiAbort("Local object not found");
+               if(recGlobal->type() != CkLocRec::local)
+                       CmiAbort("Local object not local, :P");
+               rec = (CkLocRec_local *)recGlobal;
+       }
+       pupElementsFor(p,rec,CkElementCreation_resume,rebuild,create);
        callMethod(rec,&CkMigratable::ckJustMigrated);
 }
 #endif
index c80995a12014d663c9d666e3bcb26a1e94ab2c96..43baba7fe0d8ee86fc9bf65b2d0902856ac0b6ba 100644 (file)
@@ -655,7 +655,7 @@ public:
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
        void resume(const CkArrayIndex &idx, PUP::er &p, CmiBool create, int dummy=0);
 #else
-       void resume(const CkArrayIndex &idx, PUP::er &p, CmiBool notify=CmiTrue,CmiBool=CmiFalse);
+       void resume(const CkArrayIndex &idx, PUP::er &p, CmiBool notify=CmiTrue,CmiBool rebuild=CmiFalse,CmiBool create = CmiTrue);
 #endif
 
 //Communication:
@@ -697,9 +697,8 @@ private:
         CkElementCreation_t type, CmiBool create=CmiTrue, int dummy=0);
 #else
        void pupElementsFor(PUP::er &p,CkLocRec_local *rec,
-               CkElementCreation_t type,CmiBool rebuild = CmiFalse);
+        CkElementCreation_t type,CmiBool rebuild = CmiFalse, CmiBool create=CmiTrue);
 #endif
-
        /// Call this member function on each element of this location:
        typedef void (CkMigratable::* CkMigratable_voidfn_t)(void);
 
index c390fa45a6d3b938844259f534463aa9be3d2d95..f6e24567f70d4faf04d35be087440985bad6df9a 100644 (file)
@@ -115,6 +115,7 @@ double killTime=0.0;
 /// checkpoint buffer for processor system data, remove static to make icpc 10.1 pass with -O
 CpvDeclare(CkProcCheckPTMessage*, procChkptBuf);
 CpvDeclare(CkCheckPTMessage**, chkpBuf);
+CpvDeclare(CkCheckPTMessage**, localProcChkpBuf);
 //store the checkpoint of the buddy to compare
 //do not need the whole msg, can be the checksum
 CpvDeclare(CkCheckPTMessage*, buddyBuf);
@@ -124,7 +125,7 @@ CpvDeclare(int, recvdRemote);
 CpvDeclare(int, recvdLocal);
 
 bool compare(char * buf1, char * buf2);
-static inline void _handleProcData(PUP::er &p);
+static inline void _handleProcData(PUP::er &p,CmiBool create= CmiTrue);
 // Converse function handles
 static int askPhaseHandlerIdx;
 static int recvPhaseHandlerIdx;
@@ -497,6 +498,11 @@ void CkMemCheckPT::doItNow(int starter, CkCallback &cb)
     startTime = CmiWallTimer();
     CkPrintf("[%d] Start checkpointing  starter: %d... \n", CkMyPe(), cpStarter);
   }
+#if CMK_CONVERSE_MPI
+  if(CmiNumPartition()==1)
+#endif   
+  {
+
 #if !CMK_CHKP_ALL
   int len = ckTable.length();
   for (int i=0; i<len; i++) {
@@ -513,11 +519,17 @@ void CkMemCheckPT::doItNow(int starter, CkCallback &cb)
     // if my table is empty, then I am done
   if (len == 0) contribute(CkCallback(CkReductionTarget(CkMemCheckPT, cpFinish), thisProxy[cpStarter]));
 #else
-  startCheckpoint();
-  //startArrayCheckpoint();
+  startArrayCheckpoint();
+#endif
+       sendProcData();
+  }
+#if CMK_CONVERSE_MPI
+  else
+  {
+       startCheckpoint();
+  }
 #endif
   // pack and send proc level data
-  //sendProcData();
 }
 
 class MemElementPacker : public CkLocIterator{
@@ -529,9 +541,6 @@ class MemElementPacker : public CkLocIterator{
                void addLocation(CkLocation &loc){
                        CkArrayIndexMax idx = loc.getIndex();
                        CkGroupID gID = locMgr->ckGetGroupID();
-                       ArrayElement *elt = (ArrayElement *)loc.getLocalRecord();
-                       CmiAssert(elt);
-                       //elt = (ArrayElement *)locMgr->lookup(idx, aid);
                        p|gID;
                        p|idx;
                        locMgr->pupElementsFor(p,loc.getLocalRecord(),CkElementCreation_migrate);
@@ -580,28 +589,39 @@ void CkMemCheckPT::startArrayCheckpoint(){
 }
 
 void CkMemCheckPT::startCheckpoint(){
-#if CMK_CHKP_ALL
-       int size;
+#if CMK_CONVERSE_MPI
+  int size;
+  {
+    PUP::sizer p;
+    _handleProcData(p,CmiFalse);
+    size = p.size();
+  }
+       int packSize = size/sizeof(double)+1;
+       CkCheckPTMessage * procMsg = new (packSize,0) CkCheckPTMessage;
+       procMsg->len = size;
+       procMsg->cp_flag = 1;
+       {
+               PUP::toMem p(procMsg->packData);
+               _handleProcData(p,CmiFalse);
+       }
+       int pointer = CpvAccess(curPointer);
+       if(CpvAccess(localProcChkpBuf)[pointer]) delete CpvAccess(localProcChkpBuf)[pointer];
+               CpvAccess(localProcChkpBuf)[pointer] = procMsg;
+       
        {
                PUP::sizer psizer;
                pupAllElements(psizer);
-               _handleProcData(psizer);
                size = psizer.size();
        }
-       int packSize = size/sizeof(double)+1;
- // CkPrintf("[%d]checkpoint size :%d\n",CkMyPe(),packSize);
+       packSize = size/sizeof(double)+1;
        CkCheckPTMessage * msg = new (packSize,0) CkCheckPTMessage;
        msg->len = size;
        msg->cp_flag = 1;
-       int budPEs[2];
-       //msg->bud1=CkMyPe();
-       //msg->bud2=ChkptOnPe(CkMyPe());
        {
                PUP::toMem p(msg->packData);
                pupAllElements(p);
-               _handleProcData(p);
        }
-       int pointer = CpvAccess(curPointer);
+       pointer = CpvAccess(curPointer);
        if(CpvAccess(chkpBuf)[pointer]) delete CpvAccess(chkpBuf)[pointer];
                CpvAccess(chkpBuf)[pointer] = (CkCheckPTMessage *)CkCopyMsg((void **)&msg);
        CpvAccess(recvdLocal) = 1;
@@ -610,7 +630,6 @@ void CkMemCheckPT::startCheckpoint(){
        CkPackMessage(&env);
        CmiSetHandler(env,recvRemoteChkpHandlerIdx);
        CmiRemoteSyncSendAndFree(CkMyPe(),CmiMyPartition()^1,env->getTotalsize(),(char *)env);
-       
        if(CpvAccess(recvdRemote)==1){
                //compare the checkpoint 
          int size = CpvAccess(chkpBuf)[pointer]->len;
@@ -624,19 +643,60 @@ void CkMemCheckPT::startCheckpoint(){
 }
 
 void CkMemCheckPT::doneComparison(bool ret){
-       CkCallback cb(CkReductionTarget(CkMemCheckPT,doneRComparison),thisProxy[0]);
+       CkCallback cb(CkReductionTarget(CkMemCheckPT,doneRComparison),thisProxy);
        contribute(sizeof(bool),&ret,CkReduction::logical_and,cb);
 }
 
 void CkMemCheckPT::doneRComparison(bool ret){
+       CpvAccess(recvdRemote) = 0;
+       CpvAccess(recvdLocal) = 0;
+//     if(CpvAccess(curPointer) == 0){
        if(ret==true){
-       CpvAccess(recvdRemote) = 0;
-       CpvAccess(recvdLocal) = 0;
        CpvAccess(curPointer)^=1;
-               cpCallback.send();
+               if(CkMyPe() == 0){
+                       cpCallback.send();
+               }
        }else{
+               RollBack();
        }
+}
 
+void CkMemCheckPT::RollBack(){
+       //restore group data
+       CkMemCheckPT::inRestarting = 1;
+       int pointer = CpvAccess(curPointer)^1;//use the previous one
+    CkCheckPTMessage* chkpMsg = CpvAccess(chkpBuf)[pointer];
+       PUP::fromMem p(chkpMsg->packData);      
+       //_handleProcData(p);
+       
+       //destroy array elements
+       //CKLOCMGR_LOOP(mgr->flushLocalRecs(););
+         int numGroups = CkpvAccess(_groupIDTable)->size();
+         for(int i=0;i<numGroups;i++) {
+               CkGroupID gID = (*CkpvAccess(_groupIDTable))[i];
+               IrrGroup *obj = CkpvAccess(_groupTable)->find(gID).getObj();
+               obj->flushStates();
+               obj->ckJustMigrated();
+         }
+       //restore array elements
+       
+       int numElements;
+       p|numElements;
+       
+       if(p.isUnpacking()){
+               for(int i=0;i<numElements;i++){
+               //for(int i=0;i<1;i++){
+                       CkGroupID gID;
+                       CkArrayIndex idx;
+                       p|gID;
+                       p|idx;
+                       CkLocMgr * mgr = (CkLocMgr *)CkpvAccess(_groupTable)->find(gID).getObj();
+                       CmiAssert(mgr);
+                       mgr->resume(idx,p,CmiFalse,CmiTrue,CmiFalse);
+               }
+       }
+       CkCallback cb(CkReductionTarget(CkMemCheckPT,recoverFromSoftFailure),thisProxy);
+       contribute(cb);
 }
 
 void CkMemCheckPT::recvArrayCheckpoint(CkArrayCheckPTMessage *msg)
@@ -671,7 +731,7 @@ void CkMemCheckPT::recvArrayCheckpoint(CkArrayCheckPTMessage *msg)
 }
 
 // don't handle array elements
-static inline void _handleProcData(PUP::er &p)
+static inline void _handleProcData(PUP::er &p, CmiBool create)
 {
     // save readonlys, and callback BTW
     CkPupROData(p);
@@ -685,10 +745,10 @@ static inline void _handleProcData(PUP::er &p)
 #endif
 
     // save groups into Groups.dat
-    CkPupGroupData(p);
+    CkPupGroupData(p,create);
 
     // save nodegroups into NodeGroups.dat
-    if(CkMyRank()==0) CkPupNodeGroupData(p);
+    if(CkMyRank()==0) CkPupNodeGroupData(p,create);
 }
 
 void CkMemCheckPT::sendProcData()
@@ -697,7 +757,7 @@ void CkMemCheckPT::sendProcData()
   int size;
   {
     PUP::sizer p;
-    _handleProcData(p);
+    _handleProcData(p,CmiTrue);
     size = p.size();
   }
   int packSize = size;
@@ -705,7 +765,7 @@ void CkMemCheckPT::sendProcData()
   DEBUGF("[%d] CkMemCheckPT::sendProcData - size: %d to %d\n", CkMyPe(), size, ChkptOnPe(CkMyPe()));
   {
     PUP::toMem p(msg->packData);
-    _handleProcData(p);
+    _handleProcData(p,CmiTrue);
   }
   msg->pe = CkMyPe();
   msg->len = size;
@@ -1160,6 +1220,7 @@ void CkMemCheckPT::recoverAll(CkArrayCheckPTMessage * msg,CkVec<CkGroupID> * gma
        PUP::fromMem p(msg->packData);
        int numElements;
        p|numElements;
+       CkPrintf("[%d] recover all %d\n",CkMyPe(),numElements);
        if(p.isUnpacking()){
                for(int i=0;i<numElements;i++){
                        CkGroupID gID;
@@ -1217,6 +1278,12 @@ void CkMemCheckPT::finishUp()
 #endif
 }
 
+void CkMemCheckPT::recoverFromSoftFailure()
+{
+       inRestarting = 0;
+       if(CkMyPe()==0)
+       cpCallback.send();
+}
 // called only on 0
 void CkMemCheckPT::quiescence(CkCallback &cb)
 {
@@ -1395,7 +1462,7 @@ static void recoverProcDataHandler(char *msg)
 //   int temp = cur_restart_phase;
 //   cur_restart_phase = -1;
    PUP::fromMem p(procMsg->packData);
-   _handleProcData(p);
+   _handleProcData(p,CmiTrue);
 
    CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->resetLB(CkMyPe());
    // gzheng
@@ -1703,13 +1770,17 @@ void CkRegisterRestartHandler( )
 
   CpvInitialize(CkProcCheckPTMessage *, procChkptBuf);
   CpvInitialize(CkCheckPTMessage **, chkpBuf);
+  CpvInitialize(CkCheckPTMessage **, localProcChkpBuf);
   CpvInitialize(CkCheckPTMessage *, buddyBuf);
   CpvInitialize(int,curPointer);
   CpvAccess(procChkptBuf) = NULL;
   CpvAccess(buddyBuf) = NULL;
   CpvAccess(chkpBuf) = new CkCheckPTMessage *[2];
+  CpvAccess(localProcChkpBuf) = new CkCheckPTMessage *[2];
   CpvAccess(chkpBuf)[0] = NULL;
   CpvAccess(chkpBuf)[1] = NULL;
+  CpvAccess(localProcChkpBuf)[0] = NULL;
+  CpvAccess(localProcChkpBuf)[1] = NULL;
   CpvAccess(curPointer) = 0;
   CpvAccess(recvdLocal) = 0;
   CpvAccess(recvdRemote) = 0;
index 6b083e3181e8f4c32b260ee77fabce971af249c0..a7214aa8fb31fac2e79f65a9b8dc8aac007522a4 100644 (file)
@@ -7,6 +7,9 @@ module CkMemCheckpoint {
   message CkArrayCheckPTMessage {
         double packData[];
   };    
+  message CkCheckPTMessage {
+        double packData[];
+  };    
   message CkProcCheckPTMessage {
         char packData[];
   };    
@@ -33,7 +36,9 @@ module CkMemCheckpoint {
        entry [reductiontarget] void recoverArrayElements();
        entry [reductiontarget] void finishUp();
        entry [reductiontarget] void doneRComparison(bool);
+       entry [reductiontarget] void recoverFromSoftFailure();
        entry void doneComparison(bool);
+       entry void RollBack();
        entry void gotReply();
        entry void quiescence(CkCallback&);
         entry void inmem_restore(CkArrayCheckPTMessage *m);
index 7774f77bcd28bb90b7d8b52393c3ae88bb2920a7..496146a364c6dec9843d1b1594ce03bc7c99b12e 100644 (file)
@@ -21,7 +21,7 @@ public:
        int cp_flag;          // 1: from checkpoint 0: from recover
 };
 
-class CkCheckPTMessage: public CMessage_CkArrayCheckPTMessage {
+class CkCheckPTMessage: public CMessage_CkCheckPTMessage {
 public:
        double *packData;
        int bud1, bud2;
@@ -97,6 +97,8 @@ public:
   void startCheckpoint();
   void doneComparison(bool);
   void doneRComparison(bool);
+  void RollBack();
+  void recoverFromSoftFailure();
 public:
   static CkCallback  cpCallback;
 
index f8bc45e63e5252547f820fd992e894831512ef53..4ad1f65fec66d43254f31374f45b2483b58e62f5 100644 (file)
@@ -70,8 +70,8 @@ waits for the migrant contributions to straggle in.
 #define DEBREVAC(x) CkPrintf x
 #else
 //No debugging info-- empty defines
-#define DEBR(x) // CkPrintf x
-#define DEBRMLOG(x) CkPrintf x
+#define DEBR(x) //CkPrintf x
+#define DEBRMLOG(x) //CkPrintf x
 #define AA
 #define AB
 #define DEBN(x) //CkPrintf x