minor
author Xiang Ni <xiangni2@illinois.edu>
Wed, 26 Dec 2012 17:03:18 +0000 (11:03 -0600)
committer Xiang Ni <xiangni2@illinois.edu>
Wed, 26 Dec 2012 17:03:18 +0000 (11:03 -0600)
src/ck-core/ckmemcheckpoint.C
src/ck-ldb/LBDBManager.C
src/util/pup_util.C

index f0b58bdc3502783100ee7b6589ae0bee0e80b13b..7ee9c0a0b5a5a439382ae844a997ade743bdb4c2 100644 (file)
@@ -691,10 +691,9 @@ void CkMemCheckPT::startCheckpoint(){
                pupAllElements(p);
        }
        pointer = CpvAccess(curPointer);
-       if(CkMyPe() == CpvAccess(_remoteCrashedNode))
-    CkPrintf("start checkpointing!!!!\n");
        if(CpvAccess(chkpBuf)[pointer]) delete CpvAccess(chkpBuf)[pointer];
                CpvAccess(chkpBuf)[pointer] = msg;
+       CkPrintf("[%d][%d] local checkpoint done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
        if(CkReplicaAlive()==1){
                CpvAccess(recvdLocal) = 1;
                envelope * env = (envelope *)(UsrToEnv((CkCheckPTMessage *)CkCopyMsg((void **)&msg)));
@@ -711,6 +710,8 @@ void CkMemCheckPT::startCheckpoint(){
                  CkPrintf("[%d][%d] failed the test\n",CmiMyPartition(),CkMyPe());
                thisProxy[CkMyPe()].doneComparison(false);
          }
+               CkPrintf("[%d][%d] comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
+         delete CpvAccess(buddyBuf);
   }
        else{
                if(CkReplicaAlive()==0){//TODO add flag if sent already but the replica hasn't recovered when the next checkpoint
@@ -750,7 +751,6 @@ void CkMemCheckPT::doneComparison(bool ret){
        }else{
                _ret = 1;
        }
-       inCheckpointing = 0;
        CkCallback cb(CkReductionTarget(CkMemCheckPT,doneRComparison),thisProxy);
        contribute(sizeof(int),&_ret,CkReduction::sum_int,cb);
 }
@@ -761,6 +761,7 @@ void CkMemCheckPT::doneRComparison(int ret){
 //     if(CpvAccess(curPointer) == 0){
        if(ret==CkNumPes()){
        CpvAccess(curPointer)^=1;
+               inCheckpointing = 0;
                if(CkMyPe() == 0){
                CkPrintf("[%d][%d] Checkpoint finished in %f seconds, sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime);
                }
@@ -1560,14 +1561,11 @@ static void restartBcastHandler(char *msg)
 extern void _initDone();
 
 bool compare(char * buf1, char *buf2){
-       //buf1 my copy, buf2 from another one 
-//     CkPrintf("[%d][%d]compare buffer\n",CmiMyPartition(),CkMyPe());
        PUP::checker pchecker(buf1,buf2);
        pchecker.skip();
        
        int numElements;
        pchecker|numElements;
-//     CkPrintf("[%d][%d]numElements:%d\n",CmiMyPartition(),CkMyPe(),numElements);
        for(int i=0;i<numElements;i++){
        //for(int i=0;i<1;i++){
                CkGroupID gID;
@@ -1576,12 +1574,11 @@ bool compare(char * buf1, char *buf2){
                pchecker|gID;
                pchecker|idx;
                
-//             CkPrintf("[%d][%d]resume\n",CmiMyPartition(),CkMyPe());
                CkLocMgr * mgr = (CkLocMgr *)CkpvAccess(_groupTable)->find(gID).getObj();
                mgr->resume(idx,pchecker,CmiFalse,CmiFalse,CmiFalse);
-//             CkPrintf("------[%d][%d]finish element %d\n",CmiMyPartition(),CkMyPe(),i);
        }
        return pchecker.getResult();
+       //return true;
 }
 
 static void recvRemoteChkpHandler(char *msg){
@@ -1591,14 +1588,15 @@ static void recvRemoteChkpHandler(char *msg){
   if(CpvAccess(recvdLocal)==1){
          int pointer = CpvAccess(curPointer);
          int size = CpvAccess(chkpBuf)[pointer]->len;
-         CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
          if(chkpMsg->len == size && compare((char *)(CpvAccess(chkpBuf)[pointer]->packData),(char *)(chkpMsg->packData))){
-                       checkptMgr[CkMyPe()].doneComparison(true);
+         CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(true);
          }else
          {
                  CkPrintf("[%d][%d] failed the test\n",CmiMyPartition(),CkMyPe());
-                       checkptMgr[CkMyPe()].doneComparison(false);
+         CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(false);
          }
+         delete chkpMsg;
+         CmiPrintf("[%d][%d] comparison done at %lf\n",CmiMyPartition(),CkMyPe(),CmiWallTimer());
   }else{
          CpvAccess(recvdRemote) = 1;
          if(CpvAccess(buddyBuf)) delete CpvAccess(buddyBuf);
@@ -1611,6 +1609,8 @@ static void replicaRecoverHandler(char *msg){
        CkMemCheckPT::replicaAlive = 1;
     bool ret = true;
     CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(ret);
+       CmiFree(msg);
+       
 }
 
 static void replicaDieHandler(char * msg){
@@ -1625,10 +1625,8 @@ static void replicaDieHandler(char * msg){
     }
 #endif
        //broadcast to my partition to get local max iter
-    if(CkMyPe()==diePe){
-               CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
-               checkptMgr.getIter();
-       }
+   CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->getIter();
+       CmiFree(msg);
 }
 
 
@@ -1636,6 +1634,7 @@ static void replicaDieBcastHandler(char *msg){
        int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
        CpvAccess(_remoteCrashedNode) = diePe;
        CkMemCheckPT::replicaAlive = 0;
+       CmiFree(msg);
 }
 
 static void recoverRemoteProcDataHandler(char *msg){
@@ -1698,17 +1697,7 @@ static void recvPhaseHandler(char * msg)
 {
        CpvAccess(_curRestartPhase)--;
        CkMemCheckPT::inRestarting = 1;
-       //CmiPrintf("[%d] ---received phase %d\n",CkMyPe(),CpvAccess(_curRestartPhase));
-  // CkMemCheckPT *obj = CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch();
-  // if (CmiMyPe() == obj->BuddyPE(CpvAccess(_crashedNode)))  {
-  //    if(CmiMyPartition()==1&&CkMyPe()==2){
-//             CmiPrintf("start ping check handler\n");
-//      }
-       // CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
-  // }
-   //CcdCallOnCondition(CcdPERIODIC_100ms,(CcdVoidFn)pingBuddy,NULL);
-   //CcdCallOnCondition(CcdPERIODIC_5s,(CcdVoidFn)pingCheckHandler,NULL);
-
+       CmiFree(msg);
 }
 // called on crashed processor
 static void recoverProcDataHandler(char *msg)
@@ -1996,7 +1985,7 @@ void pingCheckHandler()
 {
 #if CMK_MEM_CHECKPOINT
   double now = CmiWallTimer();
-  if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
+  if (lastPingTime > 0 && now - lastPingTime > 3 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
   //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
     int i, pe, buddy;
     // tell everyone the buddy dies
@@ -2007,13 +1996,6 @@ void pingCheckHandler()
     }
     buddy = pe;
     CmiPrintf("[%d][%d] detected buddy processor %d died %f %f. \n",CmiMyPartition(), CmiMyPe(), buddy, now, lastPingTime);
-    /*for (int pe = 0; pe < CmiNumPes(); pe++) {
-      if (obj->isFailed(pe) || pe == buddy) continue;
-      char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
-      *(int *)(msg+CmiMsgHeaderSizeBytes) = buddy;
-      CmiSetHandler(msg, buddyDieHandlerIdx);
-      CmiSyncSendAndFree(pe, CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);
-    }*/
     char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
     *(int *)(msg+CmiMsgHeaderSizeBytes) = buddy;
     CmiSetHandler(msg, buddyDieHandlerIdx);
index 3dc6a82c27c42694a7ebcff17c6c755290a6187f..01d0be4d59e251323797868bfd337867e6c37fcf 100644 (file)
@@ -20,6 +20,7 @@ struct MigrateCB;
 //Called periodically-- starts next load balancing cycle
 void LBDB::batsyncer::gotoSync(void *bs)
 {
+       CkPrintf("[%d][%d] go to sync\n",CmiMyPartition(),CkMyPe());
   LBDB::batsyncer *s=(LBDB::batsyncer *)bs;
   s->db->AtLocalBarrier(s->BH);
 }
@@ -35,7 +36,7 @@ void LBDB::batsyncer::resumeFromSync(void *bs)
   s->nextT = curT + s->period;
 #endif
 
-  CcdCallFnAfterOnPE((CcdVoidFn)gotoSync, (void *)s, 1000*s->period, CkMyPe());
+  CcdCallFnAfterOnPE((CcdVoidFn)gotoSync, (void *)s, 10000000000*s->period, CkMyPe());
 }
 
 // initPeriod in seconds
index 22ea0ac14200a6eac2a07fe226a6f87d8cd93f38..cc75769261e6589104a8714dc968d53edbbc2289 100644 (file)
@@ -158,6 +158,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                                                        result = result && false;
                                                }
                                        }
+                                       delete [] p2;
                                }       
                                break;  
                        case Tint:
@@ -172,10 +173,8 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                                                                printf("found incorrect int %d %d\n",p1[i],p2[i]);
                                                        result = result && false;
                                                }
-                                               //printf("p1 %d\n",p1[i]);
-                                               //printf("p2 %d\n",p2[i]);
                                        }
-                                       //printf("p %d\n",*(int *)p);
+                                       delete [] p2;
                                }
                                break;
                        case Tchar:
@@ -190,10 +189,8 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                                                                printf("found incorrect char %c %c\n",p1[i],p2[i]);
                                                        result = result && false;
                                                }
-                                               //printf("p1 %d\n",p1[i]);
-                                               //printf("p2 %d\n",p2[i]);
                                        }
-                                       //printf("p %d\n",*(int *)p);
+                                       delete [] p2;
                                }
                                break;
                        default: