suppot for ampi
authorXiang Ni <xiangni2@illinois.edu>
Fri, 21 Dec 2012 04:14:35 +0000 (22:14 -0600)
committerXiang Ni <xiangni2@illinois.edu>
Fri, 21 Dec 2012 04:14:35 +0000 (22:14 -0600)
20 files changed:
src/arch/mpi/machine.c
src/ck-core/ckcheckpoint.C
src/ck-core/ckcheckpoint.h
src/ck-core/cklocation.C
src/ck-core/cklocation.h
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/ckmemcheckpoint.h
src/ck-core/mpi-interoperate.C
src/conv-core/isomalloc.c
src/libs/ck-libs/ampi/ampi.C
src/libs/ck-libs/ampi/ampi.h
src/libs/ck-libs/ampi/ampif.C
src/libs/ck-libs/ampi/ampiimpl.h
src/libs/ck-libs/tcharm/tcharm.C
src/libs/ck-libs/tcharm/tcharm.ci
src/libs/ck-libs/tcharm/tcharm_impl.h
src/libs/ck-libs/tcharm/tcharmc.h
src/libs/ck-libs/tcharm/tcharmf.h
src/util/pup_util.C

index cab2d52388a499b67413a40e5bf467b491615d24..c04604be98d3c9f8a2fde3f4d2371737b3d87cc0 100644 (file)
@@ -1366,7 +1366,16 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) {
     nextrank = num_workpes;
 
     if (*myNodeID >= num_workpes) {    /* is spare processor */
-      MPI_Status sts;
+      //if isomalloc_sync call mpi_barrier
+               if(CmiGetArgFlag(largv,"+isomalloc_sync")){
+                 CmiPrintf("wp before mpi barrier\n");
+          MPI_Barrier(charmComm);
+          MPI_Barrier(charmComm);
+          MPI_Barrier(charmComm);
+          MPI_Barrier(charmComm);
+                 CmiPrintf("wp after mpi barrier\n");
+         }
+         MPI_Status sts;
       int vals[2];
       MPI_Recv(vals,2,MPI_INT,MPI_ANY_SOURCE,FAIL_TAG, charmComm,&sts);
       int newpe = vals[0];
@@ -1397,6 +1406,10 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) {
                 i++;
       }
       restart_argv[i] = "+restartaftercrash";
+         if(CmiGetArgFlagDesc(largv,"+isomalloc_sync","synchronize isomalloc region globaly")){
+               i++;
+       restart_argv[i] = "+restartisomalloc";
+         }
       phase_str = (char*)malloc(10);
       sprintf(phase_str,"%d", CpvAccess(_curRestartPhase));
       restart_argv[i+1]=phase_str;
@@ -1848,9 +1861,10 @@ int CmiBarrier() {
          *  and END_EVENT are disabled here. -Chao Mei
          */
         /*START_EVENT();*/
-
+               CmiPrintf("before mpi_barrier\n");
         if (MPI_SUCCESS != MPI_Barrier(charmComm))
             CmiAbort("Timernit: MPI_Barrier failed!\n");
+               CmiPrintf("after mpi_barrier\n");
 
         /*END_EVENT(10);*/
     }
index 0bc2844f9dce4644ff27bccb3a5c2d0a7183989e..e0e84d50f6b6811994e919688f71e09fc74dec99 100644 (file)
@@ -431,6 +431,15 @@ int  CkCountArrayElements(){
   int numElements = counter.getCount();
     return numElements;
 }
+
+int  CkCountChkpSyncElements(){
+    int numGroups = CkpvAccess(_groupIDTable)->size();
+    int i;
+    ElementCounter  counter;
+    CKLOCMGR_LOOP(mgr->iterateChkpSync(counter););
+  int numElements = counter.getCount();
+    return numElements;
+}
 #endif
 
 void CkPupProcessorData(PUP::er &p)
index 0af58de89b19af8b5aecd0b1c0d38da26a430f0b..28b1113ee16b6f68371b82a35fe9f96fda582697 100644 (file)
@@ -71,6 +71,7 @@ void CkStartCheckpoint(const char* dirname,const CkCallback& cb);
 void CkRestartMain(const char* dirname, CkArgMsg *args);
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) || CMK_MEM_CHECKPOINT
 int  CkCountArrayElements();
+int  CkCountChkpSyncElements();
 #endif
 
 // some useful flags (for disk checkpointing)
index c9f5f1b112456805a6994d81cb0bd02e6077a077..4db992347e673c443c6799db187ff9c1c6251d60 100644 (file)
@@ -1050,6 +1050,7 @@ void CkMigratable::commonInit(void) {
        thisChareType=i.chareType;
        usesAtSync=CmiFalse;
        usesAutoMeasure=CmiTrue;
+       usesChkpAtSync = CmiFalse;
        barrierRegistered=CmiFalse;
 
   local_state = OFF;
@@ -1084,6 +1085,7 @@ void CkMigratable::pup(PUP::er &p) {
        p|thisIndexMax;
        p(usesAtSync);
   p(can_reset);
+    p(usesChkpAtSync);
        p(usesAutoMeasure);
 #if CMK_LBDB_ON 
        int readyMigrate;
@@ -1235,41 +1237,44 @@ void CkMigratable::setChkpResumeClient(CkCallback & _cb)
        nextChkpDecided = false;
        atsync_chkp_iter = -1;
        local_chkp_pause = false;
+       usesChkpAtSync = CmiTrue;
 }
 
 void CkMigratable::AtChkpSync()
 {
-       if(CmiNumPartition()==1){
-               chkp_cb.send();
-               return;
-       }
-       atsync_chkp_iter++;
-       myRec->getChkpMgr()->recvIter(atsync_chkp_iter);
-       //reduction to decide the current maximum
-       if(nextChkpDecided){
-               if(atsync_chkp_iter<nextChkpIter){
+       if(usesChkpAtSync){
+               if(CmiNumPartition()==1){
                        chkp_cb.send();
-               }else if(atsync_chkp_iter == nextChkpIter){
-                       resetForChkp();
-                       myRec->getChkpMgr()->reachChkpIter();
-               }else{
-                       CkAbort("Impossible state\n");
-               }
-       }
-       else if(myRec->getChkpMgr()->localDecided){
-               int localIter = myRec->getChkpMgr()->localMaxIter;
-               if(atsync_chkp_iter==localIter){
-                       local_chkp_pause = true;
+                       return;
                }
-               else if(atsync_chkp_iter<localIter){
-                       chkp_cb.send();
+               atsync_chkp_iter++;
+               myRec->getChkpMgr()->recvIter(atsync_chkp_iter);
+               //reduction to decide the current maximum
+               if(nextChkpDecided){
+                       if(atsync_chkp_iter<nextChkpIter){
+                               chkp_cb.send();
+                       }else if(atsync_chkp_iter == nextChkpIter){
+                               resetForChkp();
+                               myRec->getChkpMgr()->reachChkpIter();
+                       }else{
+                               CkAbort("Impossible state\n");
+                       }
                }
+               else if(myRec->getChkpMgr()->localDecided){
+                       int localIter = myRec->getChkpMgr()->localMaxIter;
+                       if(atsync_chkp_iter==localIter){
+                               local_chkp_pause = true;
+                       }
+                       else if(atsync_chkp_iter<localIter){
+                               chkp_cb.send();
+                       }
+                       else{
+                               CkAbort("local Impossible state\n");
+                       }
+               }       
                else{
-                       CkAbort("local Impossible state\n");
+                       chkp_cb.send();
                }
-       }       
-       else{
-               chkp_cb.send();
        }
 }
 
@@ -1279,23 +1284,31 @@ void CkMigratable::resetForChkp(){
 }
 
 void CkMigratable::ResumeFromChkp(){
-       chkp_cb.send();
+//     if(!chkp_cb.isInvalid())
+       if(usesChkpAtSync){     
+               chkp_cb.send();
+       }
 }
 
 void CkMigratable::recvChkpIter(void * _iter){
-       int iter = *(int *)_iter;
-       nextChkpIter = iter;
-       nextChkpDecided = true;
-       if(atsync_chkp_iter>nextChkpIter){
-               CkAbort("impossible state in notify\n");
-       }
-       else if(atsync_chkp_iter==nextChkpIter){
-               resetForChkp();
-               myRec->getChkpMgr()->reachChkpIter();
-       }
-       else{
-               if(local_chkp_pause)
-                       chkp_cb.send(); 
+       if(usesChkpAtSync){
+               CkPrintf("[%d] recv chkp iter\n",CkMyPe());
+               int iter = *(int *)_iter;
+               nextChkpIter = iter;
+               nextChkpDecided = true;
+               if(atsync_chkp_iter>nextChkpIter){
+                       CkAbort("impossible state in notify\n");
+               }
+               else if(atsync_chkp_iter==nextChkpIter){
+                       resetForChkp();
+                       myRec->getChkpMgr()->reachChkpIter();
+               }
+               else{
+                       if(local_chkp_pause){
+                               if(!chkp_cb.isInvalid())
+                                       chkp_cb.send(); 
+                       }
+               }
        }
 }
 
@@ -2833,6 +2846,29 @@ void CkLocMgr::iterate(CkLocIterator &dest) {
 }
 
 
+void CkLocMgr::iterateChkpSync(CkLocIterator &dest) {
+  //Poke through the hash table for local ArrayRecs.
+  void *objp;
+  CkHashtableIterator *it=hash.iterator();
+  CmiImmediateLock(hashImmLock);
+
+  while (NULL!=(objp=it->next())) {
+    CkLocRec *rec=*(CkLocRec **)objp;
+    if (rec->type()==CkLocRec::local) {
+      CkLocation loc(this,(CkLocRec_local *)rec);
+         int localIdx=((CkLocRec_local *)rec)->getLocalIndex();
+         for (ManagerRec *m=firstManager;m!=NULL;m=m->next) {
+               CkMigratable *el=m->element(localIdx);
+       if(el->getChkpSync()==CmiTrue){
+                       dest.addLocation(loc);
+                       break;
+               }
+         }
+    }
+  }
+  CmiImmediateUnlock(hashImmLock);
+  delete it;
+}
 
 
 /************************** LocMgr: MIGRATION *************************/
index 936ef59f6b651e11d665ed9240edd9b18ef36c09..2e8d90c7c2028746dddc493ad687504258204b9a 100644 (file)
@@ -356,13 +356,13 @@ protected:
   CmiBool usesAtSync;//You must set this in the constructor to use AtSync().
   CmiBool usesAutoMeasure; //You must set this to use auto lb instrumentation.
   CmiBool barrierRegistered;//True iff barrier handle below is set
-
+  CmiBool usesChkpAtSync;
 public:
   virtual void ResumeFromSync(void);
   virtual void UserSetLBLoad(void);  /// user define this when setLBLoad is true
   void setObjTime(double cputime);
   double getObjTime();
-
+  CmiBool getChkpSync(){return usesChkpAtSync;}
 #if CMK_LBDB_ON  //For load balancing:
   
   void AtSync(int waitForMigration=1);
@@ -662,6 +662,8 @@ public:
 
        /// Pass each of our locations (each separate array index) to this destination.
        void iterate(CkLocIterator &dest);
+       
+       void iterateChkpSync(CkLocIterator &dest);
 
        /// Insert and unpack this array element from this checkpoint (e.g., from CkLocation::pup), skip listeners
        void restore(const CkArrayIndex &idx, PUP::er &p);
index 5b4a1b24c1f6886b9e48d298710e4de105eb8b2f..642a110a93ad102f413aee8a2fedc0cc6a491531 100644 (file)
@@ -419,6 +419,10 @@ void CkMemCheckPT::getIter(){
        localDecided = true;
        localMaxIter = maxIter+1;
        contribute(sizeof(int),&localMaxIter,CkReduction::max_int,CkCallback(CkReductionTarget(CkMemCheckPT,recvMaxIter),thisProxy));
+       int elemCount = CkCountChkpSyncElements();
+       if(elemCount == 0){
+               contribute(CkCallback(CkReductionTarget(CkMemCheckPT,startChkp),thisProxy[0]));
+       }
 }
 
 //when one replica failes, decide the next chkp iter
@@ -436,7 +440,7 @@ void CkMemCheckPT::recvMaxIter(int iter){
 
 void CkMemCheckPT::reachChkpIter(){
        recvIterCount++;
-       elemCount = CkCountArrayElements();
+       elemCount = CkCountChkpSyncElements();
        if(recvIterCount == elemCount){
                recvIterCount = 0;
                contribute(CkCallback(CkReductionTarget(CkMemCheckPT,startChkp),thisProxy[0]));
@@ -444,6 +448,7 @@ void CkMemCheckPT::reachChkpIter(){
 }
 
 void CkMemCheckPT::startChkp(){
+       CkPrintf("start checkpoint\n");
        CkStartMemCheckpoint(cpCallback);
 }
 
@@ -661,8 +666,7 @@ void CkMemCheckPT::startCheckpoint(){
     _handleProcData(p,CmiFalse);
     size = p.size();
   }
-       int packSize = size/sizeof(double)+1;
-       CkCheckPTMessage * procMsg = new (packSize,0) CkCheckPTMessage;
+       CkCheckPTMessage * procMsg = new (size,0) CkCheckPTMessage;
        procMsg->len = size;
        procMsg->cp_flag = 1;
        {
@@ -678,8 +682,7 @@ void CkMemCheckPT::startCheckpoint(){
                pupAllElements(psizer);
                size = psizer.size();
        }
-       packSize = size/sizeof(double)+1;
-       CkCheckPTMessage * msg = new (packSize,0) CkCheckPTMessage;
+       CkCheckPTMessage * msg = new (size,0) CkCheckPTMessage;
        msg->len = size;
        msg->cp_flag = 1;
        {
@@ -739,7 +742,7 @@ void CkMemCheckPT::startCheckpoint(){
 }
 
 void CkMemCheckPT::doneComparison(bool ret){
-       int _ret;
+       int _ret = 1;
        if(!ret){
        CkPrintf("[%d][%d] fail in doneComparison \n", CmiMyPartition(),CkMyPe());
                _ret = 0;
@@ -747,15 +750,16 @@ void CkMemCheckPT::doneComparison(bool ret){
                _ret = 1;
        }
        inCheckpointing = 0;
+     CkPrintf("[%d][%d] contribute %d \n", CmiMyPartition(),CkMyPe(),_ret);
        CkCallback cb(CkReductionTarget(CkMemCheckPT,doneRComparison),thisProxy);
-       contribute(sizeof(int),&_ret,CkReduction::logical_and,cb);
+       contribute(sizeof(int),&_ret,CkReduction::sum_int,cb);
 }
 
 void CkMemCheckPT::doneRComparison(int ret){
        CpvAccess(recvdRemote) = 0;
        CpvAccess(recvdLocal) = 0;
 //     if(CpvAccess(curPointer) == 0){
-       if(ret==1){
+       if(ret==CkNumPes()){
        CpvAccess(curPointer)^=1;
                if(CkMyPe() == 0){
                CkPrintf("[%d][%d] Checkpoint finished in %f seconds, sending callback ... \n", CmiMyPartition(),CkMyPe(), CmiWallTimer()-startTime);
@@ -763,7 +767,7 @@ void CkMemCheckPT::doneRComparison(int ret){
                CKLOCMGR_LOOP(mgr->resumeFromChkp(););
        }
        else{
-       CkPrintf("[%d][%d] going to RollBack \n", CmiMyPartition(),CkMyPe());
+       CkPrintf("[%d][%d] going to RollBack %d \n", CmiMyPartition(),CkMyPe(),ret);
                RollBack();
        }
 }
@@ -1280,10 +1284,10 @@ void CkMemCheckPT::recoverArrayElements()
     count ++;
   }
 #else
-       double * packData;
+       char * packData;
        if(CmiNumPartition()==1){
                CkArrayCheckPTMessage * msg = (CkArrayCheckPTMessage *)CkCopyMsg((void **)&chkpTable[0]);
-               packData = msg->packData;
+               packData = (char *)msg->packData;
        }
        else{
                int pointer = CpvAccess(curPointer);
@@ -1332,7 +1336,7 @@ void CkMemCheckPT::gotReply(){
     contribute(CkCallback(CkReductionTarget(CkMemCheckPT, finishUp), thisProxy));
 }
 
-void CkMemCheckPT::recoverAll(double * packData,CkVec<CkGroupID> * gmap, CkVec<CkArrayIndex> * imap){
+void CkMemCheckPT::recoverAll(char * packData,CkVec<CkGroupID> * gmap, CkVec<CkArrayIndex> * imap){
 #if CMK_CHKP_ALL
        PUP::fromMem p(packData);
        int numElements;
@@ -1611,8 +1615,7 @@ static void replicaRecoverHandler(char *msg){
        CpvAccess(_remoteCrashedNode) = -1;
        CkMemCheckPT::replicaAlive = 1;
     bool ret = true;
-       CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
-       checkptMgr[CkMyPe()].doneComparison(ret);
+    CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->doneComparison(ret);
 }
 
 static void replicaDieHandler(char * msg){
@@ -1631,8 +1634,6 @@ static void replicaDieHandler(char * msg){
                CProxy_CkMemCheckPT checkptMgr(ckCheckPTGroupID);
                checkptMgr.getIter();
        }
-    //CmiSetHandler(msg, replicaDieBcastHandlerIdx);
-    //CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes+sizeof(int), (char *)msg);
 }
 
 
@@ -2005,7 +2006,7 @@ void pingCheckHandler()
 {
 #if CMK_MEM_CHECKPOINT
   double now = CmiWallTimer();
-  if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
+  if (lastPingTime > 0 && now - lastPingTime > 4 && !CkInLdb() && !CkInRestarting() && !CkInCheckpointing()) {
   //if (lastPingTime > 0 && now - lastPingTime > 2 && !CkInLdb()) {
     int i, pe, buddy;
     // tell everyone the buddy dies
index 2458de71275d281bc36729814295bcae937e0086..2628656e37c2d7f3c7a8e02d0680fd0470e20cb5 100644 (file)
@@ -8,7 +8,7 @@ module CkMemCheckpoint {
         double packData[];
   };    
   message CkCheckPTMessage {
-        double packData[];
+        char packData[];
   };    
   message CkProcCheckPTMessage {
         char packData[];
index fd5b73f0c390d058d41a819f9da5cc7d0f284697..1ab72a517a0a1fb6f76c2c4e9d663a8fdd3cd57e 100644 (file)
@@ -23,7 +23,7 @@ public:
 
 class CkCheckPTMessage: public CMessage_CkCheckPTMessage {
 public:
-       double *packData;
+       char *packData;
        int bud1, bud2;
        int len;
        int pointer;
@@ -94,7 +94,7 @@ public:
   //void pupAllElements(PUP::er &p);
   void startArrayCheckpoint();
   void recvArrayCheckpoint(CkArrayCheckPTMessage *m);
-  void recoverAll(double * msg, CkVec<CkGroupID> * gmap=NULL, CkVec<CkArrayIndex> * imap=NULL);
+  void recoverAll(char * msg, CkVec<CkGroupID> * gmap=NULL, CkVec<CkArrayIndex> * imap=NULL);
   void startCheckpoint();
   void doneComparison(bool);
   void doneRComparison(int);
index 5dc6182cdf9f37c64cde94586381760c07cff79c..a1713dbe089c765d50e381b2c74947c5ae4090fc 100644 (file)
@@ -81,12 +81,12 @@ void _libExitHandler(envelope *env)
 extern "C"
 void CharmLibInit(MPI_Comm userComm, int argc, char **argv){
        //note CmiNumNodes and CmiMyNode should just be macros
-  charmComm = userComm;
+/*  charmComm = userComm;
   MPI_Comm_size(charmComm, &_Cmi_numnodes);
   MPI_Comm_rank(charmComm, &_Cmi_mynode);
 
        CharmLibInterOperate = 1;
-       ConverseInit(argc, argv, (CmiStartFn)_initCharm, 1, 0);
+       ConverseInit(argc, argv, (CmiStartFn)_initCharm, 1, 0);*/
 }
 #else
 extern "C"
index 375c500db3ffcd5c9ff036ef8f85db5bd230d013..2bb988bcf2af766212bb09dad6f385f6f746e935 100644 (file)
@@ -2031,10 +2031,10 @@ static void init_ranges(char **argv)
     pagesize = CMK_MEMORY_PAGESIZE;
   slotsize=(slotsize+pagesize-1) & ~(pagesize-1);
 
-#if ISOMALLOC_DEBUG
+//#if ISOMALLOC_DEBUG
   if (CmiMyPe() == 0)
     CmiPrintf("[%d] Using slotsize of %d\n", CmiMyPe(), slotsize);
-#endif
+//#endif
   freeRegion.len=0u;
 
   if (CmiMyRank()==0 && numslots==0)
@@ -2064,16 +2064,20 @@ static void init_ranges(char **argv)
     }
     else /* freeRegion.len>0, so can isomalloc */
     {
-#if ISOMALLOC_DEBUG
+//#if ISOMALLOC_DEBUG
       CmiPrintf("[%d] Isomalloc memory region: %p - %p (%d megs)\n",CmiMyPe(),
           freeRegion.start,freeRegion.start+freeRegion.len,
           freeRegion.len/meg);
-#endif
+//#endif
     }
   }             /* end if myrank == 0 */
-
+CmiPrintf("before barrier\n");
+#ifdef __FAULT__
   CmiNodeAllBarrier();
-
+#else 
+  CmiNodeAllBarrier();
+#endif
+CmiPrintf("after barrier\n");
   /*
      on some machines, isomalloc memory regions on different nodes 
      can be different. use +isomalloc_sync to calculate the
@@ -2109,10 +2113,13 @@ static void init_ranges(char **argv)
             goto AFTER_SYNC;
         }
 #endif
-    if (CmiMyRank() == 0 && freeRegion.len > 0u) {
+    CmiPrintf("before cmibarrier\n");
+       if (CmiMyRank() == 0 && freeRegion.len > 0u) {
+       CmiPrintf("before cmibarrier test\n");
       if (CmiBarrier() == -1 && CmiMyPe()==0) 
         CmiAbort("Charm++ Error> +isomalloc_sync requires CmiBarrier() implemented.\n");
-      else {
+      else
+         {
         CmiUInt8 s = (CmiUInt8)freeRegion.start;
         CmiUInt8 e = (CmiUInt8)(freeRegion.start+freeRegion.len);
         int fd, i;
@@ -2166,10 +2173,10 @@ static void init_ranges(char **argv)
           }
           read(fd, &ss, sizeof(CmiUInt8));
           read(fd, &ee, sizeof(CmiUInt8));
-#if ISOMALLOC_DEBUG
+//#if ISOMALLOC_DEBUG
           if (CmiMyPe() == 0) CmiPrintf("[%d] load node %d isomalloc region: %lx %lx. \n",
               CmiMyPe(), i, ss, ee);
-#endif
+//#endif
           close(fd);
           if (ss>s) s = ss;
           if (ee<e) e = ee;
@@ -2208,8 +2215,10 @@ static void init_ranges(char **argv)
                 }
 #endif
       }   /* end of barrier test */
+         CmiPrintf("end of rank0\n");
     } /* end of rank 0 */
     else {
+         CmiPrintf("another branch\n");
       CmiBarrier();
       CmiBarrier();
       CmiBarrier();
@@ -2228,10 +2237,10 @@ static void init_ranges(char **argv)
     isomallocEnd=freeRegion.start+freeRegion.len;
     numslots=(freeRegion.len/slotsize)/CmiNumPes();
 
-#if ISOMALLOC_DEBUG
+//#if ISOMALLOC_DEBUG
     CmiPrintf("[%d] Can isomalloc up to %lu megs per pe\n",CmiMyPe(),
         ((memRange_t)numslots)*slotsize/meg);
-#endif
+//#endif
   }
 
   /*SMP Mode: wait here for rank 0 to initialize numslots before calculating myss*/
index 2e8f78353fd41b7608e2ee7c6f7f678d440c0968..551bc95c0313f83b700c59b47d7b502baaa44065 100644 (file)
@@ -1126,6 +1126,12 @@ void ampiParent::startCheckpoint(const char* dname){
 #endif
 }
 
+void ampiParent::ChkpSync(){
+       AtChkpSync();
+       CkPrintf("[%d] ampi parent go to sync\n",CkMyPe());
+       thread->stop();
+}
+
 void ampiParent::Checkpoint(int len, const char* dname){
   if (len == 0) {
     // memory checkpoint
@@ -1141,7 +1147,9 @@ void ampiParent::Checkpoint(int len, const char* dname){
   }
 }
 void ampiParent::ResumeThread(void){
-  thread->resume();
+  //thread->resume();
+  thread->start();
+  CkPrintf("[%d] ampi parent resume\n",CkMyPe());
 }
 
 int ampiParent::createKeyval(MPI_Copy_function *copy_fn, MPI_Delete_function *delete_fn,
@@ -5659,6 +5667,24 @@ void AMPI_MemCheckpoint()
 #endif
 }
 
+  CDECL
+void AMPI_ChkpSync()
+{
+  //AMPI_Barrier(MPI_COMM_WORLD);
+  //getAmpiParent()->ChkpSync();
+  TCHARM_ChkpSync();
+}
+
+  CDECL
+void AMPI_Die()
+{
+       int partition = rand()%2;
+  if(CmiMyPartition()==partition){
+       printf("partition %d to die %d\n",CmiMyPartition(),partition);
+       CkDieNow();
+  }
+}      
+
   CDECL
 void AMPI_Print(char *str)
 {
index f7fd52c74db156244880cd2697c18b2ed928b5f0..be26c398dfa1d5c12a272030697976974c24d32e 100644 (file)
@@ -632,6 +632,13 @@ void AMPI_Setmigratable(int comm, int mig);
 void AMPI_Checkpoint(char *dname);
 #define MPI_MemCheckpoint AMPI_MemCheckpoint
 void AMPI_MemCheckpoint();
+
+#define MPI_ChkpSync AMPI_ChkpSync
+void AMPI_ChkpSync();
+
+#define MPI_Die AMPI_Die
+void AMPI_Die();
+
 #define MPI_Get_userdata AMPI_Get_userdata
 void *AMPI_Get_userdata(int);
 #define MPI_Datatype_iscontig AMPI_Datatype_iscontig
index 05282f48474d2ba739506f8d674de5b7aec683b6..796ed00c5dd21f9191e328032aa6bfc3fc59f36e 100644 (file)
@@ -147,6 +147,8 @@ FDECL {
 #define mpi_setmigratable FTN_NAME (MPI_SETMIGRATABLE , mpi_setmigratable )
 #define mpi_checkpoint FTN_NAME( MPI_CHECKPOINT , mpi_checkpoint )
 #define mpi_memcheckpoint FTN_NAME( MPI_MEMCHECKPOINT , mpi_memcheckpoint )
+#define mpi_die FTN_NAME( MPI_DIE,mpi_die)
+#define mpi_chkpsync FTN_NAME( MPI_CHKPSYNC , mpi_chkpsync )   
 
 #define mpi_get_argc FTN_NAME( MPI_GET_ARGC , mpi_get_argc )
 #define mpi_get_argv FTN_NAME( MPI_GET_ARGV , mpi_get_argv )
@@ -902,6 +904,18 @@ void mpi_memcheckpoint(){
   AMPI_MemCheckpoint();
 }
 
+void mpi_die(){
+       int partition = rand()%2;
+  if(CmiMyPartition()==partition){
+       printf("partition %d to die %d\n",CmiMyPartition(),partition);
+       CkDieNow();
+  }
+}
+
+void mpi_chkpsync(){
+       AMPI_ChkpSync();
+}
+
 void mpi_get_argc(int *c, int *ierr)
 {
   *c = CkGetArgc();
index 60cc61f3116f997d3e0e4eea7672ad79c130637c..82ab6ba186caa60f87991efdfbaa8e5eec69bcca 100644 (file)
@@ -1134,6 +1134,7 @@ public:
     }
 
     void startCheckpoint(const char* dname);
+       void ChkpSync();
     void Checkpoint(int len, const char* dname);
     void ResumeThread(void);
     TCharm* getTCharmThread() {return thread;}
index 05068c36c58a644a0d5d0fa6998087ebdaee8132..46bdb1fb2be9a490eaa41f38b2bfd59739545a26 100644 (file)
@@ -196,6 +196,8 @@ TCharm::TCharm(TCharmInitMsg *initMsg_)
   nUd=0;
   usesAtSync=CmiTrue;
   run();
+  CkCallback cb(CkIndex_TCharm::ResumeFromChkpSync(),thisProxy(thisIndex));
+  setChkpResumeClient(cb);
 }
 
 TCharm::TCharm(CkMigrateMessage *msg)
@@ -498,6 +500,7 @@ void TCharm::start(void)
 #else
   CthAwaken(tid);
 #endif
+  DBG("thread resuming soon");
 }
 
 //Block our thread, schedule, and come back:
@@ -520,6 +523,16 @@ void TCharm::migrate(void)
 }
 
 
+void TCharm::chkpsync(void)
+{
+#if CMK_LBDB_ON
+  DBG("going to sync");
+  AtChkpSync();
+  stop();
+#else
+  DBG("skipping sync, because there is no load balancer");
+#endif
+}
 
 void TCharm::evacuate(){
        /*
@@ -574,8 +587,13 @@ void TCharm::ResumeFromSync(void)
   //if(isSelfDone) return;
   //if (exitWhenDone) return; //for bigsim ooc execution
   if (!skipResume) start();
+  CkPrintf("thread ResumeFromSync\n");
 }
 
+void TCharm::ResumeFromChkpSync(void)
+{
+  start();
+}
 
 /****** TcharmClient ******/
 void TCharmClient1D::ckJustMigrated(void) {
@@ -887,7 +905,20 @@ CDECL void TCHARM_Migrate(void)
         }
        TCharm::get()->migrate();
 }
+
+CDECL void TCHARM_ChkpSync(void)
+{
+       TCHARMAPI("TCHARM_ChkpSync");
+       if (CthMigratable() == 0) {
+         if(CkMyPe() == 0)
+           CkPrintf("Warning> thread migration is not supported!\n");
+          return;
+        }
+       TCharm::get()->chkpsync();
+}
+
 FORTRAN_AS_C(TCHARM_MIGRATE,TCHARM_Migrate,tcharm_migrate,(void),())
+FORTRAN_AS_C(TCHARM_CHKPSYNC,TCHARM_ChkpSync,tcharm_chkpsync,(void),())
 
 CDECL void TCHARM_Async_Migrate(void)
 {
index e168af46fb35877c4e0c7b04c951dd1bf81234c3..538bd5104cbba4b9bd07b41c3c61cd0e11538bbb 100644 (file)
@@ -9,7 +9,8 @@ module tcharm {
     entry void migrateDelayed(int destPE);
     entry void start(void);
     entry void callSystem(callSystemStruct s);
-    initproc void procInit(void);
+    entry void ResumeFromChkpSync();
+       initproc void procInit(void);
     initnode void nodeInit(void);
   };
 };
index b3fef0788d155293470b33089446a037ddc5cadc..e8a1f35816229a6698f5d701bff9ba7ed36fca5e 100644 (file)
@@ -126,6 +126,7 @@ class TCharm: public CBase_TCharm
 //One-time initialization
        static void nodeInit(void);
        static void procInit(void);
+       void ResumeFromChkpSync(void);
  private:
        //Informational data about the current thread:
        class ThreadInfo {
@@ -234,6 +235,7 @@ class TCharm: public CBase_TCharm
        void async_migrate(void);
        void allow_migrate(void);
 
+       void chkpsync();
        //Entering thread context: turn stuff on
        static void activateThread(void) {
                TCharm *tc=CtvAccess(_curTCharm);
index 076f9a3b83faa50944406d87dd09c9a0ff255af0..f0a8c98ea6cc7be2f60716c5f2c9a3b162ba029d 100644 (file)
@@ -45,6 +45,7 @@ int TCHARM_Element(void);
 int TCHARM_Num_elements(void);
 void TCHARM_Barrier(void);
 void TCHARM_Migrate(void);
+void TCHARM_ChkpSync(void);
 void TCHARM_Async_Migrate(void);
 void TCHARM_Allow_Migrate(void);
 void TCHARM_Migrate_to(int destPE);
index 2d47c5096a063d84fe2bd252f1e5add9e8366f8e..459592b25e3af67a07e58e12e81c2d79ac08aaff 100644 (file)
@@ -10,6 +10,7 @@
 
        external TCHARM_Register
        external TCHARM_Migrate
+       external TCHARM_ChkpSync
        external TCHARM_Migrate_to
        external TCHARM_Yield
        external TCHARM_Done
index d0fc07e7a3147d0666a84226aff3a294fead4ddf..22ea0ac14200a6eac2a07fe226a6f87d8cd93f38 100644 (file)
@@ -147,7 +147,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                switch(t){
                        case Tdouble:
                                {
-                                       double * p1 = new double[n/itemSize];
+                                       double * p1;
                                        double * p2 = new double[n/itemSize];
                                        p1 = (double*)p;
                                        memcpy((char *)p2,(const void *)buf,n); 
@@ -162,7 +162,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                                break;  
                        case Tint:
                                {
-                                       int * p1 = new int[n/itemSize];
+                                       int * p1;
                                        int * p2 = new int[n/itemSize];
                                        p1 = (int *)p;
                                        memcpy((char *)p2,(const void *)buf,n); 
@@ -180,7 +180,7 @@ void PUP::checker::bytes(void * p,int n,size_t itemSize,dataType t)
                                break;
                        case Tchar:
                                {
-                                       char * p1 = new char[n/itemSize];
+                                       char * p1;
                                        char * p2 = new char[n/itemSize];
                                        p1 = (char *)p;
                                        memcpy((char *)p2,(const void *)buf,n);