fix bug from last commit
author Xiang Ni <xiangni@hopper09.(none)>
Sun, 30 Dec 2012 06:12:18 +0000 (22:12 -0800)
committer Xiang Ni <xiangni@hopper09.(none)>
Sun, 30 Dec 2012 06:12:18 +0000 (22:12 -0800)
src/arch/mpi/machine.c
src/ck-core/ckmemcheckpoint.C
src/conv-core/threads.c
src/libs/ck-libs/ampi/ampi.C
src/libs/ck-libs/tcharm/tcharm.C

index d5bbf4f89a65b708908958af0378bbbee3bcc639..d6e3c1669cca7bd93b6063e973c4a7b4d4d5a09f 100644 (file)
@@ -1947,6 +1947,8 @@ int find_spare_mpirank(int pe,int partition)
        CpvAccess(crashedRank) = newpe;
     petorank[newpe] = nextrank;
     nextrank++;
        CpvAccess(crashedRank) = newpe;
     petorank[newpe] = nextrank;
     nextrank++;
+    //CmiPrintf("[%d][%d]spare rank %d for pe %d\n",CmiMyPartition(),CmiMyPe(),nextrank-1,newpe);
+    //fflush(stdout);
     return nextrank-1;
 }
 
     return nextrank-1;
 }
 
index eeb3148b7307a4faa7977f35740609019a5ba47f..8fd099222988f0e5bde419ddb4e41449249736bd 100644 (file)
@@ -1417,13 +1417,8 @@ void CkMemCheckPT::RollBack(){
         CmiPrintf("[%d] Restart finished in %f seconds at %f.\n", CkMyPe(), CkWallTimer()-restartT, CkWallTimer());
         fflush(stdout);
       }
         CmiPrintf("[%d] Restart finished in %f seconds at %f.\n", CkMyPe(), CkWallTimer()-restartT, CkWallTimer());
         fflush(stdout);
       }
-      fflush(stdout);
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
       inRestarting = 0;
       CKLOCMGR_LOOP(mgr->resumeFromChkp(););
       inRestarting = 0;
-      if(CmiMyPartition()==0){
-        CmiPrintf("[%d] Resume Done\n", CkMyPe());
-        fflush(stdout);
-      }
 
 #if CMK_CONVERSE_MPI   
       if(CmiNumPartition()!=1){
 
 #if CMK_CONVERSE_MPI   
       if(CmiNumPartition()!=1){
@@ -1662,11 +1657,12 @@ void CkMemCheckPT::RollBack(){
       int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
       CpvAccess(_remoteCrashedNode) = diePe;
       CkMemCheckPT::replicaAlive = 0;
       int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
       CpvAccess(_remoteCrashedNode) = diePe;
       CkMemCheckPT::replicaAlive = 0;
-      find_spare_mpirank(diePe,CmiMyPartition()^1);
       if(CkMyPe()==diePe){
       if(CkMyPe()==diePe){
-        CkPrintf("pe %d in replicad word die\n",diePe);
-        CkPrintf("replicaAlive %d\n",CkMemCheckPT::replicaAlive);
+        CmiPrintf("pe %d in replicad word die\n",diePe);
+        CmiPrintf("replicaAlive %d\n",CkMemCheckPT::replicaAlive);
+        fflush(stdout);
       }
       }
+      find_spare_mpirank(diePe,CmiMyPartition()^1);
 #endif
       //broadcast to my partition to get local max iter
       CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->getIter();
 #endif
       //broadcast to my partition to get local max iter
       CProxy_CkMemCheckPT(ckCheckPTGroupID).ckLocalBranch()->getIter();
@@ -1825,6 +1821,7 @@ void CkMemCheckPT::RollBack(){
     void qd_callback(void *m)
     {
       CmiPrintf("[%d] callback after QD for crashed node: %d. at %lf\n", CkMyPe(), CpvAccess(_crashedNode),CmiWallTimer());
     void qd_callback(void *m)
     {
       CmiPrintf("[%d] callback after QD for crashed node: %d. at %lf\n", CkMyPe(), CpvAccess(_crashedNode),CmiWallTimer());
+        fflush(stdout);
       CkFreeMsg(m);
       if(CmiNumPartition()==1){
 #ifdef CMK_SMP
       CkFreeMsg(m);
       if(CmiNumPartition()==1){
 #ifdef CMK_SMP
@@ -1870,6 +1867,7 @@ void CkMemCheckPT::RollBack(){
         CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), CkMemCheckPT::startTime);
         restartT = CmiWallTimer();
         CmiPrintf("[%d] I am restarting  cur_restart_phase:%d discard charm message at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), restartT);
         CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), CkMemCheckPT::startTime);
         restartT = CmiWallTimer();
         CmiPrintf("[%d] I am restarting  cur_restart_phase:%d discard charm message at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), restartT);
+        fflush(stdout);
         char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
         *(int *)(msg+CmiMsgHeaderSizeBytes) = CpvAccess(_crashedNode);
         // cur_restart_phase = RESTART_PHASE_MAX;             // big enough to get it processed, moved to machine.c
         char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
         *(int *)(msg+CmiMsgHeaderSizeBytes) = CpvAccess(_crashedNode);
         // cur_restart_phase = RESTART_PHASE_MAX;             // big enough to get it processed, moved to machine.c
@@ -2066,6 +2064,8 @@ void CkMemCheckPT::RollBack(){
           for(int i=0;i<CmiNumPes();i++){
             char * rMsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
             *(int *)(rMsg+CmiMsgHeaderSizeBytes) = buddy;
           for(int i=0;i<CmiNumPes();i++){
             char * rMsg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes+sizeof(int));
             *(int *)(rMsg+CmiMsgHeaderSizeBytes) = buddy;
+            //CmiPrintf("[%d][%d] send to processor %d in replica. \n",CmiMyPartition(), CmiMyPe(), i);
+            //fflush(stdout);
             CmiSetHandler(rMsg, replicaDieHandlerIdx);
             CmiRemoteSyncSendAndFree(i,CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)rMsg);
           }
             CmiSetHandler(rMsg, replicaDieHandlerIdx);
             CmiRemoteSyncSendAndFree(i,CmiMyPartition()^1,CmiMsgHeaderSizeBytes+sizeof(int),(char *)rMsg);
           }
index 639b664fa1331bd21bc442d1c268ffb8362ca749..d2bf006f61e89cbf86fc3d23041440e3d9ed136f 100644 (file)
@@ -730,10 +730,10 @@ void CthSuspend(void)
 
 void CthAwaken(CthThread th)
 {
 
 void CthAwaken(CthThread th)
 {
-  if(CmiMyPartition()==1&&CmiMyPe()==1){
+//  if(CmiMyPartition()==1&&CmiMyPe()==1){
 //    CmiPrintf("start awake thread\n");
 //    fflush(stdout);
 //    CmiPrintf("start awake thread\n");
 //    fflush(stdout);
-  }
+//  }
   if (B(th)->awakenfn == 0) CthNoStrategy();
 
   /*BIGSIM_OOC DEBUGGING
   if (B(th)->awakenfn == 0) CthNoStrategy();
 
   /*BIGSIM_OOC DEBUGGING
@@ -752,10 +752,10 @@ void CthAwaken(CthThread th)
   /*B(th)->scheduled = 1; */
   /*changed due to out-of-core emulation in BigSim */
   B(th)->scheduled++;
   /*B(th)->scheduled = 1; */
   /*changed due to out-of-core emulation in BigSim */
   B(th)->scheduled++;
-  if(CmiMyPartition()==1&&CmiMyPe()==1){
+//  if(CmiMyPartition()==1&&CmiMyPe()==1){
 //    CmiPrintf("end awake thread\n");
 //    fflush(stdout);
 //    CmiPrintf("end awake thread\n");
 //    fflush(stdout);
-  }
+//  }
 }
 
 void CthYield()
 }
 
 void CthYield()
@@ -1619,15 +1619,15 @@ void CthResume(t)
 
   if (t != tc) { /* Actually switch threads */
     CthBaseResume(t);
 
   if (t != tc) { /* Actually switch threads */
     CthBaseResume(t);
-    if(CmiMyPartition()==0&& CmiMyPe()==1){
+//    if(CmiMyPartition()==0&& CmiMyPe()==1){
 //        CmiPrintf("[%d][%d]In end CthBaseResume \n",CmiMyPartition(),CmiMyPe());
 //        fflush(stdout);
 //        CmiPrintf("[%d][%d]In end exiting %d\n",CmiMyPartition(),CmiMyPe(),tc->base.exiting);
 //        fflush(stdout);
 //        CmiPrintf("[%d][%d]In end CthBaseResume \n",CmiMyPartition(),CmiMyPe());
 //        fflush(stdout);
 //        CmiPrintf("[%d][%d]In end exiting %d\n",CmiMyPartition(),CmiMyPe(),tc->base.exiting);
 //        fflush(stdout);
-    }
+//    }
     if (!tc->base.exiting) 
     {
     if (!tc->base.exiting) 
     {
-      if(CmiMyPartition()==0&& CmiMyPe()==1)
+//      if(CmiMyPartition()==0&& CmiMyPe()==1)
 //        CmiPrintf("[%d][%d]In CthResume swap context\n",CmiMyPartition(),CmiMyPe());
 //      fflush(stdout);
       if (0 != swapJcontext(&tc->context, &t->context)) 
 //        CmiPrintf("[%d][%d]In CthResume swap context\n",CmiMyPartition(),CmiMyPe());
 //      fflush(stdout);
       if (0 != swapJcontext(&tc->context, &t->context)) 
index b5821827c864d10face6a71b2053183af8d434dd..b45ac6573d2948c834362cdb23580113b1a5e283 100644 (file)
@@ -879,8 +879,8 @@ class ampiWorlds : public CBase_ampiWorlds {
 
   thread->semaPut(AMPI_BARRIER_SEMAID,&barrier);
   AsyncEvacuate(CmiFalse);
 
   thread->semaPut(AMPI_BARRIER_SEMAID,&barrier);
   AsyncEvacuate(CmiFalse);
-  CkCallback cb(CkIndex_ampiParent::ResumeThread(),thisProxy(thisIndex));
-  setChkpResumeClient(cb);
+  //CkCallback cb(CkIndex_ampiParent::ResumeThread(),thisProxy(thisIndex));
+  //setChkpResumeClient(cb);
 }
 
 ampiParent::ampiParent(CkMigrateMessage *msg):CBase_ampiParent(msg) {
 }
 
 ampiParent::ampiParent(CkMigrateMessage *msg):CBase_ampiParent(msg) {
@@ -924,8 +924,8 @@ void ampiParent::pup(PUP::er &p) {
 
   p|ampiInitCallDone;
   if(p.isUnpacking()&&CkInRestarting()){
 
   p|ampiInitCallDone;
   if(p.isUnpacking()&&CkInRestarting()){
-    CkCallback cb(CkIndex_ampiParent::ResumeThread(),thisProxy(thisIndex));
-    setChkpResumeClient(cb);
+    //CkCallback cb(CkIndex_ampiParent::ResumeThread(),thisProxy(thisIndex));
+    //setChkpResumeClient(cb);
   }
 }
 void ampiParent::prepareCtv(void) {
   }
 }
 void ampiParent::prepareCtv(void) {
@@ -1422,12 +1422,6 @@ void ampi::pup(PUP::er &p)
 
   posted_ireqs = CmmPup((pup_er)&p, posted_ireqs, cmm_pup_posted_ireq);
 
 
   posted_ireqs = CmmPup((pup_er)&p, posted_ireqs, cmm_pup_posted_ireq);
 
-  if(p.isUnpacking()){
-    if(CmiMyPartition()==0)
-      CmiPrintf("ampi[%d]::unpacking: posted_ireqs: %p with %d\n", thisIndex, posted_ireqs, CmmEntries(posted_ireqs));
-    fflush(stdout);
-  }
-
   p|seqEntries;
   p|oorder;
 }
   p|seqEntries;
   p|oorder;
 }
index 184721eafddb1b6fb8ca9a43316a1a44babd83b7..5bf49e5ad6fce9464815fcc1f23887b89c17d07d 100644 (file)
@@ -196,8 +196,8 @@ TCharm::TCharm(TCharmInitMsg *initMsg_)
   nUd=0;
   usesAtSync=CmiTrue;
   run();
   nUd=0;
   usesAtSync=CmiTrue;
   run();
-  //CkCallback cb(CkIndex_TCharm::ResumeFromChkpSync(),thisProxy(thisIndex));
-  //setChkpResumeClient(cb);
+  CkCallback cb(CkIndex_TCharm::ResumeFromChkpSync(),thisProxy(thisIndex));
+  setChkpResumeClient(cb);
 }
 
 TCharm::TCharm(CkMigrateMessage *msg)
 }
 
 TCharm::TCharm(CkMigrateMessage *msg)
@@ -231,8 +231,8 @@ void TCharm::pup(PUP::er &p) {
   //  CthPrintThdStack(tid);
   //}
   if(p.isUnpacking()&&CkInRestarting()){
   //  CthPrintThdStack(tid);
   //}
   if(p.isUnpacking()&&CkInRestarting()){
-//    CkCallback cb(CkIndex_TCharm::ResumeFromChkpSync(),thisProxy(thisIndex));
-//    setChkpResumeClient(cb);
+    CkCallback cb(CkIndex_TCharm::ResumeFromChkpSync(),thisProxy(thisIndex));
+    setChkpResumeClient(cb);
   }
   checkPupMismatch(p,5134,"before TCHARM");
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
   }
   checkPupMismatch(p,5134,"before TCHARM");
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
@@ -595,16 +595,7 @@ void TCharm::ResumeFromSync(void)
 
 void TCharm::ResumeFromChkpSync(void)
 {
 
 void TCharm::ResumeFromChkpSync(void)
 {
-  if(!isStopped){
-    CmiPrintf("[%d][%d]haven't stopped\n",CmiMyPartition(),CkMyPe());
-    fflush(stdout);
-  }
-//TCharm::get()->stop();
   start();
   start();
-  if(CmiMyPartition()==1){
-//    CmiPrintf("[%d][%d]ResumeFromChkpSync\n",CmiMyPartition(),CkMyPe());
-//    fflush(stdout);
-  }
 }
 
 /****** TcharmClient ******/
 }
 
 /****** TcharmClient ******/