FT: changes to make smp restart work
author Xiang Ni <xiangni2@illinois.edu>
Tue, 1 Feb 2011 05:48:44 +0000 (23:48 -0600)
committer Phil Miller <mille121@illinois.edu>
Tue, 1 Feb 2011 07:00:55 +0000 (01:00 -0600)
src/arch/net/charmrun/charmrun.c
src/ck-core/ckmemcheckpoint.C
src/ck-core/init.C
src/conv-core/convcore.c
src/conv-core/converse.h

index ab3831fd21287b380dcf1c3bc176f9a5a20db1ca..6ba330dea4ae2b2c575bccb80320cb168569843f 100644 (file)
@@ -2345,7 +2345,7 @@ void req_forward_client()
 #ifdef __FAULT__
        if(strcmp(cmd, "initnodetab") ==0){
                if(_last_crash ==0 ) 
-                       cur_restart_phase++;
+                       current_restart_phase++;
                int i;
                for (i=0;i<req_nClients;i++)
                        if(_last_crash==0 || i !=_crash_socket_index)
@@ -4630,7 +4630,7 @@ void start_nodes_local(char ** env)
 
 #ifdef __FAULT__
 
-int cur_restart_phase = 1;
+int current_restart_phase = 1;
 
 void refill_nodetab_entry(int crashed_node);
 nodetab_host *replacement_host(int pe);
@@ -4665,7 +4665,7 @@ void restart_node(int crashed_node){
                i++;
        }
        restart_argv[i] = "+restartaftercrash";
-        sprintf(phase_str,"%d", ++cur_restart_phase);
+        sprintf(phase_str,"%d", ++current_restart_phase);
        restart_argv[i+1]=phase_str;
        restart_argv[i+2]=NULL;
 
index 7c05c73c190d9c7cf427e674f145fa99f0e65653..8dfdf1e14f40aefdfadb1d461ce909b9533cc072 100644 (file)
@@ -1015,7 +1015,7 @@ static void restartBcastHandler(char *msg)
   //if (CkMyPe() != _diePE) cur_restart_phase ++;
 
   if (CkMyPe()==_diePE)
-    CkPrintf("[%d] restartBcastHandler cur_restart_phase=%d _diePE:%d at %f.\n", CkMyPe(), cur_restart_phase, _diePE, CkWallTimer());
+    CkPrintf("[%d] restartBcastHandler cur_restart_phase=%d _diePE:%d at %f.\n", CkMyPe(), CpvAccess(_curRestartPhase), _diePE, CkWallTimer());
 
   // reset QD counters
 /*  gzheng
@@ -1047,8 +1047,8 @@ static void recoverProcDataHandler(char *msg)
    envelope *env = (envelope *)msg;
    CkUnpackMessage(&env);
    CkProcCheckPTMessage* procMsg = (CkProcCheckPTMessage *)(EnvToUsr(env));
-   cur_restart_phase = procMsg->cur_restart_phase;
-   CmiPrintf("[%d] ----- recoverProcDataHandler  cur_restart_phase:%d at time: %f\n", CkMyPe(), cur_restart_phase, CkWallTimer());
+   CpvAccess(_curRestartPhase) = procMsg->cur_restart_phase;
+   CmiPrintf("[%d] ----- recoverProcDataHandler  cur_restart_phase:%d at time: %f\n", CkMyPe(), CpvAccess(_curRestartPhase), CkWallTimer());
    //cur_restart_phase ++;
      // gzheng ?
    //CpvAccess(_qd)->flushStates();
@@ -1080,14 +1080,14 @@ static void askProcDataHandler(char *msg)
 {
 #if CMK_MEM_CHECKPOINT
     int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
-    CkPrintf("[%d] restartBcastHandler called with '%d' cur_restart_phase:%d at time %f.\n",CmiMyPe(),diePe, cur_restart_phase, CkWallTimer());
+    CkPrintf("[%d] restartBcastHandler called with '%d' cur_restart_phase:%d at time %f.\n",CmiMyPe(),diePe, CpvAccess(_curRestartPhase), CkWallTimer());
     if (CpvAccess(procChkptBuf) == NULL) 
       CkPrintf("[%d] no checkpoint found for processor %d. This could be due to a crash before the first checkpointing.\n", CkMyPe(), diePe);
     CmiAssert(CpvAccess(procChkptBuf)!=NULL);
     envelope *env = (envelope *)(UsrToEnv(CpvAccess(procChkptBuf)));
     CmiAssert(CpvAccess(procChkptBuf)->pe == diePe);
 
-    CpvAccess(procChkptBuf)->cur_restart_phase = cur_restart_phase;
+    CpvAccess(procChkptBuf)->cur_restart_phase = CpvAccess(_curRestartPhase);
 
     CkPackMessage(&env);
     CmiSetHandler(env, recoverProcDataHandlerIdx);
@@ -1126,16 +1126,17 @@ void CkMemRestart(const char *dummy, CkArgMsg *args)
 #if CMK_MEM_CHECKPOINT
    _diePE = CmiMyNode();
    CkMemCheckPT::startTime = restartT = CmiWallTimer();
-   CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), cur_restart_phase, CkMemCheckPT::startTime);
+   CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), CkMemCheckPT::startTime);
    CkMemCheckPT::inRestarting = 1;
 
   CpvAccess( _crashedNode )= CmiMyNode();
        
   _discard_charm_message();
- if(CmiMyRank()==0){
-   CkPrintf("crash_node:%d\n",CpvAccess( _crashedNode));
+  
+if(CmiMyRank()==0){
    CkCallback cb(qd_callback);
    CkStartQD(cb);
+   CkPrintf("crash_node:%d\n",CpvAccess( _crashedNode));
  }
 #else
    CmiAbort("Fault tolerance is not support, rebuild charm++ with 'syncft' option");
@@ -1196,7 +1197,7 @@ static void notifyHandler(char *msg)
 #if CMK_MEM_CHECKPOINT
   CmiFree(msg);
       /* immediately increase restart phase to filter old messages */
-  cur_restart_phase ++;
+  CpvAccess(_curRestartPhase) ++;
   CpvAccess(_qd)->flushStates();
   _discard_charm_message();
 #endif
@@ -1207,11 +1208,15 @@ void notify_crash(int node)
 {
 #ifdef CMK_MEM_CHECKPOINT
   CpvAccess( _crashedNode) = node;
+#ifdef CMK_SMP
+  for(int i=0;i<CkMyNodeSize();i++){
+       CpvAccessOther(_crashedNode,i)=node;
+  }
+#endif
   CmiAssert(CmiMyNode() !=CpvAccess( _crashedNode));
   CkMemCheckPT::inRestarting = 1;
 
 #ifdef CMK_SMP
-//     CkPrintf("%d %d notify crash\n",CkMyPe(), CmiMyNode()); 
   for(int i=0;i<CkMyNodeSize();i++){
        char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
        CmiSetHandler(msg, notifyHandlerIdx);
index ba3d4bffd5e91a1314c386e3081cc0da794b8952..2f52d4924597073c1533faa66698a62529dc0477 100644 (file)
@@ -248,7 +248,7 @@ static inline void _parseCommandLineOpts(char **argv)
   if(CmiGetArgString(argv,"+restart",&_restartDir))
       faultFunc = CkRestartMain;
 #if __FAULT__
-  if (CmiGetArgIntDesc(argv,"+restartaftercrash",&cur_restart_phase,"restarting this processor after a crash")){       
+  if (CmiGetArgIntDesc(argv,"+restartaftercrash",&CpvAccess(_curRestartPhase),"restarting this processor after a crash")){     
 # if CMK_MEM_CHECKPOINT
       faultFunc = CkMemRestart;
 # endif
index e2e2d4b7f8a2ef316c295c03f4c3f3a058d47e8c..19eb2a9eece1dc33199964e8f68ef44cde77f924 100644 (file)
@@ -129,8 +129,8 @@ extern void CldModuleInit(char **);
 
 #include "quiescence.h"
 
-int cur_restart_phase = 1;      /* checkpointing/restarting phase counter */
-
+//int cur_restart_phase = 1;      /* checkpointing/restarting phase counter */
+CpvDeclare(int,_curRestartPhase);
 static int CsdLocalMax = CSD_LOCAL_MAX_DEFAULT;
 
 CpvStaticDeclare(int, CmiMainHandlerIDP); /* Main handler for _CmiMultipleSend that is run on every node */
@@ -3229,7 +3229,8 @@ void ConverseCommonInit(char **argv)
 #if CMK_CCS_AVAILABLE
   CpvInitialize(int, cmiArgDebugFlag);
 #endif
-
+  CpvInitialize(int,_curRestartPhase);
+  CpvAccess(_curRestartPhase)=1;
   CmiInitCPUAffinityUtil();
   CmiArgInit(argv);
   CmiMemoryInit(argv);
index 88e6c7ef2ca2570f97e90769af86d88e52f2d573..10a643fc3a6abcb2e1ab107d987e087ed1b8d9ae 100644 (file)
@@ -608,22 +608,22 @@ extern void CmiNumberHandlerEx(int n, CmiHandlerEx h,void *userPtr);
 #define CmiGetHandlerFunction(env) (CmiHandlerToFunction(CmiGetHandler(env)))
 
 #if __FAULT__
-extern int cur_restart_phase;      /* number of restarts */
+CpvExtern(int, _curRestartPhase);      /* number of restarts */
 #endif
 
 #if CMK_MEM_CHECKPOINT
 #undef CmiSetHandler
-#define CmiSetHandler(m,v)  do {(((CmiMsgHeaderExt*)m)->hdl)=(v); (((CmiMsgHeaderExt*)m)->pn)=cur_restart_phase;} while(0)
+#define CmiSetHandler(m,v)  do {(((CmiMsgHeaderExt*)m)->hdl)=(v); (((CmiMsgHeaderExt*)m)->pn)=CpvAccess(_curRestartPhase);} while(0)
 #define MESSAGE_PHASE_CHECK(msg)       \
        {       \
           int phase = CmiGetRestartPhase(msg); \
-         if (phase != 9999 && phase < cur_restart_phase) {     \
+         if (phase != 9999 && phase < CpvAccess(_curRestartPhase)) {   \
             /* CmiPrintf("[%d] discard message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */ \
             CmiFree(msg);      \
            return;     \
           }    \
           /* CmiAssert(phase == cur_restart_phase || phase == 9999); */ \
-          if (phase > cur_restart_phase && phase != 9999) {    \
+          if (phase > CpvAccess(_curRestartPhase) && phase != 9999) {    \
             /* CmiPrintf("[%d] enqueue message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */ \
             CsdEnqueueFifo(msg);    \
            return;     \