Revert "changes to make smp restart work"
author Phil Miller <mille121@illinois.edu>
Tue, 1 Feb 2011 06:55:38 +0000 (00:55 -0600)
committer Phil Miller <mille121@illinois.edu>
Tue, 1 Feb 2011 06:55:38 +0000 (00:55 -0600)
Undo conglomeration of stuff, one piece of which was the intended change.

This reverts commit 6d419d67c59ae54bc412a77a5961674862f82c5e.

Conflicts:

src/ck-core/qd.C

examples/charm++/hello/1darray/hello.C
examples/charm++/jacobi2d/jacobi2d.C
examples/charm++/load_balancing/kNeighbor/kNeighbor.C
examples/multiphaseSharedArrays/simpletest/t3.C
src/arch/net/charmrun/charmrun.c
src/ck-core/ckmemcheckpoint.C
src/ck-core/init.C
src/ck-core/qd.C
src/conv-core/convcore.c
src/conv-core/converse.h

index 3c181bbae83f7804518cb92c953168824c2bed56..a983c5f3eaaa2d82da1a044704150236074dcda4 100644 (file)
@@ -3,7 +3,7 @@
 
 /*readonly*/ CProxy_Main mainProxy;
 /*readonly*/ int nElements;
-int change=1;
+
 /*mainchare*/
 class Main : public CBase_Main
 {
@@ -21,7 +21,7 @@ public:
     mainProxy = thisProxy;
 
     CProxy_Hello arr = CProxy_Hello::ckNew(nElements);
-       change=1;
+
     arr[0].SayHi(17);
   };
 
@@ -45,8 +45,7 @@ public:
   
   void SayHi(int hiNo)
   {
-         change++;
-    CkPrintf("Hi[%d] from element %d %d\n",hiNo,thisIndex,change);
+    CkPrintf("Hi[%d] from element %d\n",hiNo,thisIndex);
     if (thisIndex < nElements-1)
       //Pass the hello on:
       thisProxy[thisIndex+1].SayHi(hiNo+1);
index 1a9e2fd809fff86bb2b540a4151f7d02dd5c000c..4d605a7780c236603ea7e2bcc1265ec89484b71d 100644 (file)
@@ -27,8 +27,7 @@ int num_rows;
 int num_cols;
 
 //allowed variation between temp and new_temp
-//float epsilon=1./1000;
-float epsilon=0;
+float epsilon=1./1000;
 
 //temperatures on the various boundries
 float left = 1;
index 4d22e595cb37794d1762a47e2140ee6cdc2a213e..2de665f9656c8f2fc721381f01c00346705e4c03 100644 (file)
@@ -86,7 +86,7 @@ class Main: public CBase_Main {
       mainProxy = thisProxy;
       CkPrintf("\nStarting kNeighbor ...\n");
 
-      if (m->argc!=4 || m->argc!=5) {
+      if (m->argc!=4 && m->argc!=5) {
         CkPrintf("Usage: %s <#elements> <#iterations> <msg size> [ldb freq]\n", m->argv[0]);
         delete m;
         CkExit();
index 4d698e4e72f231ac03f3e3b3be2fa3eda43a5283..5c4e7940c43e20a9f463cdc996e1c02611b92832 100644 (file)
@@ -33,9 +33,9 @@ public:
     t3(CkArgMsg* m)
     {
         // Usage: t3 [number_of_worker_threads [max_bytes]]
-        /*if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
-        if(m->argc >2 ) bytes=atoi(m->argv[2]);
-        delete m;*/
+        if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
+        if(m->argc >2 ) bytes=atoi(m->argv[1]);
+        delete m;
         reallyDone = 0;
 
         // Actually build the shared array.
@@ -64,8 +64,7 @@ public:
               << NUM_WORKERS << TAB
               << bytes << TAB
               << ((g_prefetch == 0) ? "N" : ((g_prefetch == 1) ? "Y" : "U")) << TAB
-              <<g_prefetch<<TAB
-                         << end_time - start_time
+              << end_time - start_time
               << endl;
         } else {
 
index 6ba330dea4ae2b2c575bccb80320cb168569843f..ab3831fd21287b380dcf1c3bc176f9a5a20db1ca 100644 (file)
@@ -2345,7 +2345,7 @@ void req_forward_client()
 #ifdef __FAULT__
        if(strcmp(cmd, "initnodetab") ==0){
                if(_last_crash ==0 ) 
-                       current_restart_phase++;
+                       cur_restart_phase++;
                int i;
                for (i=0;i<req_nClients;i++)
                        if(_last_crash==0 || i !=_crash_socket_index)
@@ -4630,7 +4630,7 @@ void start_nodes_local(char ** env)
 
 #ifdef __FAULT__
 
-int current_restart_phase = 1;
+int cur_restart_phase = 1;
 
 void refill_nodetab_entry(int crashed_node);
 nodetab_host *replacement_host(int pe);
@@ -4665,7 +4665,7 @@ void restart_node(int crashed_node){
                i++;
        }
        restart_argv[i] = "+restartaftercrash";
-        sprintf(phase_str,"%d", ++current_restart_phase);
+        sprintf(phase_str,"%d", ++cur_restart_phase);
        restart_argv[i+1]=phase_str;
        restart_argv[i+2]=NULL;
 
index 8dfdf1e14f40aefdfadb1d461ce909b9533cc072..7c05c73c190d9c7cf427e674f145fa99f0e65653 100644 (file)
@@ -1015,7 +1015,7 @@ static void restartBcastHandler(char *msg)
   //if (CkMyPe() != _diePE) cur_restart_phase ++;
 
   if (CkMyPe()==_diePE)
-    CkPrintf("[%d] restartBcastHandler cur_restart_phase=%d _diePE:%d at %f.\n", CkMyPe(), CpvAccess(_curRestartPhase), _diePE, CkWallTimer());
+    CkPrintf("[%d] restartBcastHandler cur_restart_phase=%d _diePE:%d at %f.\n", CkMyPe(), cur_restart_phase, _diePE, CkWallTimer());
 
   // reset QD counters
 /*  gzheng
@@ -1047,8 +1047,8 @@ static void recoverProcDataHandler(char *msg)
    envelope *env = (envelope *)msg;
    CkUnpackMessage(&env);
    CkProcCheckPTMessage* procMsg = (CkProcCheckPTMessage *)(EnvToUsr(env));
-   CpvAccess(_curRestartPhase) = procMsg->cur_restart_phase;
-   CmiPrintf("[%d] ----- recoverProcDataHandler  cur_restart_phase:%d at time: %f\n", CkMyPe(), CpvAccess(_curRestartPhase), CkWallTimer());
+   cur_restart_phase = procMsg->cur_restart_phase;
+   CmiPrintf("[%d] ----- recoverProcDataHandler  cur_restart_phase:%d at time: %f\n", CkMyPe(), cur_restart_phase, CkWallTimer());
    //cur_restart_phase ++;
      // gzheng ?
    //CpvAccess(_qd)->flushStates();
@@ -1080,14 +1080,14 @@ static void askProcDataHandler(char *msg)
 {
 #if CMK_MEM_CHECKPOINT
     int diePe = *(int *)(msg+CmiMsgHeaderSizeBytes);
-    CkPrintf("[%d] restartBcastHandler called with '%d' cur_restart_phase:%d at time %f.\n",CmiMyPe(),diePe, CpvAccess(_curRestartPhase), CkWallTimer());
+    CkPrintf("[%d] restartBcastHandler called with '%d' cur_restart_phase:%d at time %f.\n",CmiMyPe(),diePe, cur_restart_phase, CkWallTimer());
     if (CpvAccess(procChkptBuf) == NULL) 
       CkPrintf("[%d] no checkpoint found for processor %d. This could be due to a crash before the first checkpointing.\n", CkMyPe(), diePe);
     CmiAssert(CpvAccess(procChkptBuf)!=NULL);
     envelope *env = (envelope *)(UsrToEnv(CpvAccess(procChkptBuf)));
     CmiAssert(CpvAccess(procChkptBuf)->pe == diePe);
 
-    CpvAccess(procChkptBuf)->cur_restart_phase = CpvAccess(_curRestartPhase);
+    CpvAccess(procChkptBuf)->cur_restart_phase = cur_restart_phase;
 
     CkPackMessage(&env);
     CmiSetHandler(env, recoverProcDataHandlerIdx);
@@ -1126,17 +1126,16 @@ void CkMemRestart(const char *dummy, CkArgMsg *args)
 #if CMK_MEM_CHECKPOINT
    _diePE = CmiMyNode();
    CkMemCheckPT::startTime = restartT = CmiWallTimer();
-   CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), CpvAccess(_curRestartPhase), CkMemCheckPT::startTime);
+   CmiPrintf("[%d] I am restarting  cur_restart_phase:%d at time: %f\n",CmiMyPe(), cur_restart_phase, CkMemCheckPT::startTime);
    CkMemCheckPT::inRestarting = 1;
 
   CpvAccess( _crashedNode )= CmiMyNode();
        
   _discard_charm_message();
-  
-if(CmiMyRank()==0){
+ if(CmiMyRank()==0){
+   CkPrintf("crash_node:%d\n",CpvAccess( _crashedNode));
    CkCallback cb(qd_callback);
    CkStartQD(cb);
-   CkPrintf("crash_node:%d\n",CpvAccess( _crashedNode));
  }
 #else
    CmiAbort("Fault tolerance is not support, rebuild charm++ with 'syncft' option");
@@ -1197,7 +1196,7 @@ static void notifyHandler(char *msg)
 #if CMK_MEM_CHECKPOINT
   CmiFree(msg);
       /* immediately increase restart phase to filter old messages */
-  CpvAccess(_curRestartPhase) ++;
+  cur_restart_phase ++;
   CpvAccess(_qd)->flushStates();
   _discard_charm_message();
 #endif
@@ -1208,15 +1207,11 @@ void notify_crash(int node)
 {
 #ifdef CMK_MEM_CHECKPOINT
   CpvAccess( _crashedNode) = node;
-#ifdef CMK_SMP
-  for(int i=0;i<CkMyNodeSize();i++){
-       CpvAccessOther(_crashedNode,i)=node;
-  }
-#endif
   CmiAssert(CmiMyNode() !=CpvAccess( _crashedNode));
   CkMemCheckPT::inRestarting = 1;
 
 #ifdef CMK_SMP
+//     CkPrintf("%d %d notify crash\n",CkMyPe(), CmiMyNode()); 
   for(int i=0;i<CkMyNodeSize();i++){
        char *msg = (char*)CmiAlloc(CmiMsgHeaderSizeBytes);
        CmiSetHandler(msg, notifyHandlerIdx);
index 2f52d4924597073c1533faa66698a62529dc0477..ba3d4bffd5e91a1314c386e3081cc0da794b8952 100644 (file)
@@ -248,7 +248,7 @@ static inline void _parseCommandLineOpts(char **argv)
   if(CmiGetArgString(argv,"+restart",&_restartDir))
       faultFunc = CkRestartMain;
 #if __FAULT__
-  if (CmiGetArgIntDesc(argv,"+restartaftercrash",&CpvAccess(_curRestartPhase),"restarting this processor after a crash")){     
+  if (CmiGetArgIntDesc(argv,"+restartaftercrash",&cur_restart_phase,"restarting this processor after a crash")){       
 # if CMK_MEM_CHECKPOINT
       faultFunc = CkMemRestart;
 # endif
index 0b6143342b172bbbc7898f30a3d82fa062a294ee..a21b41cb3996b5455e5ecb1956ae68208a5d57af 100644 (file)
@@ -5,7 +5,7 @@
  * $Revision$
  *****************************************************************************/
 
-#define  DEBUGP(x)     //CmiPrintf x;
+#define  DEBUGP(x)    // CmiPrintf x;
 
 #include "ck.h"
 
index 19eb2a9eece1dc33199964e8f68ef44cde77f924..e2e2d4b7f8a2ef316c295c03f4c3f3a058d47e8c 100644 (file)
@@ -129,8 +129,8 @@ extern void CldModuleInit(char **);
 
 #include "quiescence.h"
 
-//int cur_restart_phase = 1;      /* checkpointing/restarting phase counter */
-CpvDeclare(int,_curRestartPhase);
+int cur_restart_phase = 1;      /* checkpointing/restarting phase counter */
+
 static int CsdLocalMax = CSD_LOCAL_MAX_DEFAULT;
 
 CpvStaticDeclare(int, CmiMainHandlerIDP); /* Main handler for _CmiMultipleSend that is run on every node */
@@ -3229,8 +3229,7 @@ void ConverseCommonInit(char **argv)
 #if CMK_CCS_AVAILABLE
   CpvInitialize(int, cmiArgDebugFlag);
 #endif
-  CpvInitialize(int,_curRestartPhase);
-  CpvAccess(_curRestartPhase)=1;
+
   CmiInitCPUAffinityUtil();
   CmiArgInit(argv);
   CmiMemoryInit(argv);
index 10a643fc3a6abcb2e1ab107d987e087ed1b8d9ae..88e6c7ef2ca2570f97e90769af86d88e52f2d573 100644 (file)
@@ -608,22 +608,22 @@ extern void CmiNumberHandlerEx(int n, CmiHandlerEx h,void *userPtr);
 #define CmiGetHandlerFunction(env) (CmiHandlerToFunction(CmiGetHandler(env)))
 
 #if __FAULT__
-CpvExtern(int, _curRestartPhase);      /* number of restarts */
+extern int cur_restart_phase;      /* number of restarts */
 #endif
 
 #if CMK_MEM_CHECKPOINT
 #undef CmiSetHandler
-#define CmiSetHandler(m,v)  do {(((CmiMsgHeaderExt*)m)->hdl)=(v); (((CmiMsgHeaderExt*)m)->pn)=CpvAccess(_curRestartPhase);} while(0)
+#define CmiSetHandler(m,v)  do {(((CmiMsgHeaderExt*)m)->hdl)=(v); (((CmiMsgHeaderExt*)m)->pn)=cur_restart_phase;} while(0)
 #define MESSAGE_PHASE_CHECK(msg)       \
        {       \
           int phase = CmiGetRestartPhase(msg); \
-         if (phase != 9999 && phase < CpvAccess(_curRestartPhase)) {   \
+         if (phase != 9999 && phase < cur_restart_phase) {     \
             /* CmiPrintf("[%d] discard message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */ \
             CmiFree(msg);      \
            return;     \
           }    \
           /* CmiAssert(phase == cur_restart_phase || phase == 9999); */ \
-          if (phase > CpvAccess(_curRestartPhase) && phase != 9999) {    \
+          if (phase > cur_restart_phase && phase != 9999) {    \
             /* CmiPrintf("[%d] enqueue message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */ \
             CsdEnqueueFifo(msg);    \
            return;     \