Interop bug fixes at exit 13/913/7
author: Nikhil Jain <nikhil.jain@acm.org>
Sat, 14 Nov 2015 19:14:16 +0000 (13:14 -0600)
committer: Evan Ramos <evan@hpccharm.com>
Tue, 23 Apr 2019 02:42:51 +0000 (21:42 -0500)
When exit is called, the scheduler should not be stopped immediately, since
pending messages, including the exit broadcast, need to be sent out first.
This change relies on LrtsDrainResources to send out those pending messages.

Co-authored-by: Eric Mikida <epmikida@hpccharm.com>
Change-Id: I92f81f9ab2d14b89bb05a627a6c7b9764e1e16cf

src/arch/util/machine-common-core.C
src/arch/util/machine-smp.c
src/ck-core/mpi-interoperate.C
src/conv-core/conv-interoperate.C

index 57e22cf6ddf1141f4bf8dee803b2918ba2a2260a..07e1262fbb81dc71d2bf082b612f5b0c8b18a1de 100644 (file)
@@ -199,8 +199,12 @@ void CmiSuspendedTaskEnqueue(int targetRank, void *data);
 void* CmiSuspendedTaskPop();
 #endif
 
-#if CMK_SMP
 #include <atomic>
+
+extern int CharmLibInterOperate;
+std::atomic<int> ckExitComplete {0};
+
+#if CMK_SMP
 std::atomic<int> commThdExit {0};
 
 /**
@@ -1661,6 +1665,10 @@ static void CommunicationServer(int sleepTime) {
 #endif
         CmiNodeAllBarrier();
         LrtsExit(_exitcode);
+        if(CharmLibInterOperate) {
+          ckExitComplete = 1;
+          CmiNodeAllBarrier();
+        }
     }
 #endif
 }
index b4a7f804e2d67c07c871262f736cfb7f046ffca6..37f65f5b13cd88cc7f9fbf6e32f72f8aa535eee5 100644 (file)
@@ -433,9 +433,10 @@ static void *call_startfn(void *vindex)
         CmiNodeAllBarrier();
       } else {
         if (CmiMyRank() == CmiMyNodeSize()) {
-          while (1) { CommunicationServerThread(5); }
+          while (ckExitComplete.load() == 0) { CommunicationServerThread(5); }
         } else { 
           CsdScheduler(-1);
+          CmiNodeAllBarrier();
         }
         break;
       }
index 36aa1a8d8234602979b48064a83c69deeaa7cfda..bd88ea3b67d828cee628cdd28d67609c74eff11d 100644 (file)
@@ -6,9 +6,11 @@
 #define DEBUG(a) 
 #endif
 
-static bool   _libExitStarted = false;
-int    _libExitHandlerIdx;
-extern "C" int _cleanUp;
+int _libExitHandlerIdx;
+static bool _libExitStarted = false;
+
+extern std::atomic<int> ckExitComplete;
+extern std::atomic<int> _cleanUp;
 
 #if CMK_CONVERSE_MPI
 extern MPI_Comm charmComm;
@@ -16,6 +18,12 @@ extern MPI_Comm charmComm;
 typedef int MPI_Comm;
 #endif
 
+#if CMK_USE_LRTS
+extern void LrtsDrainResources(); /* used when exit */
+#else
+void LrtsDrainResources() { }
+#endif
+
 extern bool _ringexit;             // for charm exit
 extern int _ringtoken;
 extern void _initCharm(int unused_argc, char **argv);
@@ -78,7 +86,7 @@ void _libExitHandler(envelope *env)
       }else{
         DEBUG(printf("[%d] Broadcast Exit for %d PE %d nodes\n",CmiMyPe(),CmiNumPes(),CmiNumNodes());)
         CmiSyncBroadcastAllAndFree(env->getTotalsize(), (char *)env);
-      }        
+      }
       break;
     case ReqStatMsg:
       DEBUG(printf("[%d] Receive Exit for %d PE %d nodes\n",CmiMyPe(),CmiNumPes(),CmiNumNodes());)
@@ -93,7 +101,10 @@ void _libExitHandler(envelope *env)
       else
         CmiFree(env);
       //everyone exits here - there may be issues with leftover messages in the queue
-      DEBUG(printf("[%d] Am done here\n",CmiMyPe());)
+      DEBUG(printf("[%d/%d] Am done here\n",CmiMyRank(),CmiMyPe());)
+#if !CMK_SMP
+      LrtsDrainResources();
+#endif
       _libExitStarted = false;
       StopCharmScheduler();
       break;
@@ -176,8 +187,9 @@ void CharmLibExit() {
     CkExit();
   }
   if (CmiMyRank() == CmiMyNodeSize()) {
-    while (1) { CommunicationServerThread(5); }
+    while (ckExitComplete.load() == 0) { CommunicationServerThread(5); }
   } else { 
     CsdScheduler(-1);
+    CmiNodeAllBarrier();
   }
 }
index 42426f5eaffbdae9b3ab346c4bae9c2d312731ab..cdcc0da1248b286341db8a437b77f8466e3a380f 100644 (file)
 #error "Configure should have errored on missing C++11 atomic library support"
 #endif
 
+#if CMK_USE_LRTS
+extern void LrtsDrainResources();
+#else
+void LrtsDrainResources() { }
+#endif
+
 static std::atomic<int> interopCommThdExit{0};
+std::atomic<int> _cleanUp{0};
 
 CpvCExtern(int,interopExitFlag);
 
 extern "C"
 {
-  int _cleanUp = 0;
 
 #if CMK_USE_LRTS
   extern void CommunicationServerThread(int sleepTime);
@@ -39,10 +45,12 @@ extern "C"
         CommunicationServerThread(5);
       }
       DEBUG(printf("[%d] Commthread Exit Scheduler\n",CmiMyPe()););
+      LrtsDrainResources();
       interopCommThdExit = 0;
     } else {
       CsdScheduler(-1);
     }
+    CmiNodeAllBarrier();
   }
 
   void StopInteropScheduler() {