Bug #1947: Fix mpi-win-x86_64-smp hang for nodegroup zerocopy API in pingpong 50/4450/4
author     Nitin Bhat <nbhat4@illinois.edu>
           Mon, 6 Aug 2018 16:34:32 +0000 (11:34 -0500)
committer  Nitin Bhat <nbhat4@illinois.edu>
           Tue, 7 Aug 2018 14:42:49 +0000 (09:42 -0500)
The hang occurred because MPI was initialized in MPI_THREAD_SINGLE mode,
yet the recv buffer was being posted by the worker thread rather than the
comm thread. The fix was to have the comm thread post the buffer instead
of the worker thread.
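
For context, MPI_THREAD_SINGLE tells the MPI library that only the thread
which initialized MPI will make MPI calls; in the SMP build that is the comm
thread, so an MPI_Irecv posted from a worker thread falls outside that
contract and can hang. A minimal sketch of the threading rule (plain MPI,
not Charm++ code; the program is purely illustrative):

    #include <mpi.h>

    int main(int argc, char **argv) {
      int provided;
      MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
      /* With provided == MPI_THREAD_SINGLE, MPI_Isend/MPI_Irecv may only be
       * called from this initializing thread; posting the recv from any
       * other thread is undefined behavior, which is what caused the hang. */
      MPI_Finalize();
      return 0;
    }

In the diff below, the recv is therefore routed through MPIPostOneBuffer with
the new ONESIDED_BUFFER_RECV type, so that in SMP mode the comm thread ends up
calling MPI_Irecv (see the ONESIDED_BUFFER_RECV handling in SendMsgBuf).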

This commit also includes another fix related to the zerocopy API used
for nodegroups. For nodegroups, the pingpong example crashed on
mpi-win-x86_64-smp when run on 1 PE because of a stack overflow: inlining
the send method produced recursive calls, which overflowed the stack when
run with a large number of iterations (~900). The fix was to avoid
inlining in this code path.
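
A toy sketch of the failure mode (hypothetical ping/pong functions, not
Charm++ internals): when the same-node send path is inlined, each send
becomes a direct call into the peer's handler instead of a trip through the
scheduler, so stack depth grows with the iteration count. The real call
frames are presumably much larger, carrying message and zerocopy state,
which is why on the order of 900 iterations was enough.

    // Illustrative only: ping/pong stand in for the nodegroup entry methods.
    // With the send inlined, every iteration adds stack frames instead of
    // returning to the scheduler between messages.
    #include <cstdio>

    static void pong(int itersLeft);

    static void ping(int itersLeft) {
      if (itersLeft <= 0) return;
      pong(itersLeft - 1);   // inlined send: direct call, no scheduler hop
    }

    static void pong(int itersLeft) {
      if (itersLeft <= 0) return;
      ping(itersLeft - 1);   // recursion depth grows with remaining iterations
    }

    int main() {
      ping(2 * 900);         // on the order of the iteration count that crashed
      std::puts("done");
      return 0;
    }

The change to CkSendMsgNodeBranchInline in ck.C below takes the non-inline
path whenever the envelope is marked RDMA, so zerocopy nodegroup sends go
through the scheduler even when sender and receiver are on the same node.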

Change-Id: I1c86dbac2ce81832d5f728aadf769fe9e49cf9f7

src/arch/mpi/machine-onesided.c
src/arch/mpi/machine-onesided.h
src/arch/mpi/machine.C
src/ck-core/ck.C
src/ck-core/ckrdma.C

index ca2ba800f575926a57358448a6b759eabdf144dd..22c2ff865bf63c518b7b391fae75866fd51f4707 100644 (file)
@@ -8,25 +8,15 @@ void LrtsIssueRgets(void *recv, int pe){
   int i;
   CmiMPIRzvRdmaRecvList_t *recvInfo = (CmiMPIRzvRdmaRecvList_t *)recv;
   MPI_Request reqBufferRecv;
-  int srcRank = recvInfo->srcRank;
+  int srcPe = recvInfo->srcPe;
 
   for(i=0; i<recvInfo->numOps; i++){
     void *buffer = recvInfo->rdmaOp[i].buffer;
     int size = recvInfo->rdmaOp[i].size;
     int srcTag = recvInfo->rdmaOp[i].tag;
 
-    if(MPI_SUCCESS != MPI_Irecv(buffer, size, MPI_BYTE, srcRank, srcTag, charmComm, &reqBufferRecv))
-      CmiAbort("LrtsIssueRgets: MPI_Irecv failed!\n");
-    recvInfo->rdmaOp[i].req = reqBufferRecv;
+    MPIPostOneBuffer(buffer, (char *)(&(recvInfo->rdmaOp[i])), size, srcPe, srcTag, ONESIDED_BUFFER_RECV);  
   }
-
-  //Add receiver's information to the list to wait on it for completion
-  CpvAccess(RdmaRecvQueueLen)++;
-  if (CpvAccess(recvRdmaBuffers)==0)
-    CpvAccess(recvRdmaBuffers) = recvInfo;
-  else
-    CpvAccess(endRdmaBuffer)->next = recvInfo;
-  CpvAccess(endRdmaBuffer) = recvInfo;
 }
 
 // Post MPI_Isend or MPI_Irecv to send/recv the RDMA buffer
@@ -45,10 +35,10 @@ void MPISendOrRecvOneBuffer(SMSG_LIST *smsg, int tag){
   dstrank=node;
 #endif
 
-  if(smsg->type == ONESIDED_BUFFER_DIRECT_SEND || smsg->type == ONESIDED_BUFFER) {
+  if(smsg->type == ONESIDED_BUFFER_DIRECT_SEND || smsg->type == ONESIDED_BUFFER_SEND) {
     if (MPI_SUCCESS != MPI_Isend((void *)msg, size, MPI_BYTE, dstrank, tag, charmComm, &(smsg->req)))
       CmiAbort("LrtsSendBuffer: MPI_Isend failed!\n");
-  } else if(smsg->type == ONESIDED_BUFFER_DIRECT_RECV) {
+  } else if(smsg->type == ONESIDED_BUFFER_DIRECT_RECV || smsg->type == ONESIDED_BUFFER_RECV) {
     if (MPI_SUCCESS != MPI_Irecv((void *)msg, size, MPI_BYTE, dstrank, tag, charmComm, &(smsg->req)))
       CmiAbort("LrtsSendBuffer: MPI_Irecv failed!\n");
   } else {
index 87b505fb655f0592219f95c2be4b20c1c3fe2704..a5b48b4b45389491b64bb732c01f34240c9c231b 100644 (file)
@@ -6,7 +6,7 @@ typedef struct _cmi_mpi_rzv_rdma_op{
 
 typedef struct _cmi_mpi_rzv_rdma{
   int numOps;
-  int srcRank;
+  int srcPe;
   CmiMPIRzvRdmaOp_t rdmaOp[0];
 } CmiMPIRzvRdma_t;
 
@@ -15,17 +15,17 @@ typedef struct _cmi_mpi_rzv_rdma_recv_op {
   void *src_info;
   int size;
   int tag;
-  int hasCompleted;
+  int opIndex;
   MPI_Request req;
 } CmiMPIRzvRdmaRecvOp_t;
 
 //Receiver's rdma buffer information is stored as a list to wait for completion of the recv requests
 typedef struct _cmi_mpi_rzv_rdma_recv_list {
-  int srcRank;
+  int srcPe;
   int numOps;
+  int counter;
   int msgLen;
   void *msg;
-  struct _cmi_mpi_rzv_rdma_recv_list* next;
   CmiMPIRzvRdmaRecvOp_t rdmaOp[0];
 } CmiMPIRzvRdmaRecvList_t;
 
@@ -96,9 +96,9 @@ void LrtsSetRdmaRecvInfo(void *rdmaRecv, int numOps, void *msg, void *rdmaSend,
   CmiMPIRzvRdma_t *rdmaSendInfo = (CmiMPIRzvRdma_t *)rdmaSend;
 
   rdmaRecvInfo->numOps = numOps;
-  rdmaRecvInfo->srcRank = rdmaSendInfo->srcRank;
+  rdmaRecvInfo->counter = 0;
+  rdmaRecvInfo->srcPe = rdmaSendInfo->srcPe;
   rdmaRecvInfo->msg = msg;
-  rdmaRecvInfo->next = 0;
   rdmaRecvInfo->msgLen = msgSize;
 }
 
@@ -110,16 +110,14 @@ void LrtsSetRdmaRecvOpInfo(void *rdmaRecvOp, void *buffer, void *src_ref, int si
   rdmaRecvOpInfo->size = size;
   rdmaRecvOpInfo->src_info = src_ref;
 
+  rdmaRecvOpInfo->opIndex = opIndex;
   rdmaRecvOpInfo->tag = rdmaSendInfo->rdmaOp[opIndex].tag;
-  rdmaRecvOpInfo->hasCompleted = 0;
 }
 
 void LrtsSetRdmaInfo(void *dest, int destPE, int numOps){
   CmiMPIRzvRdma_t *rdma = (CmiMPIRzvRdma_t *)dest;
   rdma->numOps = numOps;
-  /* srcRank is a global variable that stores the sender rank as we cannot
-   * call MPI_Comm_rank from this thread as it is worker thread */
-  rdma->srcRank = srcRank;
+  rdma->srcPe = CmiMyPe();
 }
 
 void LrtsSetRdmaOpInfo(void *dest, const void *ptr, int size, void *ack, int destPE){
@@ -134,7 +132,7 @@ void LrtsSetRdmaOpInfo(void *dest, const void *ptr, int size, void *ack, int des
   rdmaOpInfo->tag = rdmaOp->tag;
 
   // Post the RDMA buffer with the generated tag using MPI_Isend. Post MPI_Isend directly for non-smp or through the comm thread for smp mode
-  MPIPostOneBuffer(ptr, (void *)rdmaOpInfo, size, destPE, rdmaOp->tag, ONESIDED_BUFFER);
+  MPIPostOneBuffer(ptr, (void *)rdmaOpInfo, size, destPE, rdmaOp->tag, ONESIDED_BUFFER_SEND);
 }
 
 // Structure used for the Nocopy Direct API to request an MPI rank to post a buffer
index cc8b26121c8980236cbffbf944f4bf39fc68ed98..916196268967fb3ca3bb4746eeab45403b6795d8 100644 (file)
@@ -37,7 +37,8 @@ static char* strsignal(int sig) {
 
 /* Msg types to have different actions taken for different message types
  * REGULAR                     - Regular Charm++ message
- * ONESIDED_BUFFER             - Nocopy Entry Method API buffer
+ * ONESIDED_BUFFER_SEND        - Nocopy Entry Method API Send buffer
+ * ONESIDED_BUFFER_RECV        - Nocopy Entry Method API Recv buffer
  * ONESIDED_BUFFER_DIRECT_RECV - Nocopy Direct API Recv buffer
  * ONESIDED_BUFFER_DIRECT_SEND - Nocopy Direct API Send buffer
  * POST_DIRECT_RECV            - Metadata message with Direct Recv buffer information
@@ -45,7 +46,7 @@ static char* strsignal(int sig) {
  * */
 
 #define CMI_MSGTYPE(msg)            ((CmiMsgHeaderBasic *)msg)->msgType
-enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND};
+enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER_SEND, ONESIDED_BUFFER_RECV, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND};
 
 /* =======Beginning of Definitions of Performance-Specific Macros =======*/
 /* Whether to use multiple send queue in SMP mode */
@@ -632,13 +633,26 @@ static void ReleasePostedMessages(void) {
                 prev->next = temp;
 #if CMK_ONESIDED_IMPL
             //if rdma msg, call the callback
-            if(msg_tmp->type == ONESIDED_BUFFER) {
+            if(msg_tmp->type == ONESIDED_BUFFER_SEND) {
                 CmiMPIRzvRdmaOpInfo_t *rdmaOpInfo = (CmiMPIRzvRdmaOpInfo_t *)msg_tmp->ref;
                 CmiRdmaAck *ack = (CmiRdmaAck *) rdmaOpInfo->ack;
                 ack->fnPtr(ack->token);
 
                 //free callback structure, CmiRdmaAck allocated in CmiSetRdmaAck
                 free(ack);
+            } else if(msg_tmp->type == ONESIDED_BUFFER_RECV) {
+
+                // get hold of CmiMPIRzvRdmaRecvList_t
+                CmiMPIRzvRdmaRecvOp_t *rdmaRecvOpInfo = (CmiMPIRzvRdmaRecvOp_t *)msg_tmp->ref;
+
+                CmiMPIRzvRdmaRecvList_t *rdmaRecvInfo = (CmiMPIRzvRdmaRecvList_t *)((char *)(rdmaRecvOpInfo)
+                                                        - rdmaRecvOpInfo->opIndex * LrtsGetRdmaOpRecvInfoSize()
+                                                        - LrtsGetRdmaGenRecvInfoSize());
+
+                rdmaRecvInfo->counter++;
+                if(rdmaRecvInfo->counter == rdmaRecvInfo->numOps) {
+                    handleOneRecvedMsg(rdmaRecvInfo->msgLen, (char *)rdmaRecvInfo->msg);
+                }
             } else if(msg_tmp->type == ONESIDED_BUFFER_DIRECT_RECV) {
                 // MPI_Irecv posted as a part of the Direct API was completed
                 NcpyOperationInfo *ncpyOpInfo = (NcpyOperationInfo *)(msg_tmp->ref);
@@ -707,43 +721,6 @@ static int PumpMsgs(void) {
 #endif
 
     while (1) {
-#if CMK_ONESIDED_IMPL
-        //Wait for the completion of rdma buffers (recvs posted in LrtsIssueRgets)
-        while(recvBufferTmp != 0){
-          allOpsDone=1;
-
-          for(i=0; i<recvBufferTmp->numOps; i++){
-            if(recvBufferTmp->rdmaOp[i].hasCompleted == 0){
-
-              if(MPI_Test(&(recvBufferTmp->rdmaOp[i].req), &opDone, MPI_STATUS_IGNORE) != MPI_SUCCESS)
-                CmiAbort("ReleasePostedMessages: MPI_Test for Rdma buffers failed!\n");
-              //recv has completed
-              if(opDone)
-                recvBufferTmp->rdmaOp[i].hasCompleted = 1;
-              else
-                allOpsDone = 0;
-            }
-          }
-          //all recvs for this message have completed i.e numops number of recvs
-          if(allOpsDone){
-            handleOneRecvedMsg(recvBufferTmp->msgLen, (char *)recvBufferTmp->msg);
-            //remove from the list
-            CpvAccess(RdmaRecvQueueLen)--;
-            temp = recvBufferTmp->next;
-            if(prev==0)
-              CpvAccess(recvRdmaBuffers)=temp;
-            else
-              prev->next = temp;
-
-            recvBufferTmp = temp;
-          }
-          else{
-            prev = recvBufferTmp;
-            recvBufferTmp = recvBufferTmp->next;
-          }
-        }
-        CpvAccess(endRdmaBuffer) = prev;
-#endif
         int doSyncRecv = 1;
 #if CMI_EXERT_RECV_CAP
         if (recvCnt==RECV_CAP) break;
@@ -1103,10 +1080,14 @@ static int SendMsgBuf(void) {
 #endif
 
 #if CMK_ONESIDED_IMPL
-            if(msg_tmp->type == ONESIDED_BUFFER) {
+            if(msg_tmp->type == ONESIDED_BUFFER_SEND) {
                 CmiMPIRzvRdmaOpInfo_t *rdmaOpInfo = (CmiMPIRzvRdmaOpInfo_t *)(msg_tmp->ref);
                 MPISendOrRecvOneBuffer(msg_tmp, rdmaOpInfo->tag);
             }
+            else if(msg_tmp->type == ONESIDED_BUFFER_RECV) {
+                CmiMPIRzvRdmaRecvOp_t *rdmaRecvOpInfo = (CmiMPIRzvRdmaRecvOp_t *)(msg_tmp->ref);
+                MPISendOrRecvOneBuffer(msg_tmp, rdmaRecvOpInfo->tag);
+            }
             else if(msg_tmp->type == ONESIDED_BUFFER_DIRECT_RECV || msg_tmp->type == ONESIDED_BUFFER_DIRECT_SEND) {
                 CmiMPIRzvRdmaAckInfo_t *ack = (CmiMPIRzvRdmaAckInfo_t *)(msg_tmp->ref);
                 MPISendOrRecvOneBuffer(msg_tmp, ack->tag);
index 45729bf78fd087398dba611c2ebaaacfb343fef3..0753d6708e1fbeb3a5c3349f467d7bd8a004c08c 100644 (file)
@@ -2021,7 +2021,7 @@ void CkSendMsgNodeBranchImmediate(int eIdx, void *msg, int node, CkGroupID gID)
 extern "C"
 void CkSendMsgNodeBranchInline(int eIdx, void *msg, int node, CkGroupID gID, int opts)
 {
-  if (node==CkMyNode())
+  if (node==CkMyNode() && ((envelope *)(UsrToEnv(msg)))->isRdma() == false)
   {
     CmiImmediateLock(CksvAccess(_nodeGroupTableImmLock));
     void *obj = CksvAccess(_nodeGroupTable)->find(gID).getObj();
index 37dcf6eb74794db64d5dbd06bd3e435a7c08a2f1..8636adeae32e049c76200b44a8096ade3eb320fb 100644 (file)
@@ -7,7 +7,7 @@
 #include "cmirdmautils.h"
 
 
-#if CMK_SMP && CMK_IMMEDIATE_MSG
+#if CMK_SMP
 /*readonly*/ extern CProxy_ckcallback_group _ckcallbackgroup;
 #endif
 
@@ -91,8 +91,7 @@ envelope* CkRdmaCreateMetadataMsg(envelope *env, int pe){
 void CkHandleRdmaCookie(void *cookie){
   CkRdmaWrapper *w = (CkRdmaWrapper *)cookie;
   CkCallback *cb= w->callback;
-
-#if CMK_SMP && CMK_IMMEDIATE_MSG
+#if CMK_SMP
   //call to callbackgroup to call the callback when calling from comm thread
   //this add one more trip through the scheduler
   _ckcallbackgroup[w->srcPe].call(*cb, sizeof(void *), (char*)&w->ptr);
@@ -189,16 +188,16 @@ void CkRdmaIssueRgets(envelope *env){
   //Receiver's machine specific info is at an offset, after the sender md and the receiver's buffer
   char *recv_md = ((char *)copyenv) + CK_ALIGN(msgsize, 16) + bufsize;
 
-  CkUnpackMessage(&copyenv);
-  CkUpdateRdmaPtrs(copyenv, msgsize, recv_md, ((char *)env) + msgsize);
-  CkPackRdmaPtrs(((CkMarshallMsg *)EnvToUsr(copyenv))->msgBuf);
-  CkPackMessage(&copyenv);
-
   /* Set the total size of the message excluding the receiver's machine specific info
    * which is not required when the receiver's entry method executes
    */
   copyenv->setTotalsize(CK_ALIGN(msgsize, 16) + bufsize);
 
+  CkUnpackMessage(&copyenv);
+  CkUpdateRdmaPtrs(copyenv, msgsize, recv_md, ((char *)env) + msgsize);
+  CkPackRdmaPtrs(((CkMarshallMsg *)EnvToUsr(copyenv))->msgBuf);
+  CkPackMessage(&copyenv);
+
   // Set rdma to be false to prevent message handler on the receiver
   // from intercepting it
   copyenv->setRdma(false);