ZC OFI API: Replace fi_write with fi_writemsg with FI_DELIVERY_COMPLETE 97/4597/5
author Nitin Bhat <nbhat4@illinois.edu>
Fri, 21 Sep 2018 18:37:55 +0000 (14:37 -0400)
committer Nitin Bhat <nbhat4@illinois.edu>
Tue, 25 Sep 2018 14:38:57 +0000 (09:38 -0500)
Previously, fi_write could complete as soon as the source was able to
reuse its buffer. With this change, fi_writemsg is posted with the
FI_DELIVERY_COMPLETE flag, so it completes only once the destination buffer
has received the data. This change is required to solve a rare race
condition that occurs in the UNREG mode of operation, where a Put operation
is performed instead of a Get operation. In that race, the source sends the
destination a message that may cause it to de-register the destination
buffer while the completion of the write operation on the destination is
still uncertain, i.e., the data could still be in flight. This patch fixes
that case because completion on the source now occurs only after the
destination has received the data through the RDMA write operation.

Change-Id: I808f1d5bc9dda3d92859e9775531d0b5c47a1c8e

src/arch/ofi/machine-onesided.c

index 8a800f689198df40f66fafc3cce4ced57e69995e..8fdf9c8f501edde02650fbaa8cc370d531e631f6 100644 (file)
@@ -137,14 +137,30 @@ void ofi_post_nocopy_operation(
                         &rma_req->context));
     } else if(operation == OFI_WRITE_OP) {
       // Perform an RDMA write or put operation
-      OFI_RETRY(fi_write(context.ep,
-                        lbuf,
-                        chunk_size,
-                        (lmr) ? fi_mr_desc(lmr) : NULL,
-                        remoteNodeNo,
-                        (uint64_t)rbuf,
-                        rkey,
-                        &rma_req->context));
+      struct iovec l_iovec{};
+      l_iovec.iov_base = (void*)lbuf;
+      l_iovec.iov_len = chunk_size;
+
+      struct fi_rma_iov rma_iov{};
+      rma_iov.addr = (uint64_t)rbuf;
+      rma_iov.len = chunk_size;
+      rma_iov.key = rkey;
+
+      void *desc = (lmr ? fi_mr_desc(lmr) : NULL);
+
+      struct fi_msg_rma msg{};
+      msg.msg_iov = &l_iovec;
+      msg.desc = &desc;
+      msg.iov_count = 1;
+      msg.addr = (fi_addr_t)remoteNodeNo;
+      msg.rma_iov = &rma_iov;
+      msg.rma_iov_count = 1;
+      msg.context = &rma_req->context;
+      msg.data = 0;
+
+      OFI_RETRY(fi_writemsg(context.ep,
+                           &msg,
+                           FI_DELIVERY_COMPLETE));
     } else {
       CmiAbort("ofi_post_nocopy_operation: Invalid RDMA operation\n");
     }