179c2295e4e77d1a6fe719933001403ae346a01b
[charm.git] / src / arch / bluegenep / machine.c
1
2 #include <stdio.h>
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <unistd.h>
6 #include <math.h>
7 #include <string.h>
8 #include "machine.h"
9 #include "converse.h"
10 #include "pcqueue.h"
11 #include "assert.h"
12 #include "malloc.h"
13
14 #include <bpcore/ppc450_inlines.h>
15
16 #include "dcmf.h"
17 #include "dcmf_multisend.h"
18
/**
 * Round a pointer up to the next 16-byte boundary.
 *
 * @param p  Address to align; returned unchanged when it is already aligned.
 * @return   The smallest 16-byte-aligned address that is >= p.
 *
 * The mask is derived from an unsigned long so it is exactly as wide as the
 * integer the pointer is converted to. A 32-bit literal such as 0xfffffff0
 * would zero the upper half of the address wherever unsigned long is 64 bits.
 */
char *ALIGN_16(char *p) {
    const unsigned long low_bits = 0xfUL;
    return (char *)(((unsigned long)p + low_bits) & ~low_bits);
}
22
23 #define PROGRESS_PERIOD 1024
24
25 //There are two roles of comm thread:
26 // 1. polling other cores' bcast msg queue
27 // 2. bcast msg is handled only by comm thd
28 #define BCASTMSG_ONLY_TO_COMMTHD 0
29
30 CpvDeclare(PCQueue, broadcast_q);                 //queue to send broadcast messages
31 #if CMK_NODE_QUEUE_AVAILABLE
32 CsvDeclare(PCQueue, node_bcastq);
33 CsvDeclare(CmiNodeLock, node_bcastLock);
34 #endif
35
36 /*
37     To reduce the buffer used in broadcast and distribute the load from
38   the broadcasting node, define CMK_BROADCAST_SPANNING_TREE to enforce the
39   use of the spanning tree broadcast algorithm.
40     This will use the fourth short in the message as an indicator of the
41   spanning tree root.
42 */
43 #if CMK_SMP
44 #define CMK_BROADCAST_SPANNING_TREE   1
45 #else
46 #define CMK_BROADCAST_SPANNING_TREE    1
47 #define CMK_BROADCAST_HYPERCUBE        0
48 #endif /* CMK_SMP */
49
50 #define BROADCAST_SPANNING_FACTOR     2
51
/*
 * Accessors for fields of the CmiMsgHeaderBasic that starts every message.
 *
 * The root field encodes the kind of message:
 *   root == 0 : ordinary point-to-point message
 *   root  > 0 : broadcast across all processors (cores)
 *   root  < 0 : broadcast across all nodes
 *
 * Every macro parenthesizes its argument and its whole expansion, so an
 * expression argument such as (buf + offset) is converted as a unit and the
 * result can be embedded in a larger expression. The field accessors still
 * expand to lvalues and may be assigned to.
 */
#define CMI_BROADCAST_ROOT(msg)          (((CmiMsgHeaderBasic *)(msg))->root)
#define CMI_IS_BCAST_ON_CORES(msg)       (CMI_BROADCAST_ROOT(msg) > 0)
#define CMI_IS_BCAST_ON_NODES(msg)       (CMI_BROADCAST_ROOT(msg) < 0)
/* The cycle shares storage with the root field. */
#define CMI_GET_CYCLE(msg)               (((CmiMsgHeaderBasic *)(msg))->root)

#define CMI_DEST_RANK(msg)               (((CmiMsgHeaderBasic *)(msg))->rank)
#define CMI_MAGIC(msg)                   (((CmiMsgHeaderBasic *)(msg))->magic)
63
64 /* FIXME: need a random number that everyone agrees ! */
65 #define CHARM_MAGIC_NUMBER               126
66
#if CMK_ERROR_CHECKING
/* Set to 1 by the +checksum command-line flag; gates both macros below. */
static int checksum_flag = 0;
extern unsigned char computeCheckSum(unsigned char *data, int len);

/*
 * Both macros expand to a self-contained compound statement. That keeps a
 * following "else" from attaching to the macro's own "if", and they compile
 * whether or not the invocation is followed by a semicolon.
 */

/* Store the checksum of the first len bytes of msg in its header's cksum
 * field; the field is zeroed first so it does not contribute to the sum. */
#define CMI_SET_CHECKSUM(msg, len)      \
        {   \
          if (checksum_flag)  {   \
            ((CmiMsgHeaderBasic *)(msg))->cksum = 0;        \
            ((CmiMsgHeaderBasic *)(msg))->cksum = computeCheckSum((unsigned char*)(msg), (len));        \
          }   \
        }

/* Verify the checksum over the first len bytes of msg. On mismatch, dump the
 * message bytes in hex and abort. The loop index is local to the expansion,
 * and each byte is printed as an unsigned char so values >= 0x80 are not
 * sign-extended. */
#define CMI_CHECK_CHECKSUM(msg, len)    \
        {   \
          if (checksum_flag      \
              && computeCheckSum((unsigned char*)(msg), (len)) != 0)  { \
            int cmi_cksum_idx_; \
            printf("\n\n------------------------------\n\nReceiver %d size %d:", CmiMyPe(), (len)); \
            for(cmi_cksum_idx_ = 0; cmi_cksum_idx_ < (len); cmi_cksum_idx_++) { \
                printf("%2x", (unsigned)((unsigned char *)(msg))[cmi_cksum_idx_]);                 \
            }                                             \
            printf("------------------------------\n\n"); \
            CmiAbort("Fatal error: checksum doesn't agree!\n"); \
          } \
        }
#else
/* Checksumming is compiled out: both macros expand to nothing. */
#define CMI_SET_CHECKSUM(msg, len)
#define CMI_CHECK_CHECKSUM(msg, len)
#endif
91
92 #define CMI_SET_BROADCAST_ROOT(msg, root)  CMI_BROADCAST_ROOT(msg) = (root);
93
94 #if CMK_BROADCAST_HYPERCUBE
95 #  define CMI_SET_CYCLE(msg, cycle)  CMI_GET_CYCLE(msg) = (cycle);
96 #else
97 #  define CMI_SET_CYCLE(msg, cycle)
98 #endif
99
100 int               _Cmi_numpes;
101 int               _Cmi_mynode;    /* Which address space am I */
102 int               _Cmi_mynodesize;/* Number of processors in my address space */
103 int               _Cmi_numnodes;  /* Total number of address spaces */
104 int                Cmi_nodestart; /* First processor in this address space */
105 CpvDeclare(void*, CmiLocalQueue);
106
107
108 #if CMK_NODE_QUEUE_AVAILABLE
109 #define SMP_NODEMESSAGE   (0xFB) // rank of the node message when node queue
110 // is available
111 #define NODE_BROADCAST_OTHERS (-1)
112 #define NODE_BROADCAST_ALL    (-2)
113 #endif
114
115
116 typedef struct ProcState {
117     /* PCQueue      sendMsgBuf; */      /* per processor message sending queue */
118     CmiNodeLock  recvLock;              /* for cs->recv */
119     CmiNodeLock bcastLock;
120 } ProcState;
121
122 static ProcState  *procState;
123
124 #if CMK_SMP && !CMK_MULTICORE
125 static volatile int commThdExit = 0;
126 static CmiNodeLock commThdExitLock = 0;
127 #endif
128
129 void ConverseRunPE(int everReturn);
130 static void CommunicationServer(int sleepTime);
131 static void CommunicationServerThread(int sleepTime);
132
133 //So far we don't define any comm threads
134 int Cmi_commthread = 0;
135
136 #include "machine-smp.c"
137 CsvDeclare(CmiNodeState, NodeState);
138 #include "immediate.c"
139
140 void AdvanceCommunications();
141
142
143 #if !CMK_SMP
144 /************ non SMP **************/
145 static struct CmiStateStruct Cmi_state;
146 int _Cmi_mype;
147 int _Cmi_myrank;
148
/* Memory lock hooks for the non-SMP build: both are intentionally empty. */
void CmiMemLock(void)
{
}

void CmiMemUnlock(void)
{
}
151
152 #define CmiGetState() (&Cmi_state)
153 #define CmiGetStateN(n) (&Cmi_state)
154
155 //void CmiYield(void) { sleep(0); }
156
157 static void CmiStartThreads(char **argv) {
158     CmiStateInit(Cmi_nodestart, 0, &Cmi_state);
159     _Cmi_mype = Cmi_nodestart;
160     _Cmi_myrank = 0;
161 }
162 #endif  /* !CMK_SMP */
163
164 //int received_immediate;
165 //int received_broadcast;
166
167 /*Add a message to this processor's receive queue, pe is a rank */
168 void CmiPushPE(int pe,void *msg) {
169     CmiState cs = CmiGetStateN(pe);
170     MACHSTATE2(3,"Pushing message into rank %d's queue %p{",pe, cs->recv);
171 #if CMK_IMMEDIATE_MSG
172     if (CmiIsImmediate(msg)) {
173         /**(CmiUInt2 *)msg = pe;*/
174         //received_immediate = 1;
175         //printf("PushPE: N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
176         //CMI_DEST_RANK(msg) = pe;
177         CmiPushImmediateMsg(msg);
178         return;
179     }
180 #endif
181 #if CMK_SMP
182     CmiLock(procState[pe].recvLock);
183 #endif
184
185     PCQueuePush(cs->recv,(char *)msg);
186     //printf("%d: PCQueue length = %d, msg = %x\n", CmiMyPe(), PCQueueLength(cs->recv), msg);
187
188 #if CMK_SMP
189     CmiUnlock(procState[pe].recvLock);
190 #endif
191     CmiIdleLock_addMessage(&cs->idle);
192     MACHSTATE1(3,"} Pushing message into rank %d's queue done",pe);
193 }
194
195 #if CMK_NODE_QUEUE_AVAILABLE
196 /*Add a message to this processor's receive queue */
197 static void CmiPushNode(void *msg) {
198     MACHSTATE(3,"Pushing message into NodeRecv queue");
199 #if CMK_IMMEDIATE_MSG
200     if (CmiIsImmediate(msg)) {
201         //printf("PushNode: N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
202         //CMI_DEST_RANK(msg) = 0;
203         CmiPushImmediateMsg(msg);
204         return;
205     }
206 #endif
207     CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
208     PCQueuePush(CsvAccess(NodeState).NodeRecv,msg);
209     CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
210     {
211         CmiState cs=CmiGetStateN(0);
212         CmiIdleLock_addMessage(&cs->idle);
213     }
214 }
215 #endif /* CMK_NODE_QUEUE_AVAILABLE */
216
/* Decremented by the send-completion callbacks (send_done, send_multi_done). */
volatile int msgQueueLen;
/* Incremented by the receive callbacks when a message arrives; decremented at the end of recv_done. */
volatile int outstanding_recvs;
219
220 static int Cmi_dim;     /* hypercube dim of network */
221
222 static char     **Cmi_argv;
223 static char     **Cmi_argvcopy;
224 static CmiStartFn Cmi_startfn;   /* The start function */
225 static int        Cmi_usrsched;  /* Continue after start function finishes? */
226
227 extern void ConverseCommonInit(char **argv);
228 extern void ConverseCommonExit(void);
229 extern void CthInit(char **argv);
230
231 static void SendMsgsUntil(int);
232
233
234 void SendSpanningChildren(int size, char *msg);
235 #if CMK_NODE_QUEUE_AVAILABLE
236 void SendSpanningChildrenNode(int size, char *msg);
237 #endif
238 void SendHypercube(int size, char *msg);
239
240 DCMF_Protocol_t  cmi_dcmf_short_registration __attribute__((__aligned__(16)));
241 DCMF_Protocol_t  cmi_dcmf_eager_registration __attribute__((__aligned__(16)));
242 DCMF_Protocol_t  cmi_dcmf_rzv_registration   __attribute__((__aligned__(16)));
243 DCMF_Protocol_t  cmi_dcmf_multicast_registration   __attribute__((__aligned__(16)));
244
245
246 #define BGP_USE_AM_DIRECT 1
247 //#define BGP_USE_RDMA_DIRECT 1
248 //#define CMI_DIRECT_DEBUG 1
249 #ifdef BGP_USE_AM_DIRECT
250
251
252 DCMF_Protocol_t  cmi_dcmf_direct_registration __attribute__((__aligned__(16)));
253 /** The receive side of a put implemented in DCMF_Send */
254
255
256 typedef struct {
257     void *recverBuf;
258   void (*callbackFnPtr)(void *);
259     void *callbackData;
260     DCMF_Request_t *DCMF_rq_t;
261 } dcmfDirectMsgHeader;
262
/*
 * Completion callback for direct sends. Nothing needs to happen here; the
 * body only emits a trace line when CMI_DIRECT_DEBUG is enabled.
 */
#if (DCMF_VERSION_MAJOR >= 2)
void direct_send_done_cb(void *nothing, DCMF_Error_t *err)
#else
void direct_send_done_cb(void *nothing)
#endif
{
#if CMI_DIRECT_DEBUG
    CmiPrintf("[%d] RDMA send_done_cb\n", CmiMyPe());
#endif
}
274
275 DCMF_Callback_t  directcb;
276
277 void     direct_short_pkt_recv (void             * clientdata,
278                                 const DCQuad     * info,
279                                 unsigned           count,
280                                 unsigned           senderrank,
281                                 const char       * buffer,
282                                 const unsigned     sndlen) {
283 #if CMI_DIRECT_DEBUG
284     CmiPrintf("[%d] RDMA direct_short_pkt_recv\n", CmiMyPe());
285 #endif
286     dcmfDirectMsgHeader *msgHead=  (dcmfDirectMsgHeader *) info;
287     CmiMemcpy(msgHead->recverBuf, buffer, sndlen);
288     (*(msgHead->callbackFnPtr))(msgHead->callbackData);
289 }
290
291
292 #if (DCMF_VERSION_MAJOR >= 2)
293 typedef void (*cbhdlr) (void *, DCMF_Error_t *);
294 #else
295 typedef void (*cbhdlr) (void *);
296 #endif
297
298 DCMF_Request_t * direct_first_pkt_recv_done (void              * clientdata,
299         const DCQuad      * info,
300         unsigned            count,
301         unsigned            senderrank,
302         const unsigned      sndlen,
303         unsigned          * rcvlen,
304         char             ** buffer,
305         DCMF_Callback_t   * cb
306                                             ) {
307 #if CMI_DIRECT_DEBUG
308     CmiPrintf("[%d] RDMA direct_first_pkt_recv_done\n", CmiMyPe());
309 #endif
310     /* pull the data we need out of the header */
311     *rcvlen=sndlen;
312     dcmfDirectMsgHeader *msgHead=  (dcmfDirectMsgHeader *) info;
313     cb->function= (cbhdlr)msgHead->callbackFnPtr;
314     cb->clientdata=msgHead->callbackData;
315     *buffer=msgHead->recverBuf;
316     return msgHead->DCMF_rq_t;
317 }
318
319
320 #endif
321
322 #ifdef BGP_USE_RDMA_DIRECT
323 static struct DCMF_Callback_t dcmf_rdma_cb_ack;
324
325
326 DCMF_Protocol_t  cmi_dcmf_direct_put_registration __attribute__((__aligned__(16)));
327
328 DCMF_Protocol_t  cmi_dcmf_direct_get_registration __attribute__((__aligned__(16)));
329
330 DCMF_Protocol_t  cmi_dcmf_direct_rdma_registration __attribute__((__aligned__(16)));
331 /** The receive side of a DCMF_Put notification implemented in DCMF_Send */
332
/*
 * Header of the RDMA notification message. The short-message handler casts
 * the DCQuad info pointer to this type and fires the callback.
 */
typedef struct {
    void (*callbackFnPtr)(void *);   /* invoked on the receiver with callbackData */
    void *callbackData;              /* argument handed to callbackFnPtr */
} dcmfDirectRDMAMsgHeader;
337
338
339
/*
 * Completion callback for the RDMA notification send. Nothing needs to
 * happen here; the body only emits a trace line when CMI_DIRECT_DEBUG is
 * enabled.
 *
 * The trace format used to contain a second "%d" ("result %d") with no
 * matching argument, which is undefined behaviour. No result value is
 * available in this callback, so the extra conversion was removed.
 */
#if (DCMF_VERSION_MAJOR >= 2)
void direct_send_rdma_done_cb(void*nothing, DCMF_Error_t *err) 
#else 
  void direct_send_rdma_done_cb(void*nothing) 
#endif
{
#if CMI_DIRECT_DEBUG
  CmiPrintf("[%d] RDMA send_rdma_done_cb\n", CmiMyPe());
#endif
}
352
353 DCMF_Callback_t  directcb;
354
355 void     direct_short_rdma_pkt_recv (void             * clientdata,
356                                 const DCQuad     * info,
357                                 unsigned           count,
358                                 unsigned           senderrank,
359                                 const char       * buffer,
360                                 const unsigned     sndlen) {
361 #if CMI_DIRECT_DEBUG
362     CmiPrintf("[%d] RDMA direct_short_rdma_pkt_recv\n", CmiMyPe());
363 #endif
364     dcmfDirectRDMAMsgHeader *msgHead=  (dcmfDirectRDMAMsgHeader *) info;
365     (*(msgHead->callbackFnPtr))(msgHead->callbackData);
366 }
367
368
369 #if (DCMF_VERSION_MAJOR >= 2)
370 typedef void (*cbhdlr) (void *, DCMF_Error_t *);
371 #else
372 typedef void (*cbhdlr) (void *);
373 #endif
374
375 DCMF_Request_t * direct_first_rdma_pkt_recv_done (void              * clientdata,
376         const DCQuad      * info,
377         unsigned            count,
378         unsigned            senderrank,
379         const unsigned      sndlen,
380         unsigned          * rcvlen,
381         char             ** buffer,
382         DCMF_Callback_t   * cb
383                                             ) {
384     CmiAbort("direct_first_rdma_pkt_recv should not be called");
385 }
386
387
388 #endif
389
390
391 typedef struct msg_list {
392     char              * msg;
393     int                 size;
394     int                 destpe;
395     int               * pelist;
396     DCMF_Callback_t     cb;
397     DCQuad              info __attribute__((__aligned__(16)));
398     DCMF_Request_t      send __attribute__((__aligned__(16)));
399 } SMSG_LIST __attribute__((__aligned__(16)));
400
401 #define MAX_NUM_SMSGS   64
402 CpvDeclare(PCQueue, smsg_list_q);
403
404 static inline SMSG_LIST * smsg_allocate() {
405     SMSG_LIST *smsg = (SMSG_LIST *)PCQueuePop(CpvAccess(smsg_list_q));
406     if (smsg != NULL)
407         return smsg;
408
409     void * buf = malloc(sizeof(SMSG_LIST)); 
410     assert(buf!=NULL);
411     assert (((unsigned)buf & 0x0f) == 0);
412
413     return (SMSG_LIST *) buf;
414 }
415
416 static inline void smsg_free (SMSG_LIST *smsg) {
417     int size = PCQueueLength (CpvAccess(smsg_list_q));
418     if (size < MAX_NUM_SMSGS)
419         PCQueuePush (CpvAccess(smsg_list_q), (char *) smsg);
420     else
421         free (smsg);
422 }
423
424 typedef struct {
425     int sleepMs; /*Milliseconds to sleep while idle*/
426     int nIdles; /*Number of times we've been idle in a row*/
427     CmiState cs; /*Machine state*/
428 } CmiIdleState;
429
430 static CmiIdleState *CmiNotifyGetState(void) {
431     CmiIdleState *s=(CmiIdleState *)CmiAlloc(sizeof(CmiIdleState));
432     s->sleepMs=0;
433     s->nIdles=0;
434     s->cs=CmiGetState();
435     return s;
436 }
437
438
439 #if (DCMF_VERSION_MAJOR >= 2)
440 static void send_done(void *data, DCMF_Error_t *err) 
441 #else 
442 static void send_done(void *data) 
443 #endif
444 /* send done callback: sets the smsg entry to done */
445 {
446     SMSG_LIST *msg_tmp = (SMSG_LIST *)(data);
447     CmiFree(msg_tmp->msg);
448     //free(data);
449     smsg_free (msg_tmp);
450
451     msgQueueLen--;
452 }
453
454 #if (DCMF_VERSION_MAJOR >= 2)
455 static void send_multi_done(void *data, DCMF_Error_t *err) 
456 #else 
457 static void send_multi_done(void *data) 
458 #endif
459 /* send done callback: sets the smsg entry to done */
460 {
461     SMSG_LIST *msg_tmp = (SMSG_LIST *)(data);
462     CmiFree(msg_tmp->msg);
463     free(msg_tmp->pelist);
464
465     smsg_free(msg_tmp);
466
467     msgQueueLen--;
468 }
469
470
/*
 * recv_done: callback run once a complete message is in memory.
 *
 *   clientdata - start of the received message buffer; it is reinterpreted
 *                as a CmiMsgHeaderBasic to read size, magic, root and rank.
 *   err        - (DCMF_VERSION_MAJOR >= 2 only) not referenced in this body.
 *
 * Steps, in order: optional checksum verification, magic-number check,
 * queueing of a private copy for broadcast forwarding when the root field
 * marks the message as a broadcast, delivery of the original buffer to the
 * node queue or to the destination rank's queue, and finally a decrement of
 * outstanding_recvs.
 */
471 #if (DCMF_VERSION_MAJOR >= 2)
472 static void recv_done(void *clientdata, DCMF_Error_t * err) 
473 #else 
474 static void recv_done(void *clientdata) 
475 #endif
476 /* recv done callback: push the recved msg to recv queue */
477 {
478
479     char *msg = (char *) clientdata;
480     int sndlen = ((CmiMsgHeaderBasic *) msg)->size;
481
482     //fprintf (stderr, "%d Recv message done \n", CmiMyPe());
483
484     /* then we do what PumpMsgs used to do:
485      * push msg to recv queue */
    /* count is the loop index that the CMI_CHECK_CHECKSUM expansion refers to by name */
486     int count=0;
487     CMI_CHECK_CHECKSUM(msg, sndlen);
488     if (CMI_MAGIC(msg) != CHARM_MAGIC_NUMBER) { /* received a non-charm msg */
489         CmiAbort("Charm++ Warning: Non Charm++ Message Received. \n");
        /* NOTE(review): presumably unreachable, assuming CmiAbort does not return */
490         return;
491     }
492
493 #if CMK_BROADCAST_SPANNING_TREE | CMK_BROADCAST_HYPERCUBE
494     if (CMI_IS_BCAST_ON_CORES(msg) ) {
495         int pe = CMI_DEST_RANK(msg);
496
497         //printf ("%d: Receiving bcast message from %d with %d bytes for %d\n", CmiMyPe(), CMI_BROADCAST_ROOT(msg), sndlen, pe);
498
        /* A copy is queued on broadcast_q for forwarding; the original buffer is still delivered below. */
499         char *copymsg;
500         copymsg = (char *)CmiAlloc(sndlen);
501         CmiMemcpy(copymsg,msg,sndlen);
502
503         //received_broadcast = 1;
504 #if CMK_SMP
505         CmiLock(procState[pe].bcastLock);
506         PCQueuePush(CpvAccessOther(broadcast_q, pe), copymsg);
507         CmiUnlock(procState[pe].bcastLock);
508 #else
509         PCQueuePush(CpvAccess(broadcast_q), copymsg);
510 #endif
511     }
512 #endif
513
514 #if CMK_NODE_QUEUE_AVAILABLE
515 #if CMK_BROADCAST_SPANNING_TREE
    /* Node-level broadcast: queue a copy on node_bcastq, guarded by node_bcastLock. */
516     if (CMI_IS_BCAST_ON_NODES(msg)) {
517         //printf ("%d: Receiving node bcast message from %d with %d bytes for %d\n", CmiMyPe(), CMI_BROADCAST_ROOT(msg), sndlen, CMI_DEST_RANK(msg));
518         char *copymsg = (char *)CmiAlloc(sndlen);
519         CmiMemcpy(copymsg,msg,sndlen);
520         //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
521         CmiLock(CsvAccess(node_bcastLock));
522         PCQueuePush(CsvAccess(node_bcastq), copymsg);
523         CmiUnlock(CsvAccess(node_bcastLock));
524         //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
525     }
526 #endif
    /* The "else" below governs whichever statement the following #if/#else selects. */
527     if (CMI_DEST_RANK(msg) == SMP_NODEMESSAGE)
528         CmiPushNode(msg);
529     else
530 #endif
531
532 #if CMK_SMP && !CMK_MULTICORE && BCASTMSG_ONLY_TO_COMMTHD
        /* In this configuration a message whose rank is not below _Cmi_mynodesize is not pushed to any queue. */
533         if (CMI_DEST_RANK(msg)<_Cmi_mynodesize) { //not the comm thd
534             CmiPushPE(CMI_DEST_RANK(msg), (void *)msg);
535         }
536 #else
537         CmiPushPE(CMI_DEST_RANK(msg), (void *)msg);
538 #endif
539
    /* Matches the increment done by the receive callback that started this receive. */
540     outstanding_recvs --;
541 }
542
543
544 void     short_pkt_recv (void             * clientdata,
545                          const DCQuad     * info,
546                          unsigned           count,
547                          unsigned           senderrank,
548                          const char       * buffer,
549                          const unsigned     sndlen) {
550     outstanding_recvs ++;
551     int alloc_size = sndlen;
552
553     char * new_buffer = (char *)CmiAlloc(alloc_size);
554     CmiMemcpy (new_buffer, buffer, sndlen);
555     
556 #if (DCMF_VERSION_MAJOR >= 2)
557     recv_done (new_buffer, NULL);
558 #else
559     recv_done (new_buffer);
560 #endif
561 }
562
563
564 DCMF_Request_t * first_multi_pkt_recv_done (const DCQuad      * info,
565                                             unsigned            count,
566                                             unsigned            senderrank,                                    
567                                             const unsigned      sndlen,
568                                             unsigned            connid,
569                                             void              * clientdata,
570                                             unsigned          * rcvlen,
571                                             char             ** buffer,
572                                             unsigned          * pw,
573                                             DCMF_Callback_t   * cb
574                                             ) {
575     outstanding_recvs ++;
576     int alloc_size = sndlen + sizeof(DCMF_Request_t) + 16;
577
578     //printf ("%d: Receiving message %d bytes from %d\n", CmiMyPe(), sndlen, senderrank);
579
580     /* printf ("Receiving %d bytes\n", sndlen); */
581     *rcvlen = sndlen;  /* to avoid malloc(0) which might
582                                    return NULL */
583
584     *buffer = (char *)CmiAlloc(alloc_size);
585     cb->function = recv_done;
586     cb->clientdata = *buffer;
587
588     *pw  = 0x7fffffff;
589     return (DCMF_Request_t *) ALIGN_16(*buffer + sndlen);
590 }
591
592
593 DCMF_Request_t * first_pkt_recv_done (void              * clientdata,
594                                       const DCQuad      * info,
595                                       unsigned            count,
596                                       unsigned            senderrank,
597                                       const unsigned      sndlen,
598                                       unsigned          * rcvlen,
599                                       char             ** buffer,
600                                       DCMF_Callback_t   * cb
601                                      ) {
602     outstanding_recvs ++;
603     int alloc_size = sndlen + sizeof(DCMF_Request_t) + 16;
604
605     //printf ("%d: Receiving message %d bytes from %d\n", CmiMyPe(), sndlen, senderrank);
606
607     /* printf ("Receiving %d bytes\n", sndlen); */
608     *rcvlen = sndlen;  /* to avoid malloc(0) which might
609                                    return NULL */
610
611     *buffer = (char *)CmiAlloc(alloc_size);
612     cb->function = recv_done;
613     cb->clientdata = *buffer;
614
615     return (DCMF_Request_t *) ALIGN_16(*buffer + sndlen);
616 }
617
618
619 #if CMK_NODE_QUEUE_AVAILABLE
620 void sendBroadcastMessagesNode() {
621     if (PCQueueLength(CsvAccess(node_bcastq))==0) return;
622     //node broadcast message could be always handled by any cores (including
623     //comm thd) on this node
624     //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
625     CmiLock(CsvAccess(node_bcastLock));
626     char *msg = PCQueuePop(CsvAccess(node_bcastq));
627     CmiUnlock(CsvAccess(node_bcastLock));
628     //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
629     while (msg) {
630 #if CMK_BROADCAST_SPANNING_TREE
631         //printf("sendBroadcastMessagesNode: node %d rank %d with msg root %d\n", CmiMyNode(), CmiMyRank(), CMI_BROADCAST_ROOT(msg));
632         SendSpanningChildrenNode(((CmiMsgHeaderBasic *) msg)->size, msg);
633 #endif
634         CmiFree(msg);
635         //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
636         CmiLock(CsvAccess(node_bcastLock));
637         msg = PCQueuePop(CsvAccess(node_bcastq));
638         CmiUnlock(CsvAccess(node_bcastLock));
639         //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
640     }
641 }
642 #endif
643
/*
 * sendBroadcastMessages: forward the broadcast messages queued for a rank.
 *
 * The queue drained is broadcast_q of toPullRank: the caller's own rank,
 * or rank 0 when the caller's rank equals _Cmi_mynodesize (the comm thread,
 * per the comments below). The function returns early when that queue
 * reports length zero; otherwise it pops messages until the queue yields
 * NULL, forwarding each with SendSpanningChildren or SendHypercube and then
 * releasing it with CmiFree. In SMP builds every pop is done while holding
 * the rank's bcastLock; the initial length check is done without the lock.
 */
644 void sendBroadcastMessages() {
645 #if !CMK_MULTICORE && !BCASTMSG_ONLY_TO_COMMTHD
646 //in the presence of comm thd, and it is not responsible for broadcasting msg,
647 //the comm thd will help to pull the msg from rank 0 which is predefined as the
648 //core on a smp node to receive bcast msg.
649     int toPullRank = CmiMyRank();
650     if (CmiMyRank()==_Cmi_mynodesize) toPullRank = 0; //comm thd only pulls msg from rank 0
651 #else
652     int toPullRank = CmiMyRank();
653 #endif
654     PCQueue toPullQ;
655
656     /*
657     if(CmiMyRank()==_Cmi_mynodesize)
658       printf("Comm thd on node [%d] is pulling bcast msg\n", CmiMyNode());
659     else
660       printf("Work thd [%d] on node [%d] is pulling bcast msg\n", CmiMyRank(), CmiMyNode());
661     */
662 #if !CMK_MULTICORE && !BCASTMSG_ONLY_TO_COMMTHD
663     toPullQ = CpvAccessOther(broadcast_q, toPullRank);
664 #else
665     toPullQ = CpvAccess(broadcast_q);
666 #endif
667
    /* Unlocked emptiness check: skip the lock entirely when nothing is queued. */
668     if (PCQueueLength(toPullQ)==0) return;
669 #if CMK_SMP
670     CmiLock(procState[toPullRank].bcastLock);
671 #endif
672
673     char *msg = (char *) PCQueuePop(toPullQ);
674
675 #if CMK_SMP
676     CmiUnlock(procState[toPullRank].bcastLock);
677 #endif
678
    /* The lock is not held while forwarding; it is re-taken only for the next pop. */
679     while (msg) {
680
681 #if CMK_BROADCAST_SPANNING_TREE
682         SendSpanningChildren(((CmiMsgHeaderBasic *) msg)->size, msg);
683 #elif CMK_BROADCAST_HYPERCUBE
684         SendHypercube(((CmiMsgHeaderBasic *) msg)->size, msg);
685 #endif
686
687         CmiFree (msg);
688
689 #if CMK_SMP
690         CmiLock(procState[toPullRank].bcastLock);
691 #endif
692
        /* toPullQ is looked up again with the same expression used before the loop. */
693 #if !CMK_MULTICORE && !BCASTMSG_ONLY_TO_COMMTHD
694         toPullQ = CpvAccessOther(broadcast_q, toPullRank);
695 #else
696         toPullQ = CpvAccess(broadcast_q);
697 #endif
698         msg = (char *) PCQueuePop(toPullQ);
699
700 #if CMK_SMP
701         CmiUnlock(procState[toPullRank].bcastLock);
702 #endif
703     }
704 }
705
706 CpvDeclare(unsigned, networkProgressCount);
707 int  networkProgressPeriod;
708
709 #if 0
710 unsigned int *ranklist;
711
712 BGTsC_t        barrier;
713
714 // -----------------------------------------
715 // Rectangular broadcast implementation
716 // -----------------------------------------
717
718 #define MAX_COMM  256
719 static void * comm_table [MAX_COMM];
720
721 typedef struct rectbcast_msg {
722     BGTsRC_t           request;
723     DCMF_Callback_t    cb;
724     char              *msg;
725 } RectBcastInfo;
726
727
728 static void bcast_done (void *data) {
729     RectBcastInfo *rinfo = (RectBcastInfo *) data;
730     CmiFree (rinfo->msg);
731     free (rinfo);
732 }
733
734 static  void *   getRectBcastRequest (unsigned comm) {
735     return comm_table [comm];
736 }
737
738
739 static  void *  bcast_recv     (unsigned               root,
740                                 unsigned               comm,
741                                 const unsigned         sndlen,
742                                 unsigned             * rcvlen,
743                                 char                ** rcvbuf,
744                                 DCMF_Callback_t      * const cb) {
745
746     int alloc_size = sndlen + sizeof(BGTsRC_t) + 16;
747
748     *rcvlen = sndlen;  /* to avoid malloc(0) which might
749                                    return NULL */
750
751     *rcvbuf       =  (char *)CmiAlloc(alloc_size);
752     cb->function  =   recv_done;
753     cb->clientdata = *rcvbuf;
754
755     return (BGTsRC_t *) ALIGN_16 (*rcvbuf + sndlen);
756
757 }
758
759
760 extern void bgl_machine_RectBcast (unsigned                 commid,
761                                        const char             * sndbuf,
762                                        unsigned                 sndlen) {
763     RectBcastInfo *rinfo  =   (RectBcastInfo *) malloc (sizeof(RectBcastInfo));
764     rinfo->cb.function    =   bcast_done;
765     rinfo->cb.clientdata  =   rinfo;
766
767     BGTsRC_AsyncBcast_start (commid, &rinfo->request, &rinfo->cb, sndbuf, sndlen);
768
769 }
770
771 extern void        bgl_machine_RectBcastInit  (unsigned               commID,
772             const BGTsRC_Geometry_t* geometry) {
773
774     CmiAssert (commID < 256);
775     CmiAssert (comm_table [commID] == NULL);
776
777     BGTsRC_t *request =  (BGTsRC_t *) malloc (sizeof (BGTsRC_t));
778     comm_table [commID] = request;
779
780     BGTsRC_AsyncBcast_init  (request, commID,  geometry);
781 }
782
783
784
785
786 //--------------------------------------------------------------
787 //----- End Rectangular Broadcast Implementation ---------------
788 //--------------------------------------------------------------
789 #endif
790
/*
 * Approximate sleep: busy-wait, polling DCMF_Timebase(), until the value it
 * returns is no longer below (value at entry + cycles).
 */
void mysleep (int cycles) {
    unsigned long long now = DCMF_Timebase();
    const unsigned long long deadline = now + cycles;

    for (; now < deadline; now = DCMF_Timebase()) {
        /* spin */
    }
}
801
802 static void * test_buf;
803
804 void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) {
805     int n, i, count;
806
807     //fprintf(stderr, "Initializing Converse Blue Gene/P machine Layer\n");
808
809     DCMF_Messager_initialize();
810
811 #if CMK_SMP
812     DCMF_Configure_t  config_in, config_out;
813     config_in.thread_level= DCMF_THREAD_MULTIPLE;
814     config_in.interrupts  = DCMF_INTERRUPTS_OFF;
815
816     DCMF_Messager_configure(&config_in, &config_out);
817     //assert (config_out.thread_level == DCMF_THREAD_MULTIPLE); //not supported in vn mode
818 #endif
819
820     DCMF_Send_Configuration_t short_config, eager_config, rzv_config;
821
822
823     short_config.protocol      = DCMF_DEFAULT_SEND_PROTOCOL;
824     short_config.cb_recv_short = short_pkt_recv;
825     short_config.cb_recv       = first_pkt_recv_done;
826
827 #if (DCMF_VERSION_MAJOR >= 3)
828     short_config.network  = DCMF_DEFAULT_NETWORK;
829 #elif (DCMF_VERSION_MAJOR == 2)
830     short_config.network  = DCMF_DefaultNetwork;
831 #endif
832
833     eager_config.protocol      = DCMF_DEFAULT_SEND_PROTOCOL;
834     eager_config.cb_recv_short = short_pkt_recv;
835     eager_config.cb_recv       = first_pkt_recv_done;
836 #if (DCMF_VERSION_MAJOR >= 3)
837     eager_config.network  = DCMF_DEFAULT_NETWORK;
838 #elif (DCMF_VERSION_MAJOR == 2)
839     eager_config.network  = DCMF_DefaultNetwork;
840 #endif
841
842 #ifdef  OPT_RZV
843 #warning "Enabling Optimize Rzv"
844     rzv_config.protocol        = DCMF_RZV_SEND_PROTOCOL;
845 #else
846     rzv_config.protocol        = DCMF_DEFAULT_SEND_PROTOCOL;
847 #endif
848     rzv_config.cb_recv_short   = short_pkt_recv;
849     rzv_config.cb_recv         = first_pkt_recv_done;
850 #if (DCMF_VERSION_MAJOR >= 3)
851     rzv_config.network  = DCMF_DEFAULT_NETWORK;
852 #elif (DCMF_VERSION_MAJOR == 2)
853     rzv_config.network  = DCMF_DefaultNetwork;
854 #endif
855
856     DCMF_Send_register (&cmi_dcmf_short_registration, &short_config);
857     DCMF_Send_register (&cmi_dcmf_eager_registration, &eager_config);
858     DCMF_Send_register (&cmi_dcmf_rzv_registration,   &rzv_config);
859
860 #ifdef BGP_USE_AM_DIRECT
861     DCMF_Send_Configuration_t direct_config;
862     direct_config.protocol      = DCMF_DEFAULT_SEND_PROTOCOL;
863     direct_config.cb_recv_short = direct_short_pkt_recv;
864     direct_config.cb_recv       = direct_first_pkt_recv_done;
865 #if (DCMF_VERSION_MAJOR >= 3)
866     direct_config.network  = DCMF_DEFAULT_NETWORK;
867 #elif (DCMF_VERSION_MAJOR == 2)
868     direct_config.network  = DCMF_DefaultNetwork;
869 #endif
870     DCMF_Send_register (&cmi_dcmf_direct_registration,   &direct_config);
871     directcb.function=direct_send_done_cb;
872     directcb.clientdata=NULL;
873 #endif
874
875 #ifdef BGP_USE_RDMA_DIRECT
876     /* notification protocol */
877     DCMF_Send_Configuration_t direct_rdma_config;
878     direct_rdma_config.protocol      = DCMF_DEFAULT_SEND_PROTOCOL;
879     direct_rdma_config.cb_recv_short = direct_short_rdma_pkt_recv;
880     direct_rdma_config.cb_recv       = direct_first_rdma_pkt_recv_done;
881 #if (DCMF_VERSION_MAJOR >= 3)
882     direct_rdma_config.network  = DCMF_DEFAULT_NETWORK;
883 #elif (DCMF_VERSION_MAJOR == 2)
884     direct_rdma_config.network  = DCMF_DefaultNetwork;
885 #endif
886     DCMF_Send_register (&cmi_dcmf_direct_rdma_registration,   &direct_rdma_config);
887     directcb.function=direct_send_rdma_done_cb;
888     directcb.clientdata=NULL;
889     /* put protocol */
890    DCMF_Put_Configuration_t put_configuration = { DCMF_DEFAULT_PUT_PROTOCOL };
891    DCMF_Put_register (&cmi_dcmf_direct_put_registration, &put_configuration);
892    DCMF_Get_Configuration_t get_configuration = { DCMF_DEFAULT_GET_PROTOCOL };
893    DCMF_Get_register (&cmi_dcmf_direct_get_registration, &get_configuration);
894     
895 #endif
896     //fprintf(stderr, "Initializing Eager Protocol\n");
897
898     _Cmi_numnodes = DCMF_Messager_size();
899     _Cmi_mynode = DCMF_Messager_rank();
900
901     unsigned rank = DCMF_Messager_rank();
902     unsigned size = DCMF_Messager_size();
903
904     CmiBarrier();
905     CmiBarrier();
906     CmiBarrier();
907
908     /* processor per node */
909     _Cmi_mynodesize = 1;
910     CmiGetArgInt(argv,"+ppn", &_Cmi_mynodesize);
911 #if ! CMK_SMP
912     if (_Cmi_mynodesize > 1 && _Cmi_mynode == 0)
913         CmiAbort("+ppn cannot be used in non SMP version!\n");
914 #endif
915
916     _Cmi_numpes = _Cmi_numnodes * _Cmi_mynodesize;
917     Cmi_nodestart = _Cmi_mynode * _Cmi_mynodesize;
918     Cmi_argvcopy = CmiCopyArgs(argv);
919     Cmi_argv = argv;
920     Cmi_startfn = fn;
921     Cmi_usrsched = usched;
922
923     //printf ("Starting Charm with %d nodes and %d processors\n", CmiNumNodes(), CmiNumPes());
924
925     DCMF_Multicast_Configuration_t mconfig;
926     mconfig.protocol = DCMF_MEMFIFO_DMA_MSEND_PROTOCOL;
927     mconfig.cb_recv  = first_multi_pkt_recv_done;
928     mconfig.clientdata = NULL;
929     mconfig.connectionlist = (void **) malloc (CmiNumPes() * sizeof(unsigned long));
930     mconfig.nconnections = CmiNumPes();  
931     DCMF_Multicast_register(&cmi_dcmf_multicast_registration, &mconfig);
932
933
934     /* find dim = log2(numpes), to pretend we are a hypercube */
935     for ( Cmi_dim=0,n=_Cmi_numpes; n>1; n/=2 )
936         Cmi_dim++ ;
937
938
939     /* checksum flag */
940     if (CmiGetArgFlag(argv,"+checksum")) {
941 #if CMK_ERROR_CHECKING
942         checksum_flag = 1;
943         if (_Cmi_mynode == 0) CmiPrintf("Charm++: CheckSum checking enabled! \n");
944 #else
945         if (_Cmi_mynode == 0) CmiPrintf("Charm++: +checksum ignored in optimized version! \n");
946 #endif
947     }
948
949     CsvInitialize(CmiNodeState, NodeState);
950     CmiNodeStateInit(&CsvAccess(NodeState));
951
952 #if CMK_NODE_QUEUE_AVAILABLE
953     CsvInitialize(PCQueue, node_bcastq);
954     CsvAccess(node_bcastq) = PCQueueCreate();
955     CsvInitialize(CmiNodeLock, node_bcastLock);
956     CsvAccess(node_bcastLock) = CmiCreateLock();
957 #endif
958
959     int actualNodeSize = _Cmi_mynodesize;
960 #if !CMK_MULTICORE
961     actualNodeSize++; //considering the extra comm thread
962 #endif
963
964     procState = (ProcState *)CmiAlloc((actualNodeSize) * sizeof(ProcState));
965     for (i=0; i<actualNodeSize; i++) {
966         /*    procState[i].sendMsgBuf = PCQueueCreate();   */
967         procState[i].recvLock = CmiCreateLock();
968         procState[i].bcastLock = CmiCreateLock();
969     }
970
971 #if CMK_SMP && !CMK_MULTICORE
972     commThdExitLock = CmiCreateLock();
973 #endif
974
975     /* Network progress function is used to poll the network when for
976        messages. This flushes receive buffers on some  implementations*/
977     networkProgressPeriod = PROGRESS_PERIOD;
978     CmiGetArgInt(argv, "+networkProgressPeriod", &networkProgressPeriod);
979
980     //printf ("Starting Threads\n");
981
982     CmiStartThreads(argv);
983
984     ConverseRunPE(initret);
985 }
986
987
/* Report a fatal error on stderr and terminate the process.
 *
 * err: NUL-terminated message to print (a placeholder is printed if NULL).
 *
 * Does not return to the caller; the trailing return only satisfies the
 * declared int return type.
 */
int PerrorExit (char *err) {
    /* Print the caller's message; the previous code printed the literal
       text "err" and ignored the argument. */
    fprintf (stderr, "%s\n\n", err ? err : "(null)");
    exit (-1);
    return -1;
}
993
994
995 void ConverseRunPE(int everReturn) {
996     //printf ("ConverseRunPE on rank %d\n", CmiMyPe());
997
998     CmiIdleState *s=CmiNotifyGetState();
999     CmiState cs;
1000     char** CmiMyArgv;
1001     CmiNodeAllBarrier();
1002
1003     cs = CmiGetState();
1004     CpvInitialize(void *,CmiLocalQueue);
1005     CpvAccess(CmiLocalQueue) = cs->localqueue;
1006
1007     if (CmiMyRank())
1008         CmiMyArgv=CmiCopyArgs(Cmi_argvcopy);
1009     else
1010         CmiMyArgv=Cmi_argv;
1011
1012     CthInit(CmiMyArgv);
1013
1014     /* initialize the network progress counter*/
1015     /* Network progress function is used to poll the network when for
1016        messages. This flushes receive buffers on some  implementations*/
1017     CpvInitialize(int , networkProgressCount);
1018     CpvAccess(networkProgressCount) = 0;
1019
1020     CpvInitialize(PCQueue, broadcast_q);
1021     CpvAccess(broadcast_q) = PCQueueCreate();
1022
1023     CpvInitialize(PCQueue, smsg_list_q);
1024     CpvAccess(smsg_list_q) = PCQueueCreate();
1025
1026     //printf ("Before Converse Common Init\n");
1027     ConverseCommonInit(CmiMyArgv);
1028
1029     CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyIdle,NULL);
1030
1031     CmiBarrier();
1032
1033     /* Converse initialization finishes, immediate messages can be processed.
1034        node barrier previously should take care of the node synchronization */
1035     _immediateReady = 1;
1036
1037     /* communication thread */
1038     if (CmiMyRank() == CmiMyNodeSize()) {
1039         Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
1040         while (1) CommunicationServer(5);
1041     } else {
1042         //printf ("Calling Start Fn and the scheduler \n");
1043
1044         if (!everReturn) {
1045             Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
1046             if (Cmi_usrsched==0) CsdScheduler(-1);
1047             ConverseExit();
1048         }
1049     }
1050 }
1051
1052 #if CMK_SMP
1053 static int inexit = 0;
1054
1055 /* test if all processors recv queues are empty */
1056 static int RecvQueueEmpty() {
1057     int i;
1058     for (i=0; i<_Cmi_mynodesize; i++) {
1059         CmiState cs=CmiGetStateN(i);
1060         if (!PCQueueEmpty(cs->recv)) return 0;
1061     }
1062     return 1;
1063 }
1064
1065 #endif
1066
1067
1068 //extern void DCMF_Messager_dumpTimers();
1069
1070 void ConverseExit(void) {
1071
1072     while (msgQueueLen > 0 || outstanding_recvs > 0) {
1073         AdvanceCommunications();
1074     }
1075
1076     CmiNodeBarrier();
1077     ConverseCommonExit();
1078
1079     //  if(CmiMyPe()%101 == 0)
1080     //DCMF_Messager_dumpTimers();
1081
1082     if (CmiMyPe() == 0) {
1083         printf("End of program\n");
1084     }
1085
1086     CmiNodeBarrier();
1087 //  CmiNodeAllBarrier ();
1088
1089 #if CMK_SMP && !CMK_MULTICORE
1090     //CmiLock(commThdExitLock);
1091     commThdExit = 1;
1092     //CmiUnlock(commThdExitLock);
1093
1094     _bgp_msync();
1095 #endif
1096
1097     int rank0 = 0;
1098
1099     if (CmiMyRank() == 0) {
1100         rank0 = 1;
1101         //CmiFree(procState);
1102         DCMF_Messager_finalize();
1103     }
1104
1105     CmiNodeBarrier();
1106 //  CmiNodeAllBarrier ();
1107
1108     if (rank0)
1109         exit(0);
1110     else
1111         pthread_exit(NULL);
1112 }
1113
1114 /* exit() called on any node would abort the whole program */
1115 void CmiAbort(const char * message) {
1116     CmiError("------------- Processor %d Exiting: Called CmiAbort ------------\n"
1117              "{snd:%d,rcv:%d} Reason: %s\n",CmiMyPe(),
1118              msgQueueLen, outstanding_recvs, message);
1119     //CmiPrintStackTrace(0);
1120
1121     while (msgQueueLen > 0 || outstanding_recvs > 0) {
1122         AdvanceCommunications();
1123     }
1124
1125     CmiBarrier();
1126     assert (0);
1127 }
1128
/* One iteration of the communication server.
 *
 * In SMP builds with a communication thread, first checks the exit flag:
 * when set, pending communication is drained and the calling thread ends.
 * Otherwise advances communication once and, for SMP builds with
 * immediate messages, handles immediate messages.
 *
 * sleepTime: currently unused (the sleep call is disabled).
 */
static void CommunicationServer(int sleepTime) {
#if CMK_SMP && !CMK_MULTICORE
    if (commThdExit) {
        while (msgQueueLen > 0 || outstanding_recvs > 0)
            AdvanceCommunications();

        /* NOTE(review): no matching CmiLock(commThdExitLock) is visible in
           this function (the lock calls around the flag test are disabled);
           confirm where this lock is acquired. */
        CmiUnlock(commThdExitLock);
        pthread_exit(NULL);
    }
#endif

    AdvanceCommunications();

#if CMK_IMMEDIATE_MSG && CMK_SMP
    CmiHandleImmediate();
#endif
}
1150
/* Thread-level entry to the communication server.
 * Only does work in SMP builds; otherwise immediate messages are handled
 * in AdvanceCommunications and this is a no-op. */
static void CommunicationServerThread(int sleepTime) {
#if !CMK_SMP
    (void)sleepTime;
#else
    CommunicationServer(sleepTime);
#endif
}
1157
1158 #if CMK_NODE_QUEUE_AVAILABLE
1159 char *CmiGetNonLocalNodeQ(void) {
1160     CmiState cs = CmiGetState();
1161     char *result = 0;
1162     CmiIdleLock_checkMessage(&cs->idle);
1163     if (!PCQueueEmpty(CsvAccess(NodeState).NodeRecv)) {
1164         MACHSTATE1(3,"CmiGetNonLocalNodeQ begin %d {", CmiMyPe());
1165
1166         if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) {
1167             //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
1168             result = (char *) PCQueuePop(CsvAccess(NodeState).NodeRecv);
1169             CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
1170         }
1171
1172         MACHSTATE1(3,"} CmiGetNonLocalNodeQ end %d ", CmiMyPe());
1173     }
1174     return result;
1175 }
1176 #endif
1177
1178
1179 void *CmiGetNonLocal() {
1180
1181     CmiState cs = CmiGetState();
1182
1183     void *msg = NULL;
1184     CmiIdleLock_checkMessage(&cs->idle);
1185     /* although it seems that lock is not needed, I found it crashes very often
1186        on mpi-smp without lock */
1187
1188 #if !CMK_SMP || CMK_MULTICORE  /*|| !BCASTMSG_ONLY_TO_COMMTHD*/
1189 //ChaoMei changes
1190     AdvanceCommunications();
1191 #endif
1192
1193     /*if(CmiMyRank()==0) printf("Got stuck here on proc[%d] node[%d]\n", CmiMyPe(), CmiMyNode());*/
1194
1195     if (PCQueueLength(cs->recv)==0) return NULL;
1196
1197 #if CMK_SMP
1198     CmiLock(procState[cs->rank].recvLock);
1199 #endif
1200
1201     msg =  PCQueuePop(cs->recv);
1202
1203 #if CMK_SMP
1204     CmiUnlock(procState[cs->rank].recvLock);
1205 #endif
1206
1207     return msg;
1208 }
1209
1210 static void CmiSendSelf(char *msg) {
1211 #if CMK_IMMEDIATE_MSG
1212     if (CmiIsImmediate(msg)) {
1213         /* CmiBecomeNonImmediate(msg); */
1214         //printf("In SendSelf, N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
1215         CmiPushImmediateMsg(msg);
1216         CmiHandleImmediate();
1217         return;
1218     }
1219 #endif
1220     
1221     CdsFifo_Enqueue(CpvAccess(CmiLocalQueue),msg);
1222 }
1223
1224 #if CMK_SMP
/* Hand a message to another rank on the same node.
 *
 * When a tree/hypercube broadcast is compiled in and the message carries a
 * non-zero broadcast root, a copy is also queued on the destination rank's
 * broadcast queue (under that rank's bcastLock). The original message is
 * then pushed to the destination rank.
 */
static void CmiSendPeer (int rank, int size, char *msg) {
#if CMK_BROADCAST_SPANNING_TREE || CMK_BROADCAST_HYPERCUBE
    if (CMI_BROADCAST_ROOT(msg) != 0) {
        char *bcastCopy = (char *)CmiAlloc(size);
        CmiMemcpy(bcastCopy, msg, size);

        CmiLock(procState[rank].bcastLock);
        PCQueuePush(CpvAccessOther(broadcast_q, rank), bcastCopy);
        CmiUnlock(procState[rank].bcastLock);
    }
#endif

    CmiPushPE (rank, msg);
}
1240 #endif
1241
1242
1243 void machineSend(SMSG_LIST *msg_tmp) {    
1244
1245     if (msg_tmp->destpe == CmiMyNode())
1246         CmiAbort("Sending to self\n");
1247
1248     CmiAssert(msg_tmp->destpe >= 0 && msg_tmp->destpe < CmiNumNodes());
1249     msg_tmp->cb.function     =   send_done;
1250     msg_tmp->cb.clientdata   =   msg_tmp;
1251
1252     DCMF_Protocol_t *protocol = NULL;
1253
1254     if (msg_tmp->size < 224)
1255         protocol = &cmi_dcmf_short_registration;
1256     else if (msg_tmp->size < 2048)
1257         protocol = &cmi_dcmf_eager_registration;
1258     else
1259         protocol = &cmi_dcmf_rzv_registration;
1260
1261
1262 #if CMK_SMP
1263     DCMF_CriticalSection_enter (0);
1264 #endif
1265
1266     msgQueueLen ++;
1267     DCMF_Send (protocol, &msg_tmp->send, msg_tmp->cb,
1268                DCMF_MATCH_CONSISTENCY, msg_tmp->destpe,
1269                msg_tmp->size, msg_tmp->msg, &msg_tmp->info, 1);
1270
1271 /*    
1272     #if CMK_SMP && !CMK_MULTICORE
1273     //Adding this advance call here improves the SMP performance
1274     //a little bit although it is possible that some more bugs are
1275     //introduced
1276     DCMF_Messager_advance();
1277     #endif
1278 */
1279
1280 #if CMK_SMP
1281     DCMF_CriticalSection_exit (0);
1282 #endif
1283 }
1284
1285 #define MAX_MULTICAST 128
1286 DCMF_Opcode_t  CmiOpcodeList [MAX_MULTICAST];
1287
1288 void  machineMulticast(int npes, int *pelist, int size, char* msg){  
1289   CQdCreate(CpvAccess(cQdState), npes);
1290   
1291   CmiAssert (npes < MAX_MULTICAST);
1292
1293   CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
1294   ((CmiMsgHeaderBasic *)msg)->size = size;  
1295   CMI_SET_BROADCAST_ROOT(msg,0);
1296   CMI_SET_CHECKSUM(msg, size);
1297   
1298   SMSG_LIST *msg_tmp = smsg_allocate(); //(SMSG_LIST *) malloc(sizeof(SMSG_LIST));
1299   
1300   msg_tmp->destpe    = -1;      //multicast operation
1301   msg_tmp->size      = size * npes; //keep track of #bytes outstanding
1302   msg_tmp->msg       = msg;
1303   msg_tmp->pelist    = pelist;
1304   
1305   msgQueueLen ++;
1306   
1307   DCMF_Multicast_t  mcast_info __attribute__((__aligned__(16)));
1308   
1309   mcast_info.registration   = & cmi_dcmf_multicast_registration;
1310   mcast_info.request        = & msg_tmp->send;
1311   mcast_info.cb_done.function    =   send_multi_done;
1312   mcast_info.cb_done.clientdata  =   msg_tmp;
1313   mcast_info.consistency    =   DCMF_MATCH_CONSISTENCY;
1314   mcast_info.connection_id  =   CmiMyPe();
1315   mcast_info.bytes          =   size;
1316   mcast_info.src            =   msg;
1317   mcast_info.nranks         =   npes;
1318   mcast_info.ranks          =   (unsigned *)pelist;
1319   mcast_info.opcodes        =   CmiOpcodeList;   //static list of MAX_MULTICAST entires with 0 in them
1320   mcast_info.flags          =   0;
1321   mcast_info.msginfo        =   &msg_tmp->info;
1322   mcast_info.count          =   1;
1323
1324   DCMF_Multicast (&mcast_info);
1325 }
1326
1327
1328
1329 void CmiGeneralFreeSendN (int node, int rank, int size, char * msg);
1330
1331 /* The general free send function
1332  * Send is synchronous, and free msg after posted
1333  */
1334 void  CmiGeneralFreeSend(int destPE, int size, char* msg) {
1335
1336   if (destPE < 0 || destPE > CmiNumPes ())
1337     printf ("Sending to %d\n", destPE);
1338
1339   CmiAssert (destPE >= 0 && destPE < CmiNumPes());
1340
1341     CmiState cs = CmiGetState();
1342
1343     if (destPE==cs->pe) {
1344         CmiSendSelf(msg);
1345         return;
1346     }
1347
1348 #if CMK_SMP && !CMK_MULTICORE && BCASTMSG_ONLY_TO_COMMTHD
1349     //In the presence of comm thd which is responsible for sending bcast msgs, then
1350     //CmiNodeOf and CmiRankOf may return incorrect information if the destPE is considered
1351     //as a comm thd.
1352     if (destPE >= _Cmi_numpes) { //destination is a comm thd
1353         int nid = destPE - _Cmi_numpes;
1354         int rid = _Cmi_mynodesize;
1355         CmiGeneralFreeSendN (nid, rid, size, msg);
1356     } else {
1357         CmiGeneralFreeSendN (CmiNodeOf (destPE), CmiRankOf (destPE), size, msg);
1358     }
1359 #else
1360     //printf ("%d: Sending Message to %d \n", CmiMyPe(), destPE);
1361     CmiGeneralFreeSendN (CmiNodeOf (destPE), CmiRankOf (destPE), size, msg);
1362 #endif
1363 }
1364
1365 void CmiGeneralFreeSendN (int node, int rank, int size, char * msg) {
1366
1367     //printf ("%d, %d: Sending Message to node %d rank %d \n", CmiMyPe(),
1368     //  CmiMyNode(), node, rank);
1369
1370 #if CMK_SMP
1371     CMI_DEST_RANK(msg) = rank;
1372     //CMI_SET_CHECKSUM(msg, size);
1373
1374     if (node == CmiMyNode()) {
1375         CmiSendPeer (rank, size, msg);
1376         return;
1377     }
1378 #endif
1379
1380     SMSG_LIST *msg_tmp = smsg_allocate(); //(SMSG_LIST *) malloc(sizeof(SMSG_LIST));
1381     msg_tmp->destpe = node; //destPE;
1382     msg_tmp->size = size;
1383     msg_tmp->msg = msg;
1384
1385     machineSend(msg_tmp);
1386 }
1387
/* Synchronous send: the caller keeps ownership of msg.
 * A private copy is made and passed to the freeing send. */
void CmiSyncSendFn(int destPE, int size, char *msg) {
    char *dup = (char *)CmiAlloc(size);

    CmiMemcpy(dup, msg, size);
    CmiFreeSendFn(destPE, size, dup);
}
1395
1396 void CmiFreeSendFn(int destPE, int size, char *msg) {    
1397     CQdCreate(CpvAccess(cQdState), 1);
1398     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeSendFn on comm thd on node %d\n", CmiMyNode());
1399
1400     CMI_SET_BROADCAST_ROOT(msg,0);
1401     CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
1402     ((CmiMsgHeaderBasic *)msg)->size = size;
1403     CMI_SET_CHECKSUM(msg, size);
1404
1405     CmiGeneralFreeSend(destPE,size,msg);
1406 }
1407
1408 /* same as CmiSyncSendFn, but don't set broadcast root in msg header */
1409 void CmiSyncSendFn1(int destPE, int size, char *msg) {
1410     char *copymsg;
1411     copymsg = (char *)CmiAlloc(size);
1412     CmiMemcpy(copymsg, msg, size);
1413
1414     //  asm volatile("sync" ::: "memory");
1415
1416     CMI_MAGIC(copymsg) = CHARM_MAGIC_NUMBER;
1417     ((CmiMsgHeaderBasic *)copymsg)->size = size;
1418     CMI_SET_CHECKSUM(copymsg, size);
1419
1420     CmiGeneralFreeSend(destPE,size,copymsg);
1421 }
1422
1423 #define NODE_LEVEL_ST_IMPLEMENTATION 1
1424 #if NODE_LEVEL_ST_IMPLEMENTATION
1425 //send msgs to other ranks except the rank specified in the argument
1426 static void CmiSendChildrenPeers(int rank, int size, char *msg) {
1427     //printf ("%d [%d]: Send children peers except rank %d\n",  CmiMyPe(), CmiMyNode(), CmiMyRank());
1428     int r=0;
1429
1430 //With comm thd, broadcast msg will be pulled from bcast queue from the comm thd
1431 //And the msg would be finally pushed into other cores on the same node by the
1432 //comm thd. But it's possible a msg has already been pushed into rank 0 in
1433 //recv_done func call. So there's no need to push the bcast msg into the recv
1434 //queue again in the case BCASTMSG_ONLY_TO_COMMTHD is not set
1435
1436 #if !CMK_MULTICORE && !BCASTMSG_ONLY_TO_COMMTHD
1437     //indicate this is called from comm thread.
1438     if (rank == _Cmi_mynodesize) r = 1;
1439 #endif
1440
1441     for (;r<rank; r++) {
1442         char *copymsg;
1443         copymsg = (char *)CmiAlloc(size);
1444         CmiMemcpy(copymsg,msg,size);        
1445         CmiPushPE (r, copymsg);
1446     }
1447
1448     for (r=rank+1; r<_Cmi_mynodesize; r++) {
1449         char *copymsg;
1450         copymsg = (char *)CmiAlloc(size);
1451         CmiMemcpy(copymsg,msg,size);        
1452         CmiPushPE (r, copymsg);
1453     }
1454 }
1455
1456 //In this implementation, msgs are first sent out to the comm thd or rank 0 of other nodes
1457 //then send msgs to the other cores on the same node
1458 void SendSpanningChildren(int size, char *msg) {
1459     CmiState cs = CmiGetState();
1460     int startpe = CMI_BROADCAST_ROOT(msg)-1;
1461     int i;
1462
1463     //printf ("%d [%d]: In Send Spanning Tree with startpe %d\n",  CmiMyPe(), CmiMyNode(), startpe);
1464
1465     CmiAssert(startpe>=0 && startpe<_Cmi_numpes);
1466
1467     int startNid = CmiNodeOf(startpe);
1468     int thisNid, thisRid;
1469     thisNid = CmiMyNode();
1470     thisRid = CmiMyRank();
1471
1472     //printf ("%d [%d/%d]: In Send Spanning Tree with startpe %d\n",  CmiMyPe(), CmiMyNode(), thisRid, startpe);
1473
1474     //Step1: send to cores that has comm thd on other nodes
1475     int dist = thisNid - startNid;
1476     if (dist<0) dist += _Cmi_numnodes;
1477     for (i=1; i <= BROADCAST_SPANNING_FACTOR; i++) {
1478         int nid = BROADCAST_SPANNING_FACTOR*dist + i;
1479         if (nid > _Cmi_numnodes - 1) break;
1480         nid += startNid;
1481         nid = nid%_Cmi_numnodes;
1482         CmiAssert(nid>=0 && nid<_Cmi_numnodes && nid!=thisNid);
1483 #if CMK_SMP && !CMK_MULTICORE && BCASTMSG_ONLY_TO_COMMTHD
1484         int p = nid + _Cmi_numpes;
1485 #else
1486         int p = CmiNodeFirst(nid);
1487 #endif
1488         //printf ("%d [%d]: Sending Spanning Tree Msg to %d\n",  CmiMyPe(), CmiMyNode(), p);
1489         CmiSyncSendFn1(p, size, msg);
1490     }
1491
1492     //Step2: send to other cores (i.e. excluding myself cs->pe) on the same nodes (just a flat send)
1493     CmiSendChildrenPeers(thisRid, size, msg);
1494
1495 #if !CMK_SMP
1496 #if ENABLE_BROADCAST_THROTTLE
1497     SendMsgsUntil (0);
1498 #endif
1499 #endif
1500 }
1501 #else
1502 /* send msg to its spanning children in broadcast. G. Zheng */
1503 void SendSpanningChildren(int size, char *msg) {
1504     CmiState cs = CmiGetState();
1505     int startpe = CMI_BROADCAST_ROOT(msg)-1;
1506     int i;
1507
1508     //printf ("%d [%d]: In Send Spanning Tree\n",  CmiMyPe(), CmiMyNode());
1509
1510     CmiAssert(startpe>=0 && startpe<_Cmi_numpes);
1511     int dist = cs->pe-startpe;
1512     if (dist<0) dist+=_Cmi_numpes;
1513     for (i=1; i <= BROADCAST_SPANNING_FACTOR; i++) {
1514         int p = BROADCAST_SPANNING_FACTOR*dist + i;
1515         if (p > _Cmi_numpes - 1) break;
1516         p += startpe;
1517         p = p%_Cmi_numpes;
1518         CmiAssert(p>=0 && p<_Cmi_numpes && p!=cs->pe);
1519
1520         //printf ("%d [%d]: Sending Spanning Tree Msg to %d\n",  CmiMyPe(), CmiMyNode(), p);
1521         CmiSyncSendFn1(p, size, msg);
1522     }    
1523 }
1524 #endif
1525
1526 /* send msg along the hypercube in broadcast. (Sameer) */
1527 void SendHypercube(int size, char *msg) {
1528     CmiState cs = CmiGetState();
1529     int curcycle = CMI_GET_CYCLE(msg);
1530     int i;
1531
1532     double logp = CmiNumPes();
1533     logp = log(logp)/log(2.0);
1534     logp = ceil(logp);
1535
1536     /*  CmiPrintf("In hypercube\n"); */
1537     /* assert(startpe>=0 && startpe<_Cmi_numpes); */
1538
1539     for (i = curcycle; i < logp; i++) {
1540         int p = cs->pe ^ (1 << i);
1541         /*   CmiPrintf("p = %d, logp = %5.1f\n", p, logp);*/
1542         if (p < CmiNumPes()) {
1543             CMI_SET_CYCLE(msg, i + 1);
1544
1545             CmiAssert(p>=0 && p<_Cmi_numpes && p!=cs->pe);
1546             CmiSyncSendFn1(p, size, msg);
1547         }
1548     }
1549 }
1550
/* Synchronous broadcast to all other PEs: the caller keeps msg.
 * A copy is made and passed to the freeing broadcast. */
void CmiSyncBroadcastFn(int size, char *msg) {
    char *dup = (char *)CmiAlloc(size);

    CmiMemcpy(dup, msg, size);
    CmiFreeBroadcastFn(size, dup);
}
1558
1559 void CmiFreeBroadcastFn(int size, char *msg) {
1560
1561     //printf("%d: Calling Broadcast %d\n", CmiMyPe(), size);
1562
1563     CmiState cs = CmiGetState();
1564 #if CMK_BROADCAST_SPANNING_TREE    
1565     CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
1566     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeBroadcastFn on comm thd on node %d\n", CmiMyNode());
1567
1568     //printf ("%d: Starting Spanning Tree Broadcast of size %d bytes\n", CmiMyPe(), size);
1569
1570     CMI_SET_BROADCAST_ROOT(msg, cs->pe+1);
1571     SendSpanningChildren(size, msg);
1572     CmiFree(msg);
1573 #elif CMK_BROADCAST_HYPERCUBE    
1574     CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
1575
1576     CMI_SET_CYCLE(msg, 0);
1577     SendHypercube(size, msg);
1578     CmiFree(msg);
1579 #else
1580     int i;
1581
1582     for ( i=cs->pe+1; i<_Cmi_numpes; i++ )
1583         CmiSyncSendFn(i,size,msg);
1584
1585     for ( i=0; i<cs->pe; i++ )
1586         CmiSyncSendFn(i,size,msg);
1587
1588     CmiFree(msg);
1589 #endif
1590 }
1591
/* Synchronous broadcast to all PEs including the caller: the caller keeps
 * msg. A copy is made and passed to the freeing broadcast-all. */
void CmiSyncBroadcastAllFn(int size, char *msg) {
    char *dup = (char *)CmiAlloc(size);

    CmiMemcpy(dup, msg, size);
    CmiFreeBroadcastAllFn(size, dup);
}
1599
1600 void CmiFreeBroadcastAllFn(int size, char *msg) {
1601
1602     //printf("%d: Calling All Broadcast %d\n", CmiMyPe(), size);
1603
1604     CmiState cs = CmiGetState();
1605 #if CMK_BROADCAST_SPANNING_TREE
1606
1607     //printf ("%d: Starting Spanning Tree Broadcast of size %d bytes\n", CmiMyPe(), size);
1608     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeBroadcastAllFn on comm thd on node %d\n", CmiMyNode());
1609
1610     CmiSyncSendFn(cs->pe,size,msg);
1611     
1612     CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
1613
1614     CMI_SET_BROADCAST_ROOT(msg, cs->pe+1);
1615     SendSpanningChildren(size, msg);
1616     CmiFree(msg);
1617
1618 #elif CMK_BROADCAST_HYPERCUBE
1619     CmiSyncSendFn(cs->pe,size,msg);
1620     
1621     CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
1622
1623     CMI_SET_CYCLE(msg, 0);
1624     SendHypercube(size, msg);
1625     CmiFree(msg);
1626 #else
1627     int i ;
1628
1629     DCMF_CriticalSection_enter (0);
1630
1631     for ( i=0; i<_Cmi_numpes; i++ ) {
1632         CmiSyncSendFn(i,size,msg);
1633
1634         if ( (i % 32) == 0 )
1635             SendMsgsUntil (0);
1636     }
1637
1638     DCMF_CriticalSection_exit (0);
1639
1640     CmiFree(msg);
1641 #endif
1642 }
1643
/* Make communication progress.
 *
 * Calls DCMF_Messager_advance() repeatedly until it no longer returns a
 * positive value (inside DCMF critical section 0 in SMP builds), then
 * sends queued broadcast messages (PE-level and, if available, node-level)
 * and, in non-SMP builds with immediate messages, handles those.
 */
void AdvanceCommunications() {

#if CMK_SMP
    DCMF_CriticalSection_enter (0);
#endif

    do {
        /* keep advancing while the messager reports work done */
    } while (DCMF_Messager_advance() > 0);

#if CMK_SMP
    DCMF_CriticalSection_exit (0);
#endif

    sendBroadcastMessages();
#if CMK_NODE_QUEUE_AVAILABLE
    sendBroadcastMessagesNode();
#endif

#if CMK_IMMEDIATE_MSG && !CMK_SMP
    CmiHandleImmediate();
#endif
}
1666
1667
1668 static void SendMsgsUntil(int targetm) {
1669
1670     while (msgQueueLen>targetm) {
1671       //AdvanceCommunications ();
1672 #if CMK_SMP
1673       DCMF_CriticalSection_enter (0);
1674 #endif
1675       
1676       while(DCMF_Messager_advance()>0);
1677       //DCMF_Messager_advance();
1678       
1679 #if CMK_SMP
1680       DCMF_CriticalSection_exit (0);
1681 #endif
1682     }
1683 }
1684
/* Idle hook: advances communication, except in SMP builds that have a
 * communication thread, where it does nothing. */
void CmiNotifyIdle() {
#if CMK_SMP && !CMK_MULTICORE
    /* nothing to do here */
#else
    AdvanceCommunications();
#endif
}
1690
1691
1692 /*==========================================================*/
1693 /*==========================================================*/
1694 /*==========================================================*/
1695
1696 /************ Recommended routines ***********************/
/************ You don't have to implement these, but they are supported
 in the Converse syntax, and some rare programs may crash without them.
 Most programs don't need them. *************/
1700
1701 CmiCommHandle CmiAsyncSendFn(int dest, int size, char *msg) {
1702     CmiAbort("CmiAsyncSendFn not implemented.");
1703     return (CmiCommHandle) 0;
1704 }
1705
1706 CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) {
1707     CmiAbort("CmiAsyncBroadcastFn not implemented.");
1708     return (CmiCommHandle) 0;
1709 }
1710
1711 CmiCommHandle CmiAsyncBroadcastAllFn(int size, char *msg) {
1712     CmiAbort("CmiAsyncBroadcastAllFn not implemented.");
1713     return (CmiCommHandle) 0;
1714 }
1715
1716 int           CmiAsyncMsgSent(CmiCommHandle handle) {
1717     CmiAbort("CmiAsyncMsgSent not implemented.");
1718     return 0;
1719 }
1720 void          CmiReleaseCommHandle(CmiCommHandle handle) {
1721     CmiAbort("CmiReleaseCommHandle not implemented.");
1722 }
1723
1724
1725 /*==========================================================*/
1726 /*==========================================================*/
1727 /*==========================================================*/
1728
1729 /* Optional routines which could use common code which is shared with
1730    other machine layer implementations. */
1731
1732 /* MULTICAST/VECTOR SENDING FUNCTIONS
1733
1734  * In relations to some flags, some other delivery functions may be needed.
1735  */
1736
1737 #if ! CMK_MULTICAST_LIST_USE_COMMON_CODE
1738
/* Synchronous list send: deliver msg to every PE in pes[0..npes-1] while
 * the caller keeps ownership of msg.
 *
 * A private copy is made and handed to CmiFreeListSendFn, which consumes
 * it. (Previously the caller's msg was passed on instead of the copy, so
 * the copy leaked and the caller's buffer was freed by the callee.)
 */
void CmiSyncListSendFn(int npes, int *pes, int size, char *msg) {
    char *copymsg;
    copymsg = (char *)CmiAlloc(size);
    CmiMemcpy(copymsg,msg,size);
    CmiFreeListSendFn(npes, pes, size, copymsg);
}
1745
1746 //#define OPTIMIZED_MULTICAST  0
1747
1748 void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
1749 #if CMK_SMP && !CMK_MULTICORE
1750     //DCMF_CriticalSection_enter (0);
1751 #endif
1752     CMI_SET_BROADCAST_ROOT(msg,0);
1753     CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
1754     ((CmiMsgHeaderBasic *)msg)->size = size;
1755     CMI_SET_CHECKSUM(msg, size);
1756
1757     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeListSendFn on comm thd on node %d\n", CmiMyNode());
1758
1759     //printf("%d: In Free List Send Fn\n", CmiMyPe());
1760     int new_npes = 0;
1761
1762     int i, count = 0, my_loc = -1;
1763     for (i=0; i<npes; i++) {
1764         //if (pes[i] == CmiMyPe() || CmiNodeOf(pes[i]) == CmiMyNode()) {
1765         if (CmiNodeOf(pes[i]) == CmiMyNode()) {
1766             CmiSyncSend(pes[i], size, msg);
1767             //my_loc = i;
1768         }
1769     }
1770
1771 #if OPTIMIZED_MULTICAST && CMK_SMP
1772 #warning "Using Optimized Multicast"
1773     if (npes > 1) {    
1774       int *newpelist = (int *) malloc (sizeof(int) * npes);
1775       int new_npes = 0;
1776     
1777       for(i=0; i<npes; i++) {
1778         if(CmiNodeOf(pes[i]) == CmiMyNode()) 
1779           continue;
1780         else
1781           newpelist[new_npes++] = pes[i];
1782       }
1783
1784       if (new_npes >= 1)
1785         machineMulticast (new_npes, newpelist, size, msg);
1786       else
1787         CmiFree (msg);
1788       return;
1789     }
1790 #endif
1791
1792     for (i=0;i<npes;i++) {
1793         //if (pes[i] == CmiMyPe() || CmiNodeOf(pes[i]) == CmiMyNode());
1794         if (CmiNodeOf(pes[i]) == CmiMyNode());
1795         else if (i < npes - 1) {
1796 #if !CMK_SMP /*|| (CMK_SMP && !CMK_MULTICORE)*/
1797             CmiReference(msg);
1798             CmiGeneralFreeSend(pes[i], size, msg);
1799 #else
1800             CmiSyncSend(pes[i], size, msg);
1801 #endif
1802         }
1803     }
1804
1805     //if (npes  && (pes[npes-1] != CmiMyPe() && CmiNodeOf(pes[i]) != CmiMyNode()))
1806     if (npes  && CmiNodeOf(pes[npes-1]) != CmiMyNode())
1807         CmiSyncSendAndFree(pes[npes-1], size, msg); //Sameto CmiFreeSendFn
1808     else
1809         CmiFree(msg);
1810
1811     //AdvanceCommunications();
1812 #if CMK_SMP && !CMK_MULTICORE
1813     //DCMF_CriticalSection_exit (0);
1814 #endif
1815 }
1816
1817 CmiCommHandle CmiAsyncListSendFn(int npes, int *pes, int size, char *msg) {
1818     CmiAbort("CmiAsyncListSendFn not implemented.");
1819     return (CmiCommHandle) 0;
1820 }
1821 #endif
1822
/** NODE SENDING FUNCTIONS

 * If there is a node queue, and nodes are also treated as entities
 * (typically in SMP versions), these functions are needed.
 */
1828
1829 #if CMK_NODE_QUEUE_AVAILABLE
1830
1831 void          CmiSyncNodeSendFn(int, int, char *);
1832 CmiCommHandle CmiAsyncNodeSendFn(int, int, char *);
1833 void          CmiFreeNodeSendFn(int, int, char *);
1834
1835 void          CmiSyncNodeBroadcastFn(int, char *);
1836 CmiCommHandle CmiAsyncNodeBroadcastFn(int, char *);
1837 void          CmiFreeNodeBroadcastFn(int, char *);
1838
1839 void          CmiSyncNodeBroadcastAllFn(int, char *);
1840 CmiCommHandle CmiAsyncNodeBroadcastAllFn(int, char *);
1841 void          CmiFreeNodeBroadcastAllFn(int, char *);
1842
1843 #endif
1844
1845
1846 #if CMK_SHARED_VARS_POSIX_THREADS_SMP
1847
1848 int CmiMyPe();
1849 int CmiMyRank();
1850 int CmiNodeFirst(int node);
1851 int CmiNodeSize(int node);
1852 int CmiNodeOf(int pe);
1853 int CmiRankOf(int pe);
1854
1855 int CmiMyPe(void) {
1856     return CmiGetState()->pe;
1857 }
1858
1859 int CmiMyRank(void) {
1860     return CmiGetState()->rank;
1861 }
1862
1863 int CmiNodeFirst(int node) {
1864     return node*_Cmi_mynodesize;
1865 }
1866 int CmiNodeSize(int node)  {
1867     return _Cmi_mynodesize;
1868 }
1869
1870 int CmiNodeOf(int pe)      {
1871     return (pe/_Cmi_mynodesize);
1872 }
1873 int CmiRankOf(int pe)      {
1874     return pe%_Cmi_mynodesize;
1875 }
1876
1877
1878 /* optional, these functions are implemented in "machine-smp.c", so including
1879    this file avoid the necessity to reimplement them.
1880  */
1881 void CmiNodeBarrier(void);
1882 void CmiNodeAllBarrier(void);
1883 CmiNodeLock CmiCreateLock();
1884 void CmiDestroyLock(CmiNodeLock lock);
1885
1886 #endif
1887
/** IMMEDIATE MESSAGES

 * If immediate messages are supported, the following function is needed. There
 * is an exception if the machine progress is also defined (see later for this).

 * Moreover, the file "immediate.c" should be included; otherwise all its
 * functions and variables have to be redefined.
*/
1896
1897 #if CMK_CCS_AVAILABLE
1898
1899 #include "immediate.c"
1900
1901 #if ! CMK_MACHINE_PROGRESS_DEFINED /* Hack for some machines */
/* Prototype only; compiled in just when CMK_MACHINE_PROGRESS_DEFINED is false. */
void CmiProbeImmediateMsg();
1903 #endif
1904
1905 #endif
1906
1907
1908 /** MACHINE PROGRESS DEFINED
1909
1910  * Some machines (like BlueGene/L) do not have coprocessors, and messages need
1911  * to be pulled out of the network manually. For this reason the following
1912  * functions are needed. Notice that the function "CmiProbeImmediateMsg" must
1913  * not be defined anymore.
1914  */
1915
1916 #if CMK_MACHINE_PROGRESS_DEFINED
1917
1918
1919
/* Machine-progress hook. Non-SMP builds call AdvanceCommunications();
   SMP builds deliberately do nothing. */
void CmiMachineProgressImpl() {

#if CMK_SMP
    /*Not implemented yet. Communication server does not seem to be
      thread safe */
#else
    AdvanceCommunications();
#endif
}
1929
1930 #endif
1931
/* Dummy implementation: no barrier is actually performed yet.
 *
 * Returns 0 unconditionally. The function is declared to return int, so it
 * must not fall off the end of its body: a caller that reads the result
 * would otherwise get an undefined value.
 */
extern int CmiBarrier() {
    //Use DCMF barrier later
    return 0;
}
1936
1937 #if CMK_NODE_QUEUE_AVAILABLE
/* Deliver a node-level message to the local node.
 *
 * When CMK_IMMEDIATE_MSG is enabled and CmiIsImmediate(msg) is true, the
 * message goes to CmiPushImmediateMsg() instead (and, on CMK_MULTICORE builds,
 * CmiHandleImmediate() is run right away); the function then returns without
 * touching the node queue. Otherwise `msg` itself (not a copy) is pushed onto
 * NodeState.NodeRecv while holding NodeState.CmiNodeRecvLock.
 */
static void CmiSendNodeSelf(char *msg) {
#if CMK_IMMEDIATE_MSG
    if (CmiIsImmediate(msg)) {
        //printf("SendNodeSelf: N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
        CmiPushImmediateMsg(msg);
#if CMK_MULTICORE
        CmiHandleImmediate();
#endif
        return;
    }
#endif    
    /* The push is bracketed by the node receive lock. */
    CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
    PCQueuePush(CsvAccess(NodeState).NodeRecv, msg);
    CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
}
1953
1954 CmiCommHandle CmiAsyncNodeSendFn(int dstNode, int size, char *msg) {
1955     CmiAbort ("Async Node Send not supported\n");
1956 }
1957
1958 void CmiFreeNodeSendFn(int node, int size, char *msg) {
1959
1960     CMI_SET_BROADCAST_ROOT(msg,0);
1961     CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
1962     ((CmiMsgHeaderBasic *)msg)->size = size;
1963     CMI_SET_CHECKSUM(msg, size);
1964     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeNodeSendFn on comm thd on node %d\n", CmiMyNode());
1965     
1966     CQdCreate(CpvAccess(cQdState), 1);
1967
1968     if (node == _Cmi_mynode) {
1969         CmiSendNodeSelf(msg);
1970     } else {
1971         CmiGeneralFreeSendN(node, SMP_NODEMESSAGE, size, msg);
1972     }
1973 }
1974
/* Synchronous node send: the caller keeps `m`. A fresh copy of the `s` bytes
   is allocated and passed to CmiFreeNodeSendFn() for node `p`. */
void CmiSyncNodeSendFn(int p, int s, char *m) {
    char *copy = (char *)CmiAlloc(s);

    CmiMemcpy(copy, m, s);
    CmiFreeNodeSendFn(p, s, copy);
}
1982
1983 CmiCommHandle CmiAsyncNodeBroadcastFn(int s, char *m) {
1984     return NULL;
1985 }
1986
1987 void SendSpanningChildrenNode(int size, char *msg) {
1988     int startnode = -CMI_BROADCAST_ROOT(msg)-1;
1989     //printf("on node %d rank %d, send node spanning children with root %d\n", CmiMyNode(), CmiMyRank(), startnode);
1990     assert(startnode>=0 && startnode<CmiNumNodes());
1991
1992     int dist = CmiMyNode()-startnode;
1993     if (dist<0) dist += CmiNumNodes();
1994     int i;
1995     for (i=1; i <= BROADCAST_SPANNING_FACTOR; i++) {
1996         int nid = BROADCAST_SPANNING_FACTOR*dist + i;
1997         if (nid > CmiNumNodes() - 1) break;
1998         nid += startnode;
1999         nid = nid%CmiNumNodes();
2000         assert(nid>=0 && nid<CmiNumNodes() && nid!=CmiMyNode());
2001         char *dupmsg = (char *)CmiAlloc(size);
2002         CmiMemcpy(dupmsg,msg,size);
2003         //printf("In SendSpanningChildrenNode, sending bcast msg (root %d) from node %d to node %d\n", startnode, CmiMyNode(), nid);
2004         CmiGeneralFreeSendN(nid, SMP_NODEMESSAGE, size, dupmsg);
2005     }
2006 }
2007
/* Broadcast `m` (of `s` bytes) to every node except the calling one, then free `m`.
 *
 * With CMK_BROADCAST_SPANNING_TREE the header is stamped with the node
 * broadcast root -(mynode)-1 and the copies travel down the spanning tree.
 * Otherwise one copy per remote node is sent with CmiFreeNodeSendFn().
 */
void CmiFreeNodeBroadcastFn(int s, char *m) {
#if CMK_BROADCAST_SPANNING_TREE
    CQdCreate(CpvAccess(cQdState), CmiNumNodes()-1);

    const int self = CmiMyNode();
    CMI_SET_BROADCAST_ROOT(m, -self-1);
    CMI_MAGIC(m) = CHARM_MAGIC_NUMBER;
    ((CmiMsgHeaderBasic *)m)->size = s;
    CMI_SET_CHECKSUM(m, s);

    SendSpanningChildrenNode(s, m);
#else
    int dest;
    for (dest = 0; dest < CmiNumNodes(); dest++) {
        if (dest != CmiMyNode()) {
            char *copy = (char *)CmiAlloc(s);
            CmiMemcpy(copy, m, s);
            CmiFreeNodeSendFn(dest, s, copy);
        }
    }
#endif
    CmiFree(m);
}
2034
/* Synchronous node broadcast: the caller keeps `m`. A fresh copy of the `s`
   bytes is handed to CmiFreeNodeBroadcastFn(). */
void CmiSyncNodeBroadcastFn(int s, char *m) {
    char *copy = (char *)CmiAlloc(s);

    CmiMemcpy(copy, m, s);
    CmiFreeNodeBroadcastFn(s, copy);
}
2042
2043 /* need */
2044 void CmiFreeNodeBroadcastAllFn(int s, char *m) {
2045     char *dupmsg = (char *)CmiAlloc(s);
2046     CmiMemcpy(dupmsg,m,s);
2047     CMI_MAGIC(dupmsg) = CHARM_MAGIC_NUMBER;
2048     ((CmiMsgHeaderBasic *)dupmsg)->size = s;
2049     CMI_SET_CHECKSUM(dupmsg, s);
2050
2051     //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeNodeBcastAllFn on comm thd on node %d\n", CmiMyNode());
2052     
2053     CQdCreate(CpvAccess(cQdState), 1);
2054     CmiSendNodeSelf(dupmsg);
2055
2056     CmiFreeNodeBroadcastFn(s, m);
2057 }
2058
/* Synchronous broadcast to all nodes: the caller keeps `m`. A fresh copy of
   the `s` bytes is handed to CmiFreeNodeBroadcastAllFn(). */
void CmiSyncNodeBroadcastAllFn(int s, char *m) {
    char *copy = (char *)CmiAlloc(s);

    CmiMemcpy(copy, m, s);
    CmiFreeNodeBroadcastAllFn(s, copy);
}
2066
2067
2068 CmiCommHandle CmiAsyncNodeBroadcastAllFn(int s, char *m) {
2069     return NULL;
2070 }
2071 #endif //end of CMK_NODE_QUEUE_AVAILABLE
2072
2073 #include "manytomany.c"
2074
2075
2076 /*********************************************************************************************
2077 This section is for CmiDirect. This is a variant of the  persistent communication in which
2078 the user can transfer data between processors without using Charm++ messages. This lets the user
2079 send and receive data from the middle of his arrays without any copying on either send or receive
2080 side
2081 *********************************************************************************************/
2082
2083
2084
2085
2086 #ifdef BGP_USE_AM_DIRECT
2087
2088 #include "cmidirect.h"
2089
2090 /* We can avoid a receiver side lookup by just sending the whole shebang.
2091    DCMF header is in units of quad words (16 bytes), so we'd need less than a
2092    quad word for the handle if we just sent that and did a lookup. Or exactly
2093    2 quad words for the buffer pointer, callback pointer, callback
2094    data pointer, and DCMF_Request_t pointer with no lookup.
2095
2096    Since CmiDirect is generally going to be used for messages which aren't
   tiny, the extra 16 bytes is not likely to impact performance noticeably and
2098    not having to lookup handles in tables simplifies the code enormously.
2099
2100    EJB   2008/4/2
2101 */
2102
2103
2104 /**
2105  To be called on the receiver to create a handle and return its number
2106 **/
2107 struct infiDirectUserHandle CmiDirect_createHandle(int senderNode,void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData,double initialValue) {
2108     /* with two-sided primitives we just bundle the buffer and callback info into the handle so the sender can remind us about it later. */
2109     struct infiDirectUserHandle userHandle;
2110     userHandle.handle=1; /* doesn't matter on BG/P*/
2111     userHandle.senderNode=senderNode;
2112     userHandle.recverNode=_Cmi_mynode;
2113     userHandle.recverBufSize=recvBufSize;
2114     userHandle.recverBuf=recvBuf;
2115     userHandle.initialValue=initialValue;
2116     userHandle.callbackFnPtr=callbackFnPtr;
2117     userHandle.callbackData=callbackData;
2118     userHandle.DCMF_rq_trecv=(DCMF_Request_t *) ALIGN_16(CmiAlloc(sizeof(DCMF_Request_t)+16));
2119 #if CMI_DIRECT_DEBUG
2120     CmiPrintf("[%d] RDMA create addr %p %d callback %p callbackdata %p\n",CmiMyPe(),userHandle.recverBuf,userHandle.recverBufSize, userHandle.callbackFnPtr, userHandle.callbackData);
2121 #endif
2122     return userHandle;
2123 }
2124
2125 /****
2126  To be called on the sender to attach the sender's buffer to this handle
2127 ******/
2128
2129 void CmiDirect_assocLocalBuffer(struct infiDirectUserHandle *userHandle,void *sendBuf,int sendBufSize) {
2130
2131     /* one-sided primitives would require registration of memory */
2132
2133     /* with two-sided primitives we just record the sender buf in the handle */
2134     userHandle->senderBuf=sendBuf;
2135     CmiAssert(sendBufSize==userHandle->recverBufSize);
2136     userHandle->DCMF_rq_tsend = (DCMF_Request_t *) ALIGN_16(CmiAlloc(sizeof(DCMF_Request_t)+16));
2137 #if CMI_DIRECT_DEBUG
2138     CmiPrintf("[%d] RDMA assoc addr %p %d to receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,sendBufSize, userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2139 #endif
2140
2141 }
2142
2143 /****
2144 To be called on the sender to do the actual data transfer
2145 ******/
2146 void CmiDirect_put(struct infiDirectUserHandle *userHandle) {
2147     /** invoke a DCMF_Send with the direct callback */
2148     DCMF_Protocol_t *protocol = NULL;
2149     protocol = &cmi_dcmf_direct_registration;
2150     /* local copy */
2151     CmiAssert(userHandle->recverBuf!=NULL);
2152     CmiAssert(userHandle->senderBuf!=NULL);
2153     CmiAssert(userHandle->recverBufSize>0);
2154     if (userHandle->recverNode== _Cmi_mynode) {
2155 #if CMI_DIRECT_DEBUG
2156         CmiPrintf("[%d] RDMA local put addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2157 #endif
2158
2159         CmiMemcpy(userHandle->recverBuf,userHandle->senderBuf,userHandle->recverBufSize);
2160         (*(userHandle->callbackFnPtr))(userHandle->callbackData);
2161     } else {
2162         dcmfDirectMsgHeader msgHead;
2163         msgHead.recverBuf=userHandle->recverBuf;
2164         msgHead.callbackFnPtr=userHandle->callbackFnPtr;
2165         msgHead.callbackData=userHandle->callbackData;
2166         msgHead.DCMF_rq_t=(DCMF_Request_t *) userHandle->DCMF_rq_trecv;
2167 #if CMK_SMP
2168         DCMF_CriticalSection_enter (0);
2169 #endif
2170 #if CMI_DIRECT_DEBUG
2171         CmiPrintf("[%d] RDMA put addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2172 #endif
2173         DCMF_Send (protocol,
2174                    (DCMF_Request_t *) userHandle->DCMF_rq_tsend,
2175                    directcb, DCMF_MATCH_CONSISTENCY, userHandle->recverNode,
2176                    userHandle->recverBufSize, userHandle->senderBuf,
2177                    (struct DCQuad *) &(msgHead), 2);
2178
2179 #if CMK_SMP
2180         DCMF_CriticalSection_exit (0);
2181 #endif
2182     }
2183 }
2184
/* Get is not available in the active-message (BGP_USE_AM_DIRECT) variant:
   the call aborts with a message pointing at BGP_USE_RDMA_DIRECT. */
void CmiDirect_get(struct infiDirectUserHandle *userHandle) {
    CmiAbort("Not Implemented, switch to #define BGP_USE_RDMA_DIRECT");
}
2188
/**** up to the user to safely call this */
/* Sender side: release the send request allocated by CmiDirect_assocLocalBuffer.
 * Asserts that the caller is the handle's sender node.
 * NOTE(review): DCMF_rq_tsend holds the ALIGN_16()-adjusted pointer; if
 * ALIGN_16 ever moves the address, this frees a pointer CmiAlloc did not
 * return. Confirm CmiAlloc's alignment guarantee.
 */
void CmiDirect_deassocLocalBuffer(struct infiDirectUserHandle *userHandle)
{
    CmiAssert(userHandle->senderNode==_Cmi_mynode);
#if CMK_SMP
    DCMF_CriticalSection_enter (0);
#endif
    CmiFree(userHandle->DCMF_rq_tsend);
#if CMK_SMP
    DCMF_CriticalSection_exit (0);
#endif

}
2202
/**** up to the user to safely call this */
/* Receiver side: release the receive request allocated by CmiDirect_createHandle.
 * Asserts that the caller is the handle's receiver node.
 * NOTE(review): DCMF_rq_trecv holds the ALIGN_16()-adjusted pointer; confirm
 * it always equals the address CmiAlloc returned.
 */
void CmiDirect_destroyHandle(struct infiDirectUserHandle *userHandle){
    CmiAssert(userHandle->recverNode==_Cmi_mynode);
#if CMK_SMP
    DCMF_CriticalSection_enter (0);
#endif
    CmiFree(userHandle->DCMF_rq_trecv);

#if CMK_SMP
    DCMF_CriticalSection_exit (0);
#endif
}
2215
2216
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_ready(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2221
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_readyPollQ(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2226
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_readyMark(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2231
2232 #endif /* BGP_USE_AM_DIRECT*/
2233
2234 #ifdef BGP_USE_RDMA_DIRECT
2235
2236 #include "cmidirect.h"
2237
2238 /* 
2239    Notification protocol passes callback function and data in a single
2240    quadword.  This occurs in a message triggered by the sender side ack
2241    callback and therefore has higher latency than polling, but is guaranteed
2242    to be semantically correct.  The latency for a single packet that isn't
2243    hitting charm/converse should be pretty minimal, but you could run into
2244    sender side progress issues.  The alternative of polling on the out of band
2245    byte scheme creates correctness issues in that the data really has to be
2246    out of band and you rely on the buffer being written in order.  It also has
2247    annoying polling issues.  A third scheme could add a second put to a
2248    control region to poll upon and force sequential consistency between
   puts. It's not really clear that this would be faster or avoid the progress
2250    issue since you run into the same issues to enforce that sequential
2251    consistency.
2252
2253    EJB   2011/1/20
2254 */
2255
2256
2257 /* local function to use the ack as our signal to send a remote notify */
2258 static void CmiNotifyRemoteRDMA(void *handle, struct DCMF_Error_t *error)
2259 {
2260     struct infiDirectUserHandle *userHandle= (struct infiDirectUserHandle *) handle;
2261     dcmfDirectRDMAMsgHeader msgHead;
2262     msgHead.callbackFnPtr=userHandle->callbackFnPtr;
2263     msgHead.callbackData=userHandle->callbackData;
2264 #if CMK_SMP
2265     DCMF_CriticalSection_enter (0);
2266 #endif
2267 #if CMI_DIRECT_DEBUG
2268     CmiPrintf("[%d] RDMA notify put addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p \n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2269 #endif
2270     DCMF_Result res=DCMF_Send (&cmi_dcmf_direct_rdma_registration,
2271                userHandle->DCMF_rq_tsend,
2272                directcb, DCMF_MATCH_CONSISTENCY, userHandle->recverNode,
2273                sizeof(dcmfDirectRDMAMsgHeader), 
2274
2275                                userHandle->DCMF_notify_buf,
2276                (struct DCQuad *) &(msgHead), 1);
2277 //    CmiAssert(res==DCMF_SUCCESS);
2278 #if CMK_SMP
2279     DCMF_CriticalSection_exit (0);
2280 #endif    
2281 }
2282
2283 /**
2284  To be called on the receiver to create a handle and return its number
2285 **/
2286
2287
2288 struct infiDirectUserHandle CmiDirect_createHandle(int senderNode,void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData,double initialValue) {
2289     /* one-sided primitives require registration of memory */
2290     struct infiDirectUserHandle userHandle;
2291     size_t numbytesRegistered=0;
2292     DCMF_Result regresult=DCMF_Memregion_create( &userHandle.DCMF_recverMemregion,
2293                                                  &numbytesRegistered,
2294                                                  recvBufSize,
2295                                                  recvBuf,
2296                                                  0);
2297     CmiAssert(numbytesRegistered==recvBufSize);
2298     CmiAssert(regresult==DCMF_SUCCESS);
2299     
2300
2301     userHandle.handle=1; /* doesn't matter on BG/P*/
2302     userHandle.senderNode=senderNode;
2303     userHandle.recverNode=_Cmi_mynode;
2304     userHandle.recverBufSize=recvBufSize;
2305     userHandle.recverBuf=recvBuf;
2306     userHandle.initialValue=initialValue;
2307     userHandle.callbackFnPtr=callbackFnPtr;
2308     userHandle.callbackData=callbackData;
2309     userHandle.DCMF_rq_trecv=(DCMF_Request_t *) ALIGN_16(CmiAlloc(sizeof(DCMF_Request_t)+16));
2310 #if CMI_DIRECT_DEBUG
2311     CmiPrintf("[%d] RDMA create addr %p %d callback %p callbackdata %p\n",CmiMyPe(),userHandle.recverBuf,userHandle.recverBufSize, userHandle.callbackFnPtr, userHandle.callbackData);
2312 #endif
2313     return userHandle;
2314 }
2315
2316 /****
2317  To be called on the sender to attach the sender's buffer to this handle
2318 ******/
2319
2320 void CmiDirect_assocLocalBuffer(struct infiDirectUserHandle *userHandle,void *sendBuf,int sendBufSize) {
2321     /* one-sided primitives would require registration of memory */
2322     userHandle->senderBuf=sendBuf;
2323     CmiAssert(sendBufSize==userHandle->recverBufSize);
2324     userHandle->DCMF_rq_tsend =(DCMF_Request_t *) ALIGN_16(CmiAlloc(sizeof(DCMF_Request_t)+16));
2325     size_t numbytesRegistered=0;  // set as return value from create
2326     userHandle->DCMF_notify_buf=ALIGN_16(CmiAlloc(sizeof(DCMF_Request_t)+32));
2327     userHandle->DCMF_notify_cb.function=CmiNotifyRemoteRDMA; 
2328     userHandle->DCMF_notify_cb.clientdata=userHandle;
2329     DCMF_Result regresult=DCMF_Memregion_create( &userHandle->DCMF_senderMemregion,
2330                                                  &numbytesRegistered,
2331                                                  sendBufSize,
2332                                                  sendBuf,
2333                                                  0);
2334     CmiAssert(numbytesRegistered==sendBufSize);
2335     CmiAssert(regresult==DCMF_SUCCESS);
2336
2337 #if CMI_DIRECT_DEBUG
2338     CmiPrintf("[%d] RDMA assoc addr %p %d to receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,sendBufSize, userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2339 #endif
2340
2341 }
2342
2343
2344 /****
2345 To be called on the sender to do the actual data transfer
2346 ******/
2347 void CmiDirect_put(struct infiDirectUserHandle *userHandle) {
2348     /** invoke a DCMF_Put with the direct callback */
2349
2350     CmiAssert(userHandle->recverBuf!=NULL);
2351     CmiAssert(userHandle->senderBuf!=NULL);
2352     CmiAssert(userHandle->recverBufSize>0);
2353     if (userHandle->recverNode== _Cmi_mynode) {     /* local copy */
2354 #if CMI_DIRECT_DEBUG
2355         CmiPrintf("[%d] RDMA local put addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2356 #endif
2357
2358         CmiMemcpy(userHandle->recverBuf,userHandle->senderBuf,userHandle->recverBufSize);
2359         (*(userHandle->callbackFnPtr))(userHandle->callbackData);
2360     } else {
2361 #if CMK_SMP
2362         DCMF_CriticalSection_enter (0);
2363 #endif
2364 #if CMI_DIRECT_DEBUG
2365         CmiPrintf("[%d] RDMA put addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2366 #endif
2367         DCMF_Result 
2368             Res= DCMF_Put(&cmi_dcmf_direct_put_registration,
2369                           userHandle->DCMF_rq_tsend,
2370                           directcb, DCMF_RELAXED_CONSISTENCY, 
2371                           userHandle->recverNode,
2372                           userHandle->recverBufSize,
2373                           &userHandle->DCMF_senderMemregion,
2374                           &userHandle->DCMF_recverMemregion,
2375                           0, /* offsets are zero */
2376                           0, 
2377                           userHandle->DCMF_notify_cb
2378                           );
2379         CmiAssert(Res==DCMF_SUCCESS); 
2380 #if CMK_SMP
2381         DCMF_CriticalSection_exit (0);
2382 #endif
2383     }
2384 }
2385
2386 /****
2387 To be called on the receiver to initiate the actual data transfer
2388 ******/
2389 void CmiDirect_get(struct infiDirectUserHandle *userHandle) {
2390     /** invoke a DCMF_Get with the direct callback */
2391
2392     CmiAssert(userHandle->recverBuf!=NULL);
2393     CmiAssert(userHandle->senderBuf!=NULL);
2394     CmiAssert(userHandle->recverBufSize>0);
2395     if (userHandle->recverNode== _Cmi_mynode) {     /* local copy */
2396 #if CMI_DIRECT_DEBUG
2397         CmiPrintf("[%d] RDMA local get addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2398 #endif
2399
2400         CmiMemcpy(userHandle->senderBuf,userHandle->recverBuf,userHandle->recverBufSize);
2401         (*(userHandle->callbackFnPtr))(userHandle->callbackData);
2402     } else {
2403         struct DCMF_Callback_t done_cb;
2404         done_cb.function=userHandle->callbackFnPtr;
2405         done_cb.clientdata=userHandle->callbackData;
2406 #if CMK_SMP
2407         DCMF_CriticalSection_enter (0);
2408 #endif
2409 #if CMI_DIRECT_DEBUG
2410         CmiPrintf("[%d] RDMA get addr %p %d to recverNode %d receiver addr %p callback %p callbackdata %p\n",CmiMyPe(),userHandle->senderBuf,userHandle->recverBufSize, userHandle->recverNode,userHandle->recverBuf, userHandle->callbackFnPtr, userHandle->callbackData);
2411 #endif
2412         DCMF_Result 
2413             Res= DCMF_Get(&cmi_dcmf_direct_get_registration,
2414                           (DCMF_Request_t *) userHandle->DCMF_rq_tsend,
2415                           done_cb, DCMF_RELAXED_CONSISTENCY, 
2416                           userHandle->recverNode,
2417                           userHandle->recverBufSize,
2418                           & userHandle->DCMF_recverMemregion,
2419                           & userHandle->DCMF_senderMemregion,
2420                           0, /* offsets are zero */
2421                           0
2422                           );
2423         CmiAssert(Res==DCMF_SUCCESS); 
2424
2425
2426 #if CMK_SMP
2427         DCMF_CriticalSection_exit (0);
2428 #endif
2429     }
2430 }
2431
/**** up to the user to safely call this */
/* Sender side: undo CmiDirect_assocLocalBuffer. Destroys the sender memory
 * region and frees the notify buffer and the send request.
 * Asserts that the caller is the handle's sender node.
 * NOTE(review): both freed pointers were stored after ALIGN_16(); confirm
 * they always equal the addresses CmiAlloc returned.
 */
void CmiDirect_deassocLocalBuffer(struct infiDirectUserHandle *userHandle)
{
    CmiAssert(userHandle->senderNode==_Cmi_mynode);
#if CMK_SMP
    DCMF_CriticalSection_enter (0);
#endif

    DCMF_Memregion_destroy((DCMF_Memregion_t*) userHandle->DCMF_senderMemregion);
    CmiFree(userHandle->DCMF_notify_buf);
    CmiFree(userHandle->DCMF_rq_tsend);
#if CMK_SMP
    DCMF_CriticalSection_exit (0);
#endif

}
2448
/**** up to the user to safely call this */
/* Receiver side: undo CmiDirect_createHandle. Destroys the receiver memory
 * region and frees the receive request.
 * Asserts that the caller is the handle's receiver node.
 * NOTE(review): DCMF_rq_trecv was stored after ALIGN_16(); confirm it always
 * equals the address CmiAlloc returned.
 */
void CmiDirect_destroyHandle(struct infiDirectUserHandle *userHandle){
    CmiAssert(userHandle->recverNode==_Cmi_mynode);
#if CMK_SMP
    DCMF_CriticalSection_enter (0);
#endif

    DCMF_Memregion_destroy((DCMF_Memregion_t*) userHandle->DCMF_recverMemregion);
    CmiFree(userHandle->DCMF_rq_trecv);

#if CMK_SMP
    DCMF_CriticalSection_exit (0);
#endif
}
2463
2464
2465
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_ready(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2470
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_readyPollQ(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2475
/**** Should not be called the first time *********/
/* Intentionally empty on BG/P; the handle is not examined. */
void CmiDirect_readyMark(struct infiDirectUserHandle *userHandle) {
    (void) userHandle; /* no op on BGP */
}
2480
2481 #endif /* BGP_USE_RDMA_DIRECT*/
2482