Merge branch 'charm' into gupta/ibverbs-non-mellanox
author gupta59 <gupta59@drfe105.fas.fa.disney.com>
Mon, 24 Jun 2013 19:39:03 +0000 (12:39 -0700)
committer gupta59 <gupta59@drfe105.fas.fa.disney.com>
Mon, 24 Jun 2013 19:39:03 +0000 (12:39 -0700)
Conflicts:
src/arch/net/machine-ibverbs.c

1  2 
src/arch/net/machine-ibverbs.c

index a3d6026104d28ddd3d3b7d7e6f424fb8502a9538,97f1f22f2107aa279fbb77937318f7bd16601add..714c62e82e4f984a4a86585bb5b1a890d9c0ad4c
  
  #include <infiniband/verbs.h>
  
++#define QLOGIC
++#ifndef QLOGIC
  enum ibv_mtu mtu = IBV_MTU_2048;
++#else
++enum ibv_mtu mtu = IBV_MTU_4096;
++#endif
  static int page_size;
  static int mtu_size;
  static int packetSize;
@@@ -684,7 -678,7 +689,7 @@@ void CmiCommunicationInit(char **argv
  void createLocalQps(struct ibv_device *dev,int ibPort, int myNode,int numNodes,struct infiAddr *localAddr){
        int myLid;
        int i;
--      
++      int err;
        
        //find my lid
        myLid = getLocalLid(context->context,ibPort);
        MACHSTATE1(3,"sendCq created %p",context->sendCq);
        
        
--      context->recvCqSize = maxRecvBuffers;
++      context->recvCqSize = maxRecvBuffers+2;
        context->recvCq = ibv_create_cq(context->context,context->recvCqSize,NULL,NULL,0);
        
        MACHSTATE2(3,"recvCq created %p %d",context->recvCq,context->recvCqSize);
  
        if(numNodes > 1)
        {
+               struct ibv_qp_attr attr;
 +#ifndef NON_SRQ
                context->srqSize = (maxRecvBuffers+2);
+               {
                struct ibv_srq_init_attr srqAttr = {
                        .attr = {
                        .max_wr  = context->srqSize,
                };
                context->srq = ibv_create_srq(context->pd,&srqAttr);
                CmiAssert(context->srq != NULL);
+               }
 +#endif        
+               
+               {
                struct ibv_qp_init_attr initAttr = {
                        .qp_type = IBV_QPT_RC,
                        .send_cq = context->sendCq,
                        .cap     = {
                                .max_send_wr  = maxTokens,
                                .max_send_sge = 2,
-                               .max_recv_sge = 2,
 +#ifdef NON_SRQ
 +                              .max_recv_wr  = maxRecvBuffers, // or maxRecvBuffers
++                              .max_recv_sge = 1,
 +#endif        
                        },
                };
-               struct ibv_qp_attr attr;
  
                attr.qp_state        = IBV_QPS_INIT;
                attr.pkey_index      = 0;
                attr.port_num        = ibPort;
--              attr.qp_access_flags = IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
++              attr.qp_access_flags = IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
  
  /*            MACHSTATE1(3,"context->pd %p",context->pd);
                struct ibv_qp *qp = ibv_create_qp(context->pd,&initAttr);
                                MACHSTATE2(3,"qp[%d] created %p",n,context->qp[n]);
                                CmiAssert(context->qp[n] != NULL);
                        
--                              ibv_modify_qp(context->qp[n], &attr,
++                              if(err= ibv_modify_qp(context->qp[n], &attr,
                                          IBV_QP_STATE              |
                                          IBV_QP_PKEY_INDEX         |
                                        IBV_QP_PORT               |
--                                      IBV_QP_ACCESS_FLAGS);           
--
++                                      IBV_QP_ACCESS_FLAGS)) {
++                                                              MACHSTATE1(3,"ERROR modifying  to INIT %d",err);
++                                                                      CmiAbort("failed to change qp state to INIT ");
++                                                                }
                                localAddr[n].qpn = context->qp[n]->qp_num;
                                localAddr[n].psn = lrand48() & 0xffffff;
                                MACHSTATE4(3,"i %d lid Ox%x qpn 0x%x psn 0x%x",n,localAddr[n].lid,localAddr[n].qpn,localAddr[n].psn);
@@@ -856,23 -847,23 +868,33 @@@ struct infiOtherNodeData *initInfiOther
        MACHSTATE(3,"qp state changed to RTR");
        
        attr.qp_state       = IBV_QPS_RTS;
++// Here NON_SRQ is for QLOGIC
++#ifndef QLOGIC
        attr.timeout        = 26;
        attr.retry_cnt      = 20;
++#else
++      attr.timeout        = 14;
++      attr.retry_cnt      = 7;
++#endif
        attr.rnr_retry      = 7;
        attr.sq_psn         = context->localAddr[node].psn;
        attr.max_rd_atomic  = 1;
  
++      MACHSTATE3(3,"dlid 0x%x qp 0x%x psn 0x%x",attr.ah_attr.dlid,attr.dest_qp_num,attr.sq_psn);
        
--      if (ibv_modify_qp(ret->qp, &attr,
++
++      if (err=ibv_modify_qp(ret->qp, &attr,
          IBV_QP_STATE              |
          IBV_QP_TIMEOUT            |
          IBV_QP_RETRY_CNT          |
          IBV_QP_RNR_RETRY          |
          IBV_QP_SQ_PSN             |
          IBV_QP_MAX_QP_RD_ATOMIC)) {
--                      fprintf(stderr, "Failed to modify QP to RTS\n");
--                      exit(1);
++                      MACHSTATE1(3,"ERROR changing qp state to RTS %d",err);
++                      CmiAbort("failed1 to change qp state to RTS");
        }
++
++
        MACHSTATE(3,"qp state changed to RTS");
  
        MACHSTATE(3,"} initInfiOtherNodeData");
@@@ -895,14 -885,6 +917,14 @@@ void     infiPostInitialRecvs()
        }else{
                numPosts = 0;
        }
 +#ifdef NON_SRQ
 +// This is resulting in the total recv buffers to grow as the number of nodes. What could be the alternative? May be adaptively increase number of buffers for the most communicating nodes. Need a mechanism for such flow control, existing does not claim to work.
 +      minPerProcessorRecvs = 10;
 +      if(minPerProcessorRecvs*(_Cmi_numnodes-1) <= maxRecvBuffers){
 +              numPosts = minPerProcessorRecvs*(_Cmi_numnodes-1);
 +      }
 +#endif
++//        numPosts=1000; 
        if(numPosts > 0){
                context->recvBufferPool = allocateInfiBufferPool(numPosts,packetSize);
                postInitialRecvs(context->recvBufferPool,numPosts,packetSize);
@@@ -995,24 -976,6 +1017,30 @@@ void postInitialRecvs(struct infiBuffer
        if(ibv_post_srq_recv(context->srq,workRequests,&bad_wr)){
                CmiAssert(0);
        }
-       int numNodes = _Cmi_numnodes;
-       int myNode = _Cmi_mynode;
-       int perNodeRecvs = numRecvs/(numNodes-1);
-       int k =0,i;
 +#else 
 +// create a pool per processor and post initial receives to processor queue similar to send, split the buffer pool Equi-partitioning recv pool
-                 int n = (myNode + i)%numNodes;
++       { 
++        int myNode;
++      int numNodes;
++        int perNodeRecvs,k,i,n;
++      numNodes = _Cmi_numnodes;
++      myNode = _Cmi_mynode;
++      perNodeRecvs = numRecvs/(numNodes-1);
++      k =0;
 +      for( i=1;i<numNodes;i++){
++                n = (myNode + i)%numNodes;
 +              if(n  != myNode){ 
 +                              if (k==numNodes-2) 
++
 +                                      workRequests[numRecvs-1].next = NULL;
 +                              else
 +                                      workRequests[(k+1)*perNodeRecvs-1].next = NULL;
 +                              if(ibv_post_recv(context->qp[n],&workRequests[k*perNodeRecvs],&bad_wr)){CmiAssert(0);}
 +                              k++;
 +                              }
 +          }
++        }
 +#endif
  
        free(workRequests);
        free(sgElements);
@@@ -2215,10 -2178,9 +2249,10 @@@ static void increasePostedRecvs(int nod
        }
        node->infiData->postedRecvs+= recvIncrease;
        context->srqSize += recvIncrease;
 +#endif
        MACHSTATE3(3,"Increase tokens by %d to %d for node %d ",tokenIncrease,node->infiData->postedRecvs,nodeNo);
        //increase the size of the recvCq
-       int currentCqSize = context->recvCqSize;
+       currentCqSize = context->recvCqSize;
        if(ibv_resize_cq(context->recvCq,currentCqSize+tokenIncrease)){
                CmiAssert(0);
        }