Merge remote branch 'origin/charm' into charmrun_merge
author Abhishek Gupta <gupta59@illinois.edu>
Sun, 31 Oct 2010 11:43:39 +0000 (06:43 -0500)
committer Abhishek Gupta <gupta59@illinois.edu>
Sun, 31 Oct 2010 11:43:39 +0000 (06:43 -0500)
Conflicts:
src/arch/net/charmrun/Makefile
src/arch/net/charmrun/charmrun.c

1  2 
src/arch/net/charmrun/Makefile
src/arch/net/charmrun/charmrun.c
src/arch/net/machine.c
src/util/sockRoutines.c

index 11dd413f9ded1c07b74cfacfec7100b90444b029,6f5e6c562c35f35335f28d7ac8684c31afdeca8f..7d63519fa2777f9dc151298cacfbb89bde3d96d6
@@@ -1,5 -1,5 +1,5 @@@
  BIN=../../bin
- CHARMC=$(BIN)/charmc $(OPTS) -lm
 -CHARMC=$(BIN)/charmc $(OPTS) -I..
++CHARMC=$(BIN)/charmc $(OPTS) -lm -I..
  
  SHELL=/bin/sh
  
index 90001d567eaee39100ac6fb8b5588018468604d7,e2d42e04b4af5c2d52a7dfe3200a8ed87ded762f..404ac9aecb74c393c976cbc52cfef4d1e9362265
  #define MAXPATHLEN 1024
  #endif
  
- #define HSTART
 +
++//#define HSTART
 +#ifdef HSTART
 +/*Hierarchical-start routines*/
 +int mynodes_start ;   /* To keep a global node numbering */
 +
 +#endif
 +
  static double ftTimer;
  
  double start_timer;
@@@ -1529,15 -1455,19 +1575,19 @@@ void req_ccs_connect(void
  
  #define LOOPBACK 0
  #if LOOPBACK /*Immediately reply "there's nothing!" (for performance testing)*/
 -    CcsServer_sendReply(&h.hdr,0,0);
 +  CcsServer_sendReply(&h.hdr,0,0);
  #else
-   /*Fill out the charmrun header & forward the CCS request*/
-   ChMessageHeader_new("req_fw",sizeof(h.hdr)+reqBytes,&h.ch);  
-   
-   bufs[0]=&h; lens[0]=sizeof(h);
-   bufs[1]=reqData; lens[1]=reqBytes;
-   
-   skt_sendV(nodetab_ctrlfd(pe),2,bufs,lens);
+     int destpe = pe;
+ #if CMK_BLUEGENE_CHARM
+     destpe = destpe % nodetab_size;
+ #endif
+     if (replay_single) destpe = 0;
+     /*Fill out the charmrun header & forward the CCS request*/
+     ChMessageHeader_new("req_fw",sizeof(h.hdr)+reqBytes,&h.ch);  
+     bufs[0]=&h; lens[0]=sizeof(h);
+     bufs[1]=reqData; lens[1]=reqBytes;
+     skt_sendV(nodetab_ctrlfd(destpe),2,bufs,lens);
  
  #endif
    }
@@@ -2659,13 -2065,8 +2712,13 @@@ int client_connect_problem(int code,con
        return -1;
  }
  
              /** return 1 if connection is openned succesfully with client**/
+ /** return 1 if connection is openned succesfully with client**/
  int errorcheck_one_client_connect(int client){
 +#ifdef HSTART
 +      /* Child charmruns are already connected - Do we need to conect again*/ 
 +      if(arg_hierarchical_start && !arg_child_charmrun && charmrun_phase ==1) 
 +            return 1; 
 +#endif 
        unsigned int clientPort;/*These are actually ignored*/
        skt_ip_t clientIP;
        if (arg_verbose) printf("Charmrun> Waiting for %d-th client to connect.\n",client);
@@@ -2893,154 -2195,76 +2949,154 @@@ void req_one_client_connect(int client
  void exchange_qpdata_clients(){
        int proc,i;
        for( i=0;i<nodetab_rank0_size;i++){
+               int nt=nodetab_rank0_table[i];/*Nodetable index for this node*/ 
+               nodetab_table[nt]->qpData = malloc(sizeof(ChInfiAddr)*nodetab_rank0_size);
+       }
+       for(proc =0;proc< nodetab_rank0_size;proc++){
+               int count=0;
+               for(i=0;i<nodetab_rank0_size;i++){
+                       if(i == proc){
+                       }else{
                                int nt=nodetab_rank0_table[i];/*Nodetable index for this node*/ 
-                               nodetab_table[nt]->qpData = malloc(sizeof(ChInfiAddr)*nodetab_rank0_size);
+                               nodetab_table[nt]->qpData[proc] =  nodeinfo_arr[proc].qpList[count];
+       //                      printf("Charmrun> nt %d proc %d lid 0x%x qpn 0x%x psn 0x%x\n",nt,proc,ChMessageInt(nodetab_table[nt]->qpData[proc].lid),ChMessageInt(nodetab_table[nt]->qpData[proc].qpn),ChMessageInt(nodetab_table[nt]->qpData[proc].psn));
+                               count++;
                        }
-                       for(proc =0;proc< nodetab_rank0_size;proc++){
-                               int count=0;
-                               for(i=0;i<nodetab_rank0_size;i++){
-                                       if(i == proc){
-                                       }else{
-                                               int nt=nodetab_rank0_table[i];/*Nodetable index for this node*/ 
-                                               nodetab_table[nt]->qpData[proc] =  nodeinfo_arr[proc].qpList[count];
-                       //                      printf("Charmrun> nt %d proc %d lid 0x%x qpn 0x%x psn 0x%x\n",nt,proc,ChMessageInt(nodetab_table[nt]->qpData[proc].lid),ChMessageInt(nodetab_table[nt]->qpData[proc].qpn),ChMessageInt(nodetab_table[nt]->qpData[proc].psn));
-                                               count++;
-                                       }
-                               }
+               }
 -              free(nodeinfo_arr[proc].qpList);
 -      }
 -};
 +                              free(nodeinfo_arr[proc].qpList);
 +                      }
 +              };
 +
 +              void    send_clients_nodeinfo_qpdata(){
 +                      int node;
 +                      int msgSize = sizeof(ChMessageInt_t)+sizeof(ChNodeinfo)*nodetab_rank0_size+sizeof(ChInfiAddr)*nodetab_rank0_size;
 +                      for(node=0;node<nodetab_rank0_size;node++){
 +                              int nt=nodetab_rank0_table[node];/*Nodetable index for this node*/
 +              //              printf("Charmrun> Node %d proc %d sending initnodetab \n",node,nt);
 +                              ChMessageHeader hdr;
 +                              ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
 +                              ChMessageHeader_new("initnodetab",msgSize,&hdr);
 +                              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&hdr,sizeof(hdr));
 +                              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nNodes,sizeof(nNodes));
 +                              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)nodeinfo_arr,sizeof(ChNodeinfo)*nodetab_rank0_size);
 +                              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nodetab_table[nt]->qpData[0],sizeof(ChInfiAddr)*nodetab_rank0_size);                         
 +                      }
 +              }
 +#endif
  
 -void  send_clients_nodeinfo_qpdata(){
 -      int node;
 -      int msgSize = sizeof(ChMessageInt_t)+sizeof(ChNodeinfo)*nodetab_rank0_size+sizeof(ChInfiAddr)*nodetab_rank0_size;
 -      for(node=0;node<nodetab_rank0_size;node++){
 -              int nt=nodetab_rank0_table[node];/*Nodetable index for this node*/
 -//            printf("Charmrun> Node %d proc %d sending initnodetab \n",node,nt);
 -              ChMessageHeader hdr;
 -              ChMessageInt_t nNodes=ChMessageInt_new(nodetab_rank0_size);
 -              ChMessageHeader_new("initnodetab",msgSize,&hdr);
 -              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&hdr,sizeof(hdr));
 -              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nNodes,sizeof(nNodes));
 -              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)nodeinfo_arr,sizeof(ChNodeinfo)*nodetab_rank0_size);
 -              skt_sendN(nodetab_table[nt]->ctrlfd,(const char *)&nodetab_table[nt]->qpData[0],sizeof(ChInfiAddr)*nodetab_rank0_size);                         
 -      }
 -}
 +              struct timeval tim;
 +#define  getthetime(x) gettimeofday(&tim,NULL); x = tim.tv_sec + (tim.tv_usec/1000000.0);
 +#define getthetime1(x) gettimeofday(&tim,NULL); x = tim.tv_sec ;
 +              /*Wait for all the clients to connect to our server port*/
 +              void req_client_connect(void)
 +              {
 +                      int client;
 +#ifdef HSTART
 +                      if(!arg_hierarchical_start)
  #endif
 +                      nodeinfo_allocate();
 +                      req_nClients=nodetab_rank0_size;
 +                      req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
 +                      for(client=0;client<req_nClients;client++)
 +                              req_clients[client]=-1;
 +                      
 +                      skt_set_abort(client_connect_problem);
 +                      
 +#if CMK_IBVERBS_FAST_START
 +                      for (client=0;client<req_nClients;client++){
 +                              req_one_client_partinit(client);
 +                      }
 +                      for (client=0;client<req_nClients;client++){
 +                              read_initnode_one_client(client);
 +                      }
 +#else
  
 +                      req_set_client_connect(0,req_nClients);
  
 -/*Wait for all the clients to connect to our server port*/
 -void req_client_connect(void)
 -{
 -      int client;
 -      nodeinfo_allocate();
 -      req_nClients=nodetab_rank0_size;
 -      req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
 -      for(client=0;client<req_nClients;client++)
 -              req_clients[client]=-1;
 -      
 -      skt_set_abort(client_connect_problem);
 -      
 +#endif
 +                      
 +                              if (portOk == 0) exit(1);
 +                      if (arg_verbose) printf("Charmrun> All clients connected.\n");
 +#if CMK_USE_IBVERBS
 +                      exchange_qpdata_clients();
 +                      send_clients_nodeinfo_qpdata();
 +#else
 +#ifdef HSTART
 +                      if(arg_hierarchical_start) {
 +                              /* first we need to send data to parent charmrun and then send the nodeinfo to the clients*/
 +                      send_myNodeInfo_to_parent();
 +                      /*then receive from root */
 +                      forward_nodetab_to_children();
 +                      }
 +
 +                      else 
 +#endif
 +                      for (client=0;client<req_nClients;client++)     {                       
 +                              req_handle_initnodetab(NULL,req_clients[client]);
 +                      }
 +                      
 +#endif
 +                      if (arg_verbose) printf("Charmrun> IP tables sent.\n");
 +              }
 +              /*Wait for all the clients to connect to our server port, then collect and send nodetable to all */
 +#ifdef HSTART
 +              void req_charmrun_connect(void)
 +              {
 +              //      double t1, t2, t3, t4;
 +                      int client;
 +                      nodeinfo_allocate();
 +                      req_nClients=branchfactor;
 +                      req_clients=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
 +                      charmrun_fds=(SOCKET *)malloc(req_nClients*sizeof(SOCKET));
 +                      for(client=0;client<req_nClients;client++)
 +                              req_clients[client]=-1;
 +                      
 +                      skt_set_abort(client_connect_problem);
 +                      
  #if CMK_IBVERBS_FAST_START
 -      for (client=0;client<req_nClients;client++){
 -              req_one_client_partinit(client);
 -      }
 -      for (client=0;client<req_nClients;client++){
 -              read_initnode_one_client(client);
 -      }
 +                      for (client=0;client<req_nClients;client++){
 +                              req_one_client_partinit(client);
 +                      }
 +                      for (client=0;client<req_nClients;client++){
 +                              read_initnode_one_client(client);
 +                      }
  #else
 -      req_set_client_connect(0,req_nClients);
 +//if(!arg_child_charmrun) getthetime(t1);
 +
 +                      req_set_client_connect(0,req_nClients);
 +//if(!arg_child_charmrun)     getthetime(t2);         /* also need to process received nodesets JIT */
  #endif
 -      
 -        if (portOk == 0) exit(1);
 -      if (arg_verbose) printf("Charmrun> All clients connected.\n");
 +                      
 +                              if (portOk == 0) exit(1);
 +                      if (arg_verbose) printf("Charmrun> All clients connected.\n");
  #if CMK_USE_IBVERBS
 -      exchange_qpdata_clients();
 -      send_clients_nodeinfo_qpdata();
 +                      exchange_qpdata_clients();
 +                      send_clients_nodeinfo_qpdata();
  #else
 -      for (client=0;client<req_nClients;client++)
 -              req_handle_initnodetab(NULL,req_clients[client]);
 +                      for (client=0;client<req_nClients;client++)     {
 +                                                      // add flag to check what leval charmrun it is and what phase
 +                              req_handle_initnodedistribution(NULL, charmrun_fds[client], client);
 +                      }
 +//getthetime(t3);
 +
 +                      /* Now receive the nodetab from child charmruns*/
 +                      charmrun_phase = 1;
 +                      
 +                      skt_set_abort(client_connect_problem);
 +
 +                      req_set_client_connect(0,req_nClients);
 +
 +                      /* Already processed, so send*/
 +                      for (client=0;client<req_nClients;client++)     {                       
 +                              req_handle_initnodetab(NULL,req_clients[client]);
 +                      }
 +//if(!arg_child_charmrun) getthetime(t4);
 +#endif
 +                      if (arg_verbose) printf("Charmrun> IP tables sent.\n");
 +//if(!arg_child_charmrun) printf("Time for charmruns connect= %f , sending nodes to fire= %f, node clients connected= %f n ", t2-t1, t3-t2, t4-t3);
 +              }
 +
  #endif
 -      if (arg_verbose) printf("Charmrun> IP tables sent.\n");
 -}
  
  #ifndef CMK_BPROC
  
@@@ -3623,253 -2672,252 +3682,252 @@@ void start_nodes_local(char ** env
  
  #elif CMK_BPROC
  
 -int bproc_nodeisup(int node)
 -{
 -    int status = 0;
 +              int bproc_nodeisup(int node)
 +              {
 +                      int status = 0;
  #if CMK_BPROC_VERSION < 4
 -    if (bproc_nodestatus(node) == bproc_node_up) status = 1;
 -    if (arg_verbose)
 -      printf("Charmrun> node %d status: %s\n", node, status?"up":"down");
 +                      if (bproc_nodestatus(node) == bproc_node_up) status = 1;
 +                      if (arg_verbose)
 +                        printf("Charmrun> node %d status: %s\n", node, status?"up":"down");
  #else
 -    char nodestatus[128];
 -    if (node == -1) {         /* master node is always up */
 -      strcpy(nodestatus, "up");
 -      status = 1;
 -    }
 -    if (bproc_nodestatus(node, nodestatus, 128)) {
 -      if (strcmp(nodestatus, "up")==0) status = 1;
 -    }
 -    if (arg_verbose)
 -        printf("Charmrun> node %d status: %s\n", node, nodestatus);
 +                      char nodestatus[128];
 +                      if (node == -1) {               /* master node is always up */
 +                        strcpy(nodestatus, "up");
 +                        status = 1;
 +                      }
 +                      if (bproc_nodestatus(node, nodestatus, 128)) {
 +                        if (strcmp(nodestatus, "up")==0) status = 1;
 +                      }
 +                      if (arg_verbose)
 +                              printf("Charmrun> node %d status: %s\n", node, nodestatus);
  #endif
 -  return status;
 -}
 +                return status;
 +              }
  
  /**
 -  ++ppn now is supported in both SMP and non SMP version
 -  in SMP, ++ppn specifies number of threads on each node;
 -  in non-SMP, ++ppn specifies number of processes on each node.
 -*/
 -void nodetab_init_for_scyld()
 -{
 -  int maxNodes, i, node, npes, rank;
 -  nodetab_host group;
 -  int tablesize;
 -
 -  tablesize = arg_requested_pes;
 -  maxNodes = bproc_numnodes() + 1;
 -  if (arg_endpe < maxNodes) maxNodes=arg_endpe+1;
 -  if (maxNodes > tablesize) tablesize = maxNodes;
 -  nodetab_table=(nodetab_host**)malloc(tablesize*sizeof(nodetab_host*));
 -  nodetab_rank0_table=(int*)malloc(tablesize*sizeof(int));
 -  nodetab_max=tablesize;
 -
 -  nodetab_reset(&group);
 -
 -  if (arg_ppn==0) arg_ppn=1;
 -/*
 +                ++ppn now is supported in both SMP and non SMP version
 +                in SMP, ++ppn specifies number of threads on each node;
 +                in non-SMP, ++ppn specifies number of processes on each node.
 +              */
 +              void nodetab_init_for_scyld()
 +              {
 +                int maxNodes, i, node, npes, rank;
 +                nodetab_host group;
 +                int tablesize;
 +
 +                tablesize = arg_requested_pes;
 +                maxNodes = bproc_numnodes() + 1;
 +                if (arg_endpe < maxNodes) maxNodes=arg_endpe+1;
 +                if (maxNodes > tablesize) tablesize = maxNodes;
 +                nodetab_table=(nodetab_host**)malloc(tablesize*sizeof(nodetab_host*));
 +                nodetab_rank0_table=(int*)malloc(tablesize*sizeof(int));
 +                nodetab_max=tablesize;
 +
 +                nodetab_reset(&group);
 +
 +                if (arg_ppn==0) arg_ppn=1;
 +              /*
  #if CMK_SHARED_VARS_UNAVAILABLE
 -  if (arg_ppn > 1) {
 -    fprintf(stderr,"Warning> Invalid ppn %d in nodelist ignored.\n", arg_ppn);
 -    arg_ppn=1;
 -  }
 +                if (arg_ppn > 1) {
 +                      fprintf(stderr,"Warning> Invalid ppn %d in nodelist ignored.\n", arg_ppn);
 +                      arg_ppn=1;
 +                }
  #endif
 -*/
 -  group.cpus = 1;
 -  group.rank = 0;
 -
 -  /* check which slave node is available from frompe to endpe */
 -  npes = 0;
 -  for (i=-1; i<maxNodes && npes < arg_requested_pes; i++) {
 -    char hostname[256];
 -    if (!bproc_nodeisup(i)) continue;
 -    if (i!= -1 && i<arg_startpe) continue;
 -    if (i==-1 && arg_skipmaster) continue;    /* skip master node -1 */
 -    sprintf(hostname, "%d", i);
 +              */
 +                group.cpus = 1;
 +                group.rank = 0;
 +
 +                /* check which slave node is available from frompe to endpe */
 +                npes = 0;
 +                for (i=-1; i<maxNodes && npes < arg_requested_pes; i++) {
 +                      char hostname[256];
 +                      if (!bproc_nodeisup(i)) continue;
 +                      if (i!= -1 && i<arg_startpe) continue;
 +                      if (i==-1 && arg_skipmaster) continue;    /* skip master node -1 */
 +                      sprintf(hostname, "%d", i);
  #if ! CMK_SHARED_VARS_UNAVAILABLE
 -    if (npes + arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
 -    else group.cpus = arg_ppn;
 +                      if (npes + arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
 +                      else group.cpus = arg_ppn;
  #endif
 -    for (rank = 0; rank<arg_ppn; rank++) {
 +                      for (rank = 0; rank<arg_ppn; rank++) {
  #if ! CMK_SHARED_VARS_UNAVAILABLE
 -      group.rank = rank;
 +                        group.rank = rank;
  #endif
 -      nodetab_makehost(hostname, &group);
 -      if (++npes == arg_requested_pes) break;
 -    }   
 -  }
 -  if (nodetab_rank0_size == 0) {
 -    fprintf(stderr, "Charmrun> no slave node available!\n");
 -    exit (1);
 -  }
 -  if (arg_verbose)
 -    printf("Charmrun> There are %d slave nodes available.\n", nodetab_rank0_size-(arg_skipmaster?0:1));
 -
 -  /* expand node table to arg_requested_pes */
 -  if (arg_requested_pes > npes) {
 -    int orig_size = npes;
 -    int node;
 -    int startnode = 0;
 -    if (arg_singlemaster && nodetab_rank0_size > 1 && !arg_skipmaster) 
 -      startnode = arg_ppn;      /* skip -1 */
 -    node = startnode; 
 -    while (npes < arg_requested_pes) {
 +                        nodetab_makehost(hostname, &group);
 +                        if (++npes == arg_requested_pes) break;
 +                      }   
 +                }
 +                if (nodetab_rank0_size == 0) {
 +                      fprintf(stderr, "Charmrun> no slave node available!\n");
 +                      exit (1);
 +                }
 +                if (arg_verbose)
 +                      printf("Charmrun> There are %d slave nodes available.\n", nodetab_rank0_size-(arg_skipmaster?0:1));
 +
 +                /* expand node table to arg_requested_pes */
 +                if (arg_requested_pes > npes) {
 +                      int orig_size = npes;
 +                      int node;
 +                      int startnode = 0;
 +                      if (arg_singlemaster && nodetab_rank0_size > 1 && !arg_skipmaster) 
 +                              startnode = arg_ppn;      /* skip -1 */
 +                      node = startnode; 
 +                      while (npes < arg_requested_pes) {
  #if ! CMK_SHARED_VARS_UNAVAILABLE
 -      if (npes+arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
 -      else group.cpus = arg_ppn;
 +                        if (npes+arg_ppn > arg_requested_pes) group.cpus = arg_requested_pes-npes;
 +                        else group.cpus = arg_ppn;
  #endif
 -      for (rank = 0; rank<arg_ppn; rank++) {
 +                        for (rank = 0; rank<arg_ppn; rank++) {
  #if ! CMK_SHARED_VARS_UNAVAILABLE
 -        group.rank = rank;
 +                              group.rank = rank;
  #endif
 -        nodetab_makehost(nodetab_name(node), &group);
 -        if (++node == orig_size) node = startnode;
 -        if (++npes == arg_requested_pes) break;
 -      } 
 -    }
 -  }
 -}
 -
 -void start_nodes_scyld(void)
 -{
 -  char *envp[2];
 -  int i;
 -
 -  envp[0] = (char *)malloc(256);
 -  envp[1] = 0;
 -  for (i=0;i<nodetab_rank0_size;i++)
 -  {
 -    int status = 0;
 -    int pid;
 -    int pe=nodetab_rank0_table[i];
 -    int nodeno = atoi(nodetab_name(pe));
 +                              nodetab_makehost(nodetab_name(node), &group);
 +                              if (++node == orig_size) node = startnode;
 +                              if (++npes == arg_requested_pes) break;
 +                        } 
 +                      }
 +                }
 +              }
  
 -    if (arg_verbose)
 -      printf("Charmrun> start node program on slave node: %d.\n", nodeno);
 -    sprintf(envp[0], "NETSTART=%s",  create_netstart(i));
 -    pid = 0;
 -    pid = fork();
 -    if (pid < 0) exit(1);
 -    if (pid == 0)
 -    {
 -      int fd, fd1 = dup(1);
 -      if (!(arg_debug || arg_debug_no_pause)) {   /* debug mode */
 -        if (fd = open("/dev/null", O_RDWR)) {
 -          dup2(fd, 0); dup2(fd, 1); dup2(fd, 2);
 -        }
 -      }
 -      if (nodeno == -1) {
 -        status = execve(pparam_argv[1], pparam_argv+1, envp);
 -        dup2(fd1, 1);
 -        printf("execve failed to start process \"%s\" with status: %d\n", pparam_argv[1], status);
 -      }
 -      else {
 -        status = bproc_execmove(nodeno, pparam_argv[1], pparam_argv+1, envp);
 -        dup2(fd1, 1);
 -        printf("bproc_execmove failed to start remote process \"%s\" with status: %d\n", pparam_argv[1], status);
 -      }
 -      kill(getppid(), 9);
 -      exit(1);
 -    }
 -  }
 -  free(envp[0]);
 -}
 -void finish_nodes(void) {}
 +              void start_nodes_scyld(void)
 +              {
 +                char *envp[2];
 +                int i;
 +
 +                envp[0] = (char *)malloc(256);
 +                envp[1] = 0;
 +                for (i=0;i<nodetab_rank0_size;i++)
 +                {
 +                      int status = 0;
 +                      int pid;
 +                      int pe=nodetab_rank0_table[i];
 +                      int nodeno = atoi(nodetab_name(pe));
 +
 +                      if (arg_verbose)
 +                        printf("Charmrun> start node program on slave node: %d.\n", nodeno);
 +                      sprintf(envp[0], "NETSTART=%s",  create_netstart(i));
 +                      pid = 0;
 +                      pid = fork();
 +                      if (pid < 0) exit(1);
 +                      if (pid == 0)
 +                      {
 +                        int fd, fd1 = dup(1);
 +                        if (!(arg_debug || arg_debug_no_pause)) {   /* debug mode */
 +                              if (fd = open("/dev/null", O_RDWR)) {
 +                                dup2(fd, 0); dup2(fd, 1); dup2(fd, 2);
 +                              }
 +                        }
 +                        if (nodeno == -1) {
 +                              status = execve(pparam_argv[1], pparam_argv+1, envp);
 +                              dup2(fd1, 1);
 +                              printf("execve failed to start process \"%s\" with status: %d\n", pparam_argv[1], status);
 +                        }
 +                        else {
 +                              status = bproc_execmove(nodeno, pparam_argv[1], pparam_argv+1, envp);
 +                              dup2(fd1, 1);
 +                              printf("bproc_execmove failed to start remote process \"%s\" with status: %d\n", pparam_argv[1], status);
 +                        }
 +                        kill(getppid(), 9);
 +                        exit(1);
 +                      }
 +                }
 +                free(envp[0]);
 +              }
 +              void finish_nodes(void) {}
  
  #else
 -/*Unix systems can use Rsh normally*/
 -/********** RSH-ONLY CODE *****************************************/
 -/*                                                                          */
 -/* Rsh_etc                                                                  */
 -/*                                                                          */
 -/* this starts all the node programs.  It executes fully in the background. */
 -/*                                                                          */
 -/****************************************************************************/
 +              /*Unix systems can use Rsh normally*/
 +              /********** RSH-ONLY CODE *****************************************/
 +              /*                                                                          */
 +              /* Rsh_etc                                                                  */
 +              /*                                                                          */
 +              /* this starts all the node programs.  It executes fully in the background. */
 +              /*                                                                          */
 +              /****************************************************************************/
  #include <sys/wait.h>
  
 -extern char **environ;
 -void removeEnv(const char *doomedEnv)
 -{ /*Remove a value from the environment list*/
 -      char **oe, **ie;
 -      oe=ie=environ;
 -      while (*ie != NULL) {
 -        if (0!=strncmp(*ie,doomedEnv,strlen(doomedEnv)))
 -          *oe++ = *ie;
 -        ie++;
 -      }
 -      *oe=NULL;/*NULL-terminate list*/
 -}
 -
 -int rsh_fork(int nodeno,const char *startScript)
 -{
 -  char **rshargv;
 -  int pid;
 -  int num=0;
 -  char *s, *e;
 -
 -  /* figure out size and dynamic allocate */
 -  s=nodetab_shell(nodeno); e=skipstuff(s);
 -  while (*s) {
 -    num++;
 -    s = skipblanks(e); e = skipstuff(s);
 -  }
 -  rshargv = (char **)malloc(sizeof(char *)*(num+6));
 -
 -  num = 0;
 -  s=nodetab_shell(nodeno); e=skipstuff(s);
 -  while (*s) {
 -    rshargv[num++]=substr(s, e);
 -    s = skipblanks(e); e = skipstuff(s);
 -  }
 +              extern char **environ;
 +              void removeEnv(const char *doomedEnv)
 +              { /*Remove a value from the environment list*/
 +                        char **oe, **ie;
 +                        oe=ie=environ;
 +                        while (*ie != NULL) {
 +                              if (0!=strncmp(*ie,doomedEnv,strlen(doomedEnv)))
 +                                *oe++ = *ie;
 +                              ie++;
 +                        }
 +                        *oe=NULL;/*NULL-terminate list*/
 +              }
  
 -  rshargv[num++]=nodetab_name(nodeno);
 -  rshargv[num++]="-l";
 -  rshargv[num++]=nodetab_login(nodeno);
 -  rshargv[num++]="/bin/sh -f";
 -  rshargv[num++]=0;
 -  if (arg_verbose) printf("Charmrun> Starting %s %s -l %s %s\n",nodetab_shell(nodeno), nodetab_name(nodeno),nodetab_login(nodeno), rshargv[num-2]);
 -  
 -  pid = fork();
 -  if (pid < 0) 
 -      { perror("ERROR> starting rsh"); exit(1); }
 -  if (pid == 0)
 -  {/*Child process*/
 -      int i;
 -      int fdScript=open(startScript,O_RDONLY);
 -  /**/  unlink(startScript); /**/
 -      dup2(fdScript,0);/*Open script as standard input*/
 -      //removeEnv("DISPLAY="); /*No DISPLAY disables ssh's slow X11 forwarding*/
 -      for(i=3; i<1024; i++) close(i);
 -      execvp(rshargv[0], rshargv);
 -      fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
 -      exit(1);
 -  }
 -  free(rshargv);
 -  if (arg_verbose)
 -    fprintf(stderr,"Charmrun> remote shell (%s:%d) started\n",
 -      nodetab_name(nodeno),nodeno);
 -  return pid;
 -}
 +              int rsh_fork(int nodeno,const char *startScript)
 +              {
 +                char **rshargv;
 +                int pid;
 +                int num=0;
 +                char *s, *e;
 +
 +                /* figure out size and dynamic allocate */
 +                s=nodetab_shell(nodeno); e=skipstuff(s);
 +                while (*s) {
 +                      num++;
 +                      s = skipblanks(e); e = skipstuff(s);
 +                }
 +                rshargv = (char **)malloc(sizeof(char *)*(num+6));
 +
 +                num = 0;
 +                s=nodetab_shell(nodeno); e=skipstuff(s);
 +                while (*s) {
 +                      rshargv[num++]=substr(s, e);
 +                      s = skipblanks(e); e = skipstuff(s);
 +                }
 +
 +                rshargv[num++]=nodetab_name(nodeno);
 +                rshargv[num++]="-l";
 +                rshargv[num++]=nodetab_login(nodeno);
 +                rshargv[num++]="/bin/sh -f";
 +                rshargv[num++]=0;
 +                if (arg_verbose) printf("Charmrun> Starting %s %s -l %s %s\n",nodetab_shell(nodeno), nodetab_name(nodeno),nodetab_login(nodeno), rshargv[num-2]);
 +                
 +                pid = fork();
 +                if (pid < 0) 
 +                      { perror("ERROR> starting rsh"); exit(1); }
 +                if (pid == 0)
 +                {/*Child process*/
 +                        int i;
 +                        int fdScript=open(startScript,O_RDONLY);
 +                /**/  unlink(startScript); /**/
 +                        dup2(fdScript,0);/*Open script as standard input*/
 +                        //removeEnv("DISPLAY="); /*No DISPLAY disables ssh's slow X11 forwarding*/
 +                        for(i=3; i<1024; i++) close(i);
 +                        execvp(rshargv[0], rshargv);
 +                        fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
 +                        exit(1);
 +                }
 +                free(rshargv);
 +                if (arg_verbose)
 +                      fprintf(stderr,"Charmrun> remote shell (%s:%d) started\n",
 +                              nodetab_name(nodeno),nodeno);
 +                return pid;
 +              }
  
-               void fprint_arg(FILE *f,char **argv)
-               {
-                 while (*argv) { 
-                       fprintf(f," %s",*argv); 
-                       argv++; 
-                 }
-               }
-               void rsh_Find(FILE *f,const char *program,const char *dest)
-               {
-                       fprintf(f,"Find %s\n",program);
-                       fprintf(f,"%s=$loc\n",dest);
-               }
-               void rsh_script(FILE *f, int nodeno, int rank0no, char **argv, int restart)
-               {
-                 char *netstart;
-                 char *arg_nodeprog_r,*arg_currdir_r;
-                 char *dbg=nodetab_debugger(nodeno);
-                 char *host=nodetab_name(nodeno);
- #define CLOSE_ALL " < /dev/null 1> /dev/null 2> /dev/null &"
+ void fprint_arg(FILE *f,char **argv) /* Write each string of argv to f, every one preceded by a single space. argv must end with a NULL entry. */
+ {
+   while (*argv) { /* stop at the NULL entry that terminates the array */
+       fprintf(f," %s",*argv); /* strings are written verbatim, with no quoting or escaping */
+       argv++; 
+   }
+ }
+ void rsh_Find(FILE *f,const char *program,const char *dest) /* Append two lines of script text to f: "Find <program>" and "<dest>=$loc". */
+ {
+     fprintf(f,"Find %s\n",program); /* Find is not defined in this block; presumably a function of the generated script that leaves its result in $loc -- TODO confirm */
+     fprintf(f,"%s=$loc\n",dest); /* copy $loc into the script variable named by dest */
+ }
+ void rsh_script(FILE *f, int nodeno, int rank0no, char **argv, int restart)
+ {
+   char *netstart;
+   char *arg_nodeprog_r,*arg_currdir_r;
+   char *dbg=nodetab_debugger(nodeno);
+   char *host=nodetab_name(nodeno);
  
    if (arg_mpiexec)
          fprintf(f, "#!/bin/sh\n");
  */
    if (arg_display && !arg_ssh_display)
      fprintf(f,"DISPLAY='%s';export DISPLAY\n",arg_display);
- #ifdef HSTART
-         if(arg_hierarchical_start && arg_child_charmrun)
-                         netstart = create_netstart(mynodes_start+rank0no);
-         else
- #endif
-   netstart = create_netstart(rank0no);
-   fprintf(f,"NETSTART=\"%s\";export NETSTART\n",netstart);
 +
 +#ifdef HSTART
 +  if(arg_child_charmrun)
 +                fprintf(f,"NETMAGIC=\"%d\";export NETMAGIC\n",parent_charmrun_pid&0x7FFF);
 +  else
 +#endif
 +              fprintf(f,"NETMAGIC=\"%d\";export NETMAGIC\n",getpid()&0x7FFF);
 +
-   if (arg_mpiexec)
-     fprintf(f,"CmiMyNode=$OMPI_COMM_WORLD_RANK; export CmiMyNode\n");
+   if (arg_mpiexec) {
+     fprintf(f,"CmiMyNode=$OMPI_COMM_WORLD_RANK\n");
+     fprintf(f,"test -z \"$CmiMyNode\" && CmiMyNode=$MPIRUN_RANK\n");
+     fprintf(f,"test -z \"$CmiMyNode\" && CmiMyNode=$PMI_RANK\n");
+     fprintf(f,"export CmiMyNode\n");
+   }
 +#ifdef HSTART
 +  else  if(arg_hierarchical_start && arg_child_charmrun)
 +                       fprintf(f,"CmiMyNode='%d'; export CmiMyNode\n",mynodes_start+rank0no);
 +#endif
-       else
-                 fprintf(f,"CmiMyNode='%d'; export CmiMyNode\n",rank0no);
+   else
+     fprintf(f,"CmiMyNode='%d'; export CmiMyNode\n",rank0no);
++#ifdef HSTART
++  if(arg_hierarchical_start && arg_child_charmrun)
++                        netstart = create_netstart(mynodes_start+rank0no);
++  else
++#endif
+   netstart = create_netstart(rank0no);
+   fprintf(f,"NETSTART=\"%s\";export NETSTART\n",netstart);
  
    fprintf(f,"CmiMyNodeSize='%d'; export CmiMyNodeSize\n",nodetab_getnodeinfo(rank0no)->cpus);
  
      fprintf(f,"CmiMyForks='%d'; export CmiMyForks\n",0);
    else
      fprintf(f,"CmiMyForks='%d'; export CmiMyForks\n",nodetab_getnodeinfo(rank0no)->forks);
-   if (arg_mpiexec)
-     fprintf(f,"CmiNumNodes=$OMPI_COMM_WORLD_SIZE; export CmiNumNodes\n");
+   if (arg_mpiexec) {
+     fprintf(f,"CmiNumNodes=$OMPI_COMM_WORLD_SIZE\n");
+     fprintf(f,"test -z \"$CmiNumNodes\" && CmiNumNodes=$MPIRUN_NPROCS\n");
+     fprintf(f,"test -z \"$CmiNumNodes\" && CmiNumNodes=$PMI_SIZE\n");
+     fprintf(f,"export CmiNumNodes\n");
+   }
 +#ifdef HSTART
 +  else        if(arg_hierarchical_start && arg_child_charmrun)
 +               fprintf(f,"CmiNumNodes='%d'; export CmiNumNodes\n",nodetab_rank0_size_total);
 +#endif
++
    else
-   fprintf(f,"CmiNumNodes='%d'; export CmiNumNodes\n",nodetab_rank0_size);
+     fprintf(f,"CmiNumNodes='%d'; export CmiNumNodes\n",nodetab_rank0_size);
  #if CONVERSE_VERSION_VMI
    /* VMI environment variable */
    fprintf (f, "VMI_PROCS='%d'; export VMI_PROCS\n", arg_requested_pes);
    
    /* find the current directory, relative version */
    arg_currdir_r = pathfix(arg_currdir_a, nodetab_pathfixes(nodeno));
 -
--  if (arg_verbose) {
 -    printf("Charmrun> find the node program \"%s\" at \"%s\" for %d.\n", arg_nodeprog_r, arg_currdir_r, nodeno);
++  
++ if (arg_verbose) {
 +      printf("Charmrun> find the node program \"%s\" at \"%s\" for %d.\n", arg_nodeprog_r, arg_currdir_r, nodeno);
    }
 -
    if (arg_debug || arg_debug_no_pause || arg_in_xterm) {
 -    rsh_Find(f,nodetab_xterm(nodeno),"F_XTERM");
 -    if(!arg_ssh_display && !arg_debug_no_xrdb)
 -      rsh_Find(f,"xrdb","F_XRDB");
 -    if(arg_verbose) fprintf(f,"Echo 'using xterm' $F_XTERM\n");
 +      rsh_Find(f,nodetab_xterm(nodeno),"F_XTERM");
-                       if(!arg_ssh_display && !arg_debug_no_xrdb)
-                         rsh_Find(f,"xrdb","F_XRDB");
-                       if(arg_verbose) fprintf(f,"Echo 'using xterm' $F_XTERM\n");
-                 }
++      if(!arg_ssh_display && !arg_debug_no_xrdb)
++        rsh_Find(f,"xrdb","F_XRDB");
++      if(arg_verbose) fprintf(f,"Echo 'using xterm' $F_XTERM\n");
+   }
  
-                 if (arg_debug || arg_debug_no_pause)
-                 {/*Look through PATH for debugger*/
-                       rsh_Find(f,dbg,"F_DBG");
-                       if (arg_verbose) fprintf(f,"Echo 'using debugger' $F_DBG\n");
-                 }
+   if (arg_debug || arg_debug_no_pause)
+   {/*Look through PATH for debugger*/
 -    rsh_Find(f,dbg,"F_DBG");
 -    if (arg_verbose) fprintf(f,"Echo 'using debugger' $F_DBG\n");
++      rsh_Find(f,dbg,"F_DBG");
++      if (arg_verbose) fprintf(f,"Echo 'using debugger' $F_DBG\n");
+   }
  
-                  if (!arg_ssh_display && !arg_debug_no_xrdb && 
-                          (arg_debug || arg_debug_no_pause || arg_in_xterm)) {
-                        /*    if (arg_debug || arg_debug_no_pause || arg_in_xterm) {*/
-                       fprintf(f,"$F_XRDB -query > /dev/null\n");
-                       fprintf(f,"if test $? != 0\nthen\n");
-                       fprintf(f,"  Echo 'Cannot contact X Server '$DISPLAY'.  You probably'\n");
-                       fprintf(f,"  Echo 'need to run xhost to authorize connections.'\n");
-                       fprintf(f,"  Echo '(See manual for xhost for security issues)'\n");
-                       fprintf(f,"  Echo 'Or try ++batch 1 ++ssh-display to rely on SSH X11 forwarding'\n");
-                       fprintf(f,"  Exit 1\n");
-                       fprintf(f,"fi\n");
-                 }
-                 
-                 fprintf(f,"if test ! -x \"%s\"\nthen\n",arg_nodeprog_r);
-                 fprintf(f,"  Echo 'Cannot locate this node-program: %s'\n",arg_nodeprog_r);
-                 fprintf(f,"  Exit 1\n");
-                 fprintf(f,"fi\n");
-                 
-                 fprintf(f,"cd \"%s\"\n",arg_currdir_r);
-                 fprintf(f,"if test $? = 1\nthen\n");
-                 fprintf(f,"  Echo 'Cannot propagate this current directory:'\n"); 
-                 fprintf(f,"  Echo '%s'\n",arg_currdir_r);
-                 fprintf(f,"  Exit 1\n");
-                 fprintf(f,"fi\n");
-                 
-                 if (strcmp(nodetab_setup(nodeno),"*")) {
-                       fprintf(f,"%s\n",nodetab_setup(nodeno));
-                       fprintf(f,"if test $? = 1\nthen\n");
-                       fprintf(f,"  Echo 'this initialization command failed:'\n");
-                       fprintf(f,"  Echo '\"%s\"'\n",nodetab_setup(nodeno));
-                       fprintf(f,"  Echo 'edit your nodes file to fix it.'\n");
-                       fprintf(f,"  Exit 1\n");
-                       fprintf(f,"fi\n");
-                 }
+    if (!arg_ssh_display && !arg_debug_no_xrdb && 
+        (arg_debug || arg_debug_no_pause || arg_in_xterm)) {
+      /*    if (arg_debug || arg_debug_no_pause || arg_in_xterm) {*/
+     fprintf(f,"$F_XRDB -query > /dev/null\n");
+     fprintf(f,"if test $? != 0\nthen\n");
+     fprintf(f,"  Echo 'Cannot contact X Server '$DISPLAY'.  You probably'\n");
+     fprintf(f,"  Echo 'need to run xhost to authorize connections.'\n");
+     fprintf(f,"  Echo '(See manual for xhost for security issues)'\n");
+     fprintf(f,"  Echo 'Or try ++batch 1 ++ssh-display to rely on SSH X11 forwarding'\n");
+     fprintf(f,"  Exit 1\n");
+     fprintf(f,"fi\n");
+   }
+   
+   fprintf(f,"if test ! -x \"%s\"\nthen\n",arg_nodeprog_r);
+   fprintf(f,"  Echo 'Cannot locate this node-program: %s'\n",arg_nodeprog_r);
+   fprintf(f,"  Exit 1\n");
+   fprintf(f,"fi\n");
+   
+   fprintf(f,"cd \"%s\"\n",arg_currdir_r);
+   fprintf(f,"if test $? = 1\nthen\n");
+   fprintf(f,"  Echo 'Cannot propagate this current directory:'\n"); 
+   fprintf(f,"  Echo '%s'\n",arg_currdir_r);
+   fprintf(f,"  Exit 1\n");
+   fprintf(f,"fi\n");
+   
+   if (strcmp(nodetab_setup(nodeno),"*")) {
+     fprintf(f,"%s\n",nodetab_setup(nodeno));
+     fprintf(f,"if test $? = 1\nthen\n");
+     fprintf(f,"  Echo 'this initialization command failed:'\n");
+     fprintf(f,"  Echo '\"%s\"'\n",nodetab_setup(nodeno));
+     fprintf(f,"  Echo 'edit your nodes file to fix it.'\n");
+     fprintf(f,"  Exit 1\n");
+     fprintf(f,"fi\n");
+   }
  
-                 fprintf(f,"rm -f /tmp/charmrun_err.$$\n");
-                 if(arg_verbose) fprintf(f,"Echo 'starting node-program...'\n");  
-                 /* This is the start of the the run-nodeprogram script */
-                 fprintf(f,"(");
-                 
-                 if (arg_debug || arg_debug_no_pause ) {
-                        if ( strcmp(dbg, "gdb") == 0 || strcmp(dbg, "idb") == 0 ) {
-                                  fprintf(f,"cat > /tmp/charmrun_gdb.$$ << END_OF_SCRIPT\n");
-                          if ( strcmp(dbg, "idb") == 0 ) {
-                                        fprintf(f,"set \\$cmdset=\"gdb\"\n");
-                          }
-                          fprintf(f,"shell /bin/rm -f /tmp/charmrun_gdb.$$\n");
-                                  fprintf(f,"handle SIGPIPE nostop noprint\n");
-                                  fprintf(f,"handle SIGWINCH nostop noprint\n");
-                                  fprintf(f,"handle SIGWAITING nostop noprint\n");
-                          if(arg_debug_commands)
-                                fprintf(f,"%s\n", arg_debug_commands);
-                                  fprintf(f,"set args");
-                                  fprint_arg(f,argv);
-                                  fprintf(f,"\n");
-                                  if (arg_debug_no_pause) fprintf(f,"run\n");
-                                  fprintf(f,"END_OF_SCRIPT\n");
-                          if (arg_runscript)
-                                fprintf(f,"\"%s\" ",arg_runscript);
-                                  fprintf(f,"$F_XTERM");
-                                  fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
-                          if ( strcmp(dbg, "idb") == 0 )
-                                        fprintf(f," -e $F_DBG %s -c /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
-                                  else 
-                                        fprintf(f," -e $F_DBG %s -x /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
-                                } else if ( strcmp(dbg, "dbx") == 0 ) {
-                                  fprintf(f,"cat > /tmp/charmrun_dbx.$$ << END_OF_SCRIPT\n");
-                                  fprintf(f,"sh /bin/rm -f /tmp/charmrun_dbx.$$\n");
-                                  fprintf(f,"dbxenv suppress_startup_message 5.0\n");
-                                  fprintf(f,"ignore SIGPOLL\n");
-                                  fprintf(f,"ignore SIGPIPE\n");
-                                  fprintf(f,"ignore SIGWINCH\n");
-                                  fprintf(f,"ignore SIGWAITING\n");
-                          if(arg_debug_commands)
-                                fprintf(f,"%s\n", arg_debug_commands);
-                                  fprintf(f,"END_OF_SCRIPT\n");
-                          if (arg_runscript)
-                                fprintf(f,"\"%s\" ",arg_runscript);
-                                  fprintf(f,"$F_XTERM");
-                                  fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
-                                  fprintf(f," -e $F_DBG %s ",arg_debug_no_pause?"-r":"");
-                          if(arg_debug) {
-                                         fprintf(f,"-c \'runargs ");
-                                         fprint_arg(f,argv);
-                                         fprintf(f,"\' ");
-                          }
-                          fprintf(f, "-s/tmp/charmrun_dbx.$$ %s",arg_nodeprog_r);
-                          if(arg_debug_no_pause) 
-                                         fprint_arg(f,argv);
-                                  fprintf(f,"\n");
-                        } else { 
-                         fprintf(stderr, "Unknown debugger: %s.\n Exiting.\n", 
-                               nodetab_debugger(nodeno));
-                        }
-                 } else if (arg_in_xterm) {
-                       if(arg_verbose)
-                         fprintf(stderr, "Charmrun> node %d: xterm is %s\n", 
-                                         nodeno, nodetab_xterm(nodeno));
-                       fprintf(f,"cat > /tmp/charmrun_inx.$$ << END_OF_SCRIPT\n");
-                       fprintf(f,"#!/bin/sh\n");
-                       fprintf(f,"/bin/rm -f /tmp/charmrun_inx.$$\n");
-                       fprintf(f,"%s", arg_nodeprog_r);
-                       fprint_arg(f,argv);
-                       fprintf(f,"\n");
-                       fprintf(f,"echo 'program exited with code '\\$?\n");
-                       fprintf(f,"read eoln\n");
-                       fprintf(f,"END_OF_SCRIPT\n");
-                       fprintf(f,"chmod 700 /tmp/charmrun_inx.$$\n");
-                       if (arg_runscript)
-                          fprintf(f,"\"%s\" ",arg_runscript);
-                       fprintf(f,"$F_XTERM -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
-                       fprintf(f," -sl 5000");
-                       fprintf(f," -e /tmp/charmrun_inx.$$\n");
-                 } else {
-                       if (arg_runscript)
-                          fprintf(f,"\"%s\" ",arg_runscript);
-                       fprintf(f,"\"%s\" ",arg_nodeprog_r);
-                       fprint_arg(f,argv);
-                       if (nodetab_nice(nodeno) != -100) {
-                         if(arg_verbose) fprintf(stderr, "Charmrun> nice -n %d\n", nodetab_nice(nodeno));
-                         fprintf(f," +nice %d ",nodetab_nice(nodeno));
-                       }
-                       fprintf(f,"\nres=$?\n");
-                       /* If shared libraries fail to load, the program dies without
-                          calling charmrun back.  Since we *have* to close down stdin/out/err,
-                          we have to smuggle this failure information out via a file,
-                          /tmp/charmrun_err.<pid> */
-                       fprintf(f,
-                               "if [ $res -eq 127 ]\n"
-                       "then\n"
-                       "  ( \n" /* Re-run, spitting out errors from a subshell: */
-                       "    \"%s\" \n"
-                       "    ldd \"%s\"\n"
-                       "  ) > /tmp/charmrun_err.$$ 2>&1 \n"
-                       "fi\n",arg_nodeprog_r,arg_nodeprog_r);
-                 }
-                 
-                 /* End the node-program subshell. To minimize the number 
-                        of open ports on the front-end, we must close down rsh;
-                        to do this, we have to close stdin, stdout, stderr, and 
-                        run the subshell in the background. */
-                 fprintf(f,")");
-                 fprintf(f,CLOSE_ALL "\n");
-                 
-                 if (arg_verbose) fprintf(f,"Echo 'rsh phase successful.'\n");
-                 fprintf(f, /* Check for startup errors: */
-                        "sleep 1\n"
-                        "if [ -r /tmp/charmrun_err.$$ ]\n"
-                        "then\n"
-                        "  cat /tmp/charmrun_err.$$ \n"
-                        "  rm -f /tmp/charmrun_err.$$ \n"
-                        "  Exit 1\n"
-                        "fi\n");
-                 fprintf(f,"Exit 0\n");
-               }
+   fprintf(f,"rm -f /tmp/charmrun_err.$$\n");
+   if(arg_verbose) fprintf(f,"Echo 'starting node-program...'\n");  
+   /* This is the start of the run-nodeprogram script */
+   fprintf(f,"(");
+   
+   if (arg_debug || arg_debug_no_pause ) {
+        if ( strcmp(dbg, "gdb") == 0 || strcmp(dbg, "idb") == 0 ) {
+            fprintf(f,"cat > /tmp/charmrun_gdb.$$ << END_OF_SCRIPT\n");
+          if ( strcmp(dbg, "idb") == 0 ) {
+              fprintf(f,"set \\$cmdset=\"gdb\"\n");
+          }
+          fprintf(f,"shell /bin/rm -f /tmp/charmrun_gdb.$$\n");
+            fprintf(f,"handle SIGPIPE nostop noprint\n");
+            fprintf(f,"handle SIGWINCH nostop noprint\n");
+            fprintf(f,"handle SIGWAITING nostop noprint\n");
+          if(arg_debug_commands)
+            fprintf(f,"%s\n", arg_debug_commands);
+            fprintf(f,"set args");
+            fprint_arg(f,argv);
+            fprintf(f,"\n");
+            if (arg_debug_no_pause) fprintf(f,"run\n");
+            fprintf(f,"END_OF_SCRIPT\n");
+          if (arg_runscript)
+            fprintf(f,"\"%s\" ",arg_runscript);
+            fprintf(f,"$F_XTERM");
+            fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
+          if ( strcmp(dbg, "idb") == 0 )
+              fprintf(f," -e $F_DBG %s -c /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
+            else 
+              fprintf(f," -e $F_DBG %s -x /tmp/charmrun_gdb.$$ \n", arg_nodeprog_r);
+          } else if ( strcmp(dbg, "dbx") == 0 ) {
+            fprintf(f,"cat > /tmp/charmrun_dbx.$$ << END_OF_SCRIPT\n");
+            fprintf(f,"sh /bin/rm -f /tmp/charmrun_dbx.$$\n");
+            fprintf(f,"dbxenv suppress_startup_message 5.0\n");
+            fprintf(f,"ignore SIGPOLL\n");
+            fprintf(f,"ignore SIGPIPE\n");
+            fprintf(f,"ignore SIGWINCH\n");
+            fprintf(f,"ignore SIGWAITING\n");
+          if(arg_debug_commands)
+            fprintf(f,"%s\n", arg_debug_commands);
+            fprintf(f,"END_OF_SCRIPT\n");
+          if (arg_runscript)
+            fprintf(f,"\"%s\" ",arg_runscript);
+            fprintf(f,"$F_XTERM");
+            fprintf(f," -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
+            fprintf(f," -e $F_DBG %s ",arg_debug_no_pause?"-r":"");
+          if(arg_debug) {
+               fprintf(f,"-c \'runargs ");
+               fprint_arg(f,argv);
+               fprintf(f,"\' ");
+          }
+          fprintf(f, "-s/tmp/charmrun_dbx.$$ %s",arg_nodeprog_r);
+          if(arg_debug_no_pause) 
+               fprint_arg(f,argv);
+            fprintf(f,"\n");
+        } else { 
+         fprintf(stderr, "Unknown debugger: %s.\n Exiting.\n", 
+           nodetab_debugger(nodeno));
+        }
+   } else if (arg_in_xterm) {
+     if(arg_verbose)
+       fprintf(stderr, "Charmrun> node %d: xterm is %s\n", 
+               nodeno, nodetab_xterm(nodeno));
+     fprintf(f,"cat > /tmp/charmrun_inx.$$ << END_OF_SCRIPT\n");
+     fprintf(f,"#!/bin/sh\n");
+     fprintf(f,"/bin/rm -f /tmp/charmrun_inx.$$\n");
+     fprintf(f,"%s", arg_nodeprog_r);
+     fprint_arg(f,argv);
+     fprintf(f,"\n");
+     fprintf(f,"echo 'program exited with code '\\$?\n");
+     fprintf(f,"read eoln\n");
+     fprintf(f,"END_OF_SCRIPT\n");
+     fprintf(f,"chmod 700 /tmp/charmrun_inx.$$\n");
+     if (arg_runscript)
+        fprintf(f,"\"%s\" ",arg_runscript);
+     fprintf(f,"$F_XTERM -title 'Node %d (%s)' ",nodeno,nodetab_name(nodeno));
+     fprintf(f," -sl 5000");
+     fprintf(f," -e /tmp/charmrun_inx.$$\n");
+   } else {
+     if (arg_runscript)
+        fprintf(f,"\"%s\" ",arg_runscript);
+     if (arg_no_va_rand) {
+       if(arg_verbose) fprintf(stderr, "Charmrun> setarch -R is used.\n");
+       fprintf(f,"setarch `uname -m` -R ");
+     }
+     fprintf(f,"\"%s\" ",arg_nodeprog_r);
+     fprint_arg(f,argv);
+     if (nodetab_nice(nodeno) != -100) {
+       if(arg_verbose) fprintf(stderr, "Charmrun> nice -n %d\n", nodetab_nice(nodeno));
+       fprintf(f," +nice %d ",nodetab_nice(nodeno));
+     }
+     fprintf(f,"\nres=$?\n");
+     /* If shared libraries fail to load, the program dies without
+        calling charmrun back.  Since we *have* to close down stdin/out/err,
+        we have to smuggle this failure information out via a file,
+        /tmp/charmrun_err.<pid> */
+     fprintf(f,
+       "if [ $res -eq 127 ]\n"
+       "then\n"
+       "  ( \n" /* Re-run, spitting out errors from a subshell: */
+       "    \"%s\" \n"
+       "    ldd \"%s\"\n"
+       "  ) > /tmp/charmrun_err.$$ 2>&1 \n"
+       "fi\n",arg_nodeprog_r,arg_nodeprog_r);
+   }
+   
+   /* End the node-program subshell. To minimize the number 
+      of open ports on the front-end, we must close down rsh;
+      to do this, we have to close stdin, stdout, stderr, and 
+      run the subshell in the background. */
+   fprintf(f,")");
+   fprintf(f," < /dev/null 1> /dev/null 2> /dev/null");
+   if (!arg_mpiexec)
+       fprintf(f, " &");
+   fprintf(f, "\n");
+   
+   if (arg_verbose) fprintf(f,"Echo 'rsh phase successful.'\n");
+   fprintf(f, /* Check for startup errors: */
+      "sleep 1\n"
+      "if [ -r /tmp/charmrun_err.$$ ]\n"
+      "then\n"
+      "  cat /tmp/charmrun_err.$$ \n"
+      "  rm -f /tmp/charmrun_err.$$ \n"
+      "  Exit 1\n"
+      "fi\n");
+   fprintf(f,"Exit 0\n");
+ }
  
  
-               /* use the command "size" to get information about the position of the ".data"
-                  and ".bss" segments inside the program memory */
-               void read_global_segments_size() {
-                 char **rshargv;
-                 int childPid;
-                 /* find the node-program */
-                 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
-                 rshargv = (char **)malloc(sizeof(char *)*6);
-                 rshargv[0]=nodetab_shell(0);
-                 rshargv[1]=nodetab_name(0);
-                 rshargv[2]="-l";
-                 rshargv[3]=nodetab_login(0);
-                 rshargv[4] = (char *)malloc(sizeof(char)*9+strlen(arg_nodeprog_r));
-                 sprintf(rshargv[4],"size -A %s",arg_nodeprog_r);
-                 rshargv[5]=0;
-                 childPid = fork();
-                 if (childPid < 0) {
-                       perror("ERROR> getting the size of the global variables segments"); exit(1);
-                 } else if (childPid == 0) {
-                       /* child process */
-                       dup2(2, 1);
-                       /*printf("executing: \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",rshargv[0],rshargv[1],rshargv[2],rshargv[3],rshargv[4]);*/
-                       execvp(rshargv[0], rshargv);
-                       fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
-                       exit(1);
-                 } else {
-                       /* else we are in the parent */
-                       free(rshargv[4]);
-                       free(rshargv);
-                       waitpid(childPid, NULL, 0);
-                 }
-               }
+ /* use the command "size" to get information about the position of the ".data"
+    and ".bss" segments inside the program memory.
+    Forks a child that execs "<shell> <host> -l <login> size -A <node-program>"
+    with the shell, host and login of node 0. In the child, stdout is made a
+    duplicate of stderr, so the output of "size" is not captured here; the
+    parent only waits for the child to finish and ignores its exit status.
+    Also assigns arg_nodeprog_r, which is declared outside this function. */
+ void read_global_segments_size() {
+   char **rshargv;
+   int childPid;
  
-                 /* open a rsh connection with processor 0 and open a gdb session for info */
-                 void open_gdb_info() {
-                   char **rshargv;
-                   int fdin[2];
-                   int fdout[2];
-                   int fderr[2];
-                   int i;
+   /* find the node-program */
+   arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
+   rshargv = (char **)malloc(sizeof(char *)*6); /* five arguments plus the NULL terminator */
+   rshargv[0]=nodetab_shell(0);
+   rshargv[1]=nodetab_name(0);
+   rshargv[2]="-l";
+   rshargv[3]=nodetab_login(0);
+   rshargv[4] = (char *)malloc(sizeof(char)*9+strlen(arg_nodeprog_r)); /* 9 = length of "size -A " plus 1 for the terminating NUL */
+   sprintf(rshargv[4],"size -A %s",arg_nodeprog_r);
+   rshargv[5]=0;
+   childPid = fork();
+   if (childPid < 0) {
+     perror("ERROR> getting the size of the global variables segments"); exit(1);
+   } else if (childPid == 0) {
+     /* child process */
+     dup2(2, 1); /* fd 1 (stdout) becomes a duplicate of fd 2 (stderr) */
+     /*printf("executing: \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",rshargv[0],rshargv[1],rshargv[2],rshargv[3],rshargv[4]);*/
+     execvp(rshargv[0], rshargv);
+     fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]); /* reached only when execvp returned, i.e. failed */
+     exit(1);
+   } else {
+     /* else we are in the parent */
+     free(rshargv[4]);
+     free(rshargv);
+     waitpid(childPid, NULL, 0); /* block until the child exits; its status is discarded */
+   }
+ }
  
-                 /* find the node-program */
-                 arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
+ /* open a rsh connection with processor 0 and open a gdb session for info.
+    Forks a child that execs "<shell> <host> -l <login> gdb -q <node-program>"
+    with the shell, host and login of node 0, after connecting its stdin,
+    stdout and stderr to three pipes. The parent stores the child pid in
+    gdb_info_pid and keeps its own pipe ends in gdb_info_std:
+      [0] write end of the pipe feeding the child stdin,
+      [1] read end of the pipe fed by the child stdout,
+      [2] read end of the pipe fed by the child stderr.
+    Also assigns arg_nodeprog_r, which is declared outside this function. */
+ void open_gdb_info() {
+   char **rshargv;
+   int fdin[2];
+   int fdout[2];
+   int fderr[2];
+   int i;
+   /* find the node-program */
+   arg_nodeprog_r = pathextfix(arg_nodeprog_a, nodetab_pathfixes(0), nodetab_ext(0));
  
-                 rshargv = (char **)malloc(sizeof(char *)*6);
-                 rshargv[0]=nodetab_shell(0);
-                 rshargv[1]=nodetab_name(0);
-                 rshargv[2]="-l";
-                 rshargv[3]=nodetab_login(0);
-                 rshargv[4] = (char *)malloc(sizeof(char)*8+strlen(arg_nodeprog_r));
-                 sprintf(rshargv[4],"gdb -q %s",arg_nodeprog_r);
-                 rshargv[5]=0;
+   rshargv = (char **)malloc(sizeof(char *)*6); /* five arguments plus the NULL terminator */
+   rshargv[0]=nodetab_shell(0);
+   rshargv[1]=nodetab_name(0);
+   rshargv[2]="-l";
+   rshargv[3]=nodetab_login(0);
+   rshargv[4] = (char *)malloc(sizeof(char)*8+strlen(arg_nodeprog_r)); /* 8 = length of "gdb -q " plus 1 for the terminating NUL */
+   sprintf(rshargv[4],"gdb -q %s",arg_nodeprog_r);
+   rshargv[5]=0;
  
 -  pipe(fdin);
 -  pipe(fdout);
 -  pipe(fderr);
 -
 -  gdb_info_pid = fork();
 -  if (gdb_info_pid < 0) {
 -    perror("ERROR> starting info gdb"); exit(1);
 -  } else if (gdb_info_pid == 0) {
 -    /* child process */
 -    close(fdin[1]);
 -    close(fdout[0]);
 -    close(fderr[0]);
 -    printf("executing: \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",rshargv[0],rshargv[1],rshargv[2],rshargv[3],rshargv[4]);
 -    dup2(fdin[0],0);
 -    dup2(fdout[1],1);
 -    dup2(fderr[1],2);
 -    for(i=3; i<1024; i++) close(i);
 -    execvp(rshargv[0], rshargv);
 -    fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]);
 -    exit(1);
 -  }
 -  /* else we are in the parent */
 -  free(rshargv[4]);
 -  free(rshargv);
 -  gdb_info_std[0] = fdin[1];
 -  gdb_info_std[1] = fdout[0];
 -  gdb_info_std[2] = fderr[0];
 -  close(fdin[0]);
 -  close(fdout[1]);
 -  close(fderr[1]);
 -}
 +                pipe(fdin); /* NOTE(review): the return values of the three pipe() calls are not checked */
 +                pipe(fdout);
 +                pipe(fderr);
 +
 +                gdb_info_pid = fork();
 +                if (gdb_info_pid < 0) {
 +                      perror("ERROR> starting info gdb"); exit(1);
 +                } else if (gdb_info_pid == 0) {
 +                      /* child process */
 +                      close(fdin[1]); /* the child drops the pipe ends that belong to the parent */
 +                      close(fdout[0]);
 +                      close(fderr[0]);
 +                      printf("executing: \"%s\" \"%s\" \"%s\" \"%s\" \"%s\"\n",rshargv[0],rshargv[1],rshargv[2],rshargv[3],rshargv[4]); /* NOTE(review): issued before fd 1 is redirected, but stdout is not flushed before execvp -- confirm where, and whether, this text appears */
 +                      dup2(fdin[0],0); /* pipe ends become the child stdin, stdout and stderr */
 +                      dup2(fdout[1],1);
 +                      dup2(fderr[1],2);
 +                      for(i=3; i<1024; i++) close(i); /* close descriptors 3 through 1023 before the exec */
 +                      execvp(rshargv[0], rshargv);
 +                      fprintf(stderr,"Charmrun> Couldn't find rsh program '%s'!\n",rshargv[0]); /* reached only when execvp returned, i.e. failed */
 +                      exit(1);
 +                }
 +                /* else we are in the parent */
 +                free(rshargv[4]);
 +                free(rshargv);
 +                gdb_info_std[0] = fdin[1]; /* write end: data written here reaches the child stdin */
 +                gdb_info_std[1] = fdout[0]; /* read end for the child stdout */
 +                gdb_info_std[2] = fderr[0]; /* read end for the child stderr */
 +                close(fdin[0]); /* the parent drops the pipe ends that belong to the child */
 +                close(fdout[1]);
 +                close(fderr[1]);
 +              }
 +#ifdef HSTART
 +              void start_next_level_charmruns() /* Compiled only when HSTART is defined.
 +                 Replaces arg_nodeprog_a by "<directory part of its old value><DIRSEP>charmrun"
 +                 (presumably so that the generated start scripts launch a child charmrun
 +                 instead of the node program -- TODO confirm in rsh_script), then runs
 +                 branchfactor iterations. Iteration k uses entry k*nodes_per_child of
 +                 nodetab_unique_table: it writes a start script with rsh_script(), launches
 +                 it with rsh_fork() and stores the returned value in rsh_pids[k]. */
 +              {
 +                      
 +                 static char buf[1024];
 +                 char * nodeprog_name = strrchr(arg_nodeprog_a, '/'); /* NOTE(review): strrchr returns NULL when the path has no slash, and the next line would then dereference NULL */
 +                 nodeprog_name[0] = 0; /* cut the string at the last slash; this modifies arg_nodeprog_a in place */
 +                 sprintf(buf,"%s%s%s",arg_nodeprog_a,DIRSEP,"charmrun"); /* NOTE(review): unbounded sprintf into the 1024-byte buf */
 +                 arg_nodeprog_a = strdup(buf);
 +              
 +                 int client;
 +                       int nextIndex =0;
 +                       client=0;
 +                       while(nextIndex<branchfactor){
 +                       /* need to index into unique_table*/
 +                       int rank0no = nodetab_unique_table[client];
 +                       int pe=nodetab_rank0_table[rank0no];
 +                       FILE *f;
 +                       char startScript[200];
 +                       sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe); /* script name is built from the pid of this process and pe */
 +                       f=fopen(startScript,"w");
 +                       if (f==NULL) {
 +                         /* now try current directory */
 +                         sprintf(startScript,"charmrun.%d.%d",getpid(),pe);
 +                         f=fopen(startScript,"w");
 +                         if (f==NULL) {
 +                               fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
 +                               exit(1);
 +                         }
 +                       }
 +                       rsh_script(f,pe,client,arg_argv,0); /* NOTE(review): client, not rank0no, is passed as the rank0no argument -- confirm this is intended */
 +                       fclose(f);
 +                      if (!rsh_pids)
 +                         rsh_pids=(int *)malloc(sizeof(int)*branchfactor); /* allocated on first use; NOTE(review): malloc result is not checked */
 +                       rsh_pids[nextIndex++] = rsh_fork(pe,startScript);
 +                               client += nodes_per_child; /* advance to the first entry of the next group */
  
 -/* returns pid */
 -void start_one_node_rsh(int rank0no)
 -{
 -     int pe=nodetab_rank0_table[rank0no];
 -     FILE *f;
 -     char startScript[200];
 -     sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe);
 -     f=fopen(startScript,"w");
 -     if (f==NULL) {
 -       /* now try current directory */
 -       sprintf(startScript,"charmrun.%d.%d",getpid(),pe);
 -       f=fopen(startScript,"w");
 -       if (f==NULL) {
 -       fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
 -       exit(1);
 -       }
 -     }
 -     rsh_script(f,pe,rank0no,arg_argv,0);
 -     fclose(f);
 -     if (!rsh_pids)
 -       rsh_pids=(int *)malloc(sizeof(int)*nodetab_rank0_size);
 -     rsh_pids[rank0no] = rsh_fork(pe,startScript);
 -}
 +                      }
 +              }
 +#endif
 +                                                                              
 +              /* Write the start script for the node with rank-0 index rank0no and
 +                 launch it with rsh_fork(). The function itself returns nothing: the
 +                 value returned by rsh_fork() is stored in rsh_pids[rank0no].
 +                 Exits the process if the script file cannot be created. */
 +              void start_one_node_rsh(int rank0no)
 +              {
 +                       int pe=nodetab_rank0_table[rank0no]; /* look up the pe recorded for this rank0 index */
 +                       FILE *f;
 +                       char startScript[200];
 +                       sprintf(startScript,"/tmp/charmrun.%d.%d",getpid(),pe); /* script name is built from the pid of this process and pe */
 +                       f=fopen(startScript,"w");
 +                       if (f==NULL) {
 +                         /* now try current directory */
 +                         sprintf(startScript,"charmrun.%d.%d",getpid(),pe);
 +                         f=fopen(startScript,"w");
 +                         if (f==NULL) {
 +                               fprintf(stderr,"Charmrun> Can not write file %s!\n", startScript);
 +                               exit(1);
 +                         }
 +                       }
 +                       rsh_script(f,pe,rank0no,arg_argv,0); /* fill the script; the last argument (restart) is 0 */
 +                       fclose(f);
 +                       if (!rsh_pids)
 +                         rsh_pids=(int *)malloc(sizeof(int)*nodetab_rank0_size); /* allocated on first use; NOTE(review): malloc result is not checked */
 +                       rsh_pids[rank0no] = rsh_fork(pe,startScript);
 +              }
  
              int start_set_node_rsh(int client) {
-                       /* a search function could be inserted here instead of sequential lookup for more complex node lists (e.g. interleaving) */
-                       int clientgroup;
+ int start_set_node_rsh(int client) {
+       /* a search function could be inserted here instead of sequential lookup for more complex node lists (e.g. interleaving) */
+       int clientgroup;
  #if CMK_SMP || defined(_WIN32)
        clientgroup=client+1; /* smp already handles this functionality */
 +#else
 +
 +#ifdef HSTART
 +      if(!arg_scalable_start && !arg_hierarchical_start)
 +              clientgroup=client+1; /* only launch 1 core per rsh call */
 +      else {
 +              clientgroup=client;
 +              do {
 +                      clientgroup++; /* add one more client to group if not greater than nodes and shares the same name as client */
 +                      if(clientgroup>=nodetab_rank0_size)
 +                break;
 +            if(arg_scalable_start&&!arg_hierarchical_start)
 +                if(strcmp(nodetab_name(clientgroup),nodetab_name(client)))
 +                        break;
 +            /*Hierarchical-start*/
 +            if(strcmp(nodetab_name(nodetab_rank0_table[clientgroup]),nodetab_name(nodetab_rank0_table[client])))
 +                        break;
 +        }
 +        while(1);
 +      }
 +
  #else
        if(!arg_scalable_start)
                clientgroup=client+1; /* only launch 1 core per rsh call */
Simple merge
Simple merge