CrayXC: Add targets gni-crayxc and mpi-crayxc for the new Cray system based on
author     Nikhil Jain <nikhil@illinois.edu>
Sun, 10 Mar 2013 01:04:39 +0000 (17:04 -0800)
committer  Nikhil Jain <nikhil@illinois.edu>
Thu, 14 Mar 2013 04:54:36 +0000 (21:54 -0700)
the Aries interconnect. All the code is borrowed from CrayXE. In the process, also
remove the GEMINI tag from the CrayXE build and call it GNI instead.
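
The new targets plug into the usual top-level build script; a typical invocation
(the particular build options shown here are illustrative, not mandated by this
commit) would be:

    ./build charm++ gni-crayxc smp --with-production -j8
    ./build charm++ mpi-crayxc --with-production -j8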

51 files changed:
src/arch/gemini_gni-crayxe/conv-mach.sh
src/arch/gemini_gni/conv-common.h
src/arch/gemini_gni/machine.c
src/arch/gni-crayxc/charmrun [new file with mode: 0755]
src/arch/gni-crayxc/conv-mach-papi.h [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-papi.sh [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-pxshm.h [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-pxshm.sh [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-smp.h [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-smp.sh [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-xpmem.h [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach-xpmem.sh [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach.h [new file with mode: 0644]
src/arch/gni-crayxc/conv-mach.sh [new file with mode: 0644]
src/arch/gni/Makefile.machine [new file with mode: 0644]
src/arch/gni/README [new file with mode: 0644]
src/arch/gni/conv-common.h [new file with mode: 0644]
src/arch/gni/conv-common.sh [new file with mode: 0644]
src/arch/gni/conv-mach-hugepages.h [new file with mode: 0644]
src/arch/gni/conv-mach-hugepages.sh [new file with mode: 0644]
src/arch/gni/conv-mach-mlogft.h [new file with mode: 0644]
src/arch/gni/conv-mach-mlogft.sh [new file with mode: 0644]
src/arch/gni/conv-mach-syncft.h [new file with mode: 0644]
src/arch/gni/conv-mach-syncft.sh [new file with mode: 0644]
src/arch/gni/cray_tlbhack.c [new file with mode: 0644]
src/arch/gni/machine-cmidirect.c [new file with mode: 0644]
src/arch/gni/machine-persistent.c [new file with mode: 0644]
src/arch/gni/machine-persistent.h [new file with mode: 0644]
src/arch/gni/machine.c [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach-cuda.h [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach-cuda.sh [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach-smp.h [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach-smp.sh [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach.h [new file with mode: 0644]
src/arch/mpi-crayxc/conv-mach.sh [new file with mode: 0644]
src/arch/mpi-crayxc/special.sh [new file with mode: 0755]
src/arch/util/machine-pxshm.c
src/arch/util/machine-xpmem.c
src/arch/util/mempool.c
src/arch/util/mempool.h
src/ck-core/init.C
src/conv-core/cmidirect.h
src/conv-core/conv-config.h
src/conv-core/convcore.c
src/conv-core/converse.h
src/conv-core/cpuaffinity.c
src/conv-core/cputopology.C
src/conv-core/isomalloc.c
src/conv-core/memory.c
src/scripts/conv-config.sh
src/util/CrayNid.c

index a64fc7faa999d15d4174a29cd70399232a27b18b..11c2f873f464199b4aac2bdfbb4a95f01eca2d50 100644 (file)
@@ -1,4 +1,4 @@
-GEMINI_CRAYXE=1
+GNI_CRAYXE=1
 PMI_CFLAGS=`pkg-config --cflags cray-pmi`
 PMI_LIBS=`pkg-config --libs cray-pmi`
 UGNI_CFLAGS=`pkg-config --cflags cray-ugni`
index 3db0e04ee9ba53563206c1a89700886cee9d0844..43bf73785cc7a2bfa610a845c8025cb0bb096a76 100644 (file)
@@ -3,7 +3,7 @@
 
 #define CMK_HAS_PARTITION                                  1
 
-#define CMK_CONVERSE_GEMINI_UGNI                           1
+#define CMK_CONVERSE_UGNI                                  1
 
 #define CMK_CMIDELIVERS_USE_COMMON_CODE                    1
 
index daf5c87ced6d20f9ea33173bc7f1fa2ad32abe5a..652bb89285902def4169428829f0bea2e0169ff8 100644 (file)
@@ -1,6 +1,6 @@
 
 /** @file
- * Gemini GNI machine layer
+ * GNI machine layer
  *
  * Author:   Yanhua Sun
              Gengbin Zheng
diff --git a/src/arch/gni-crayxc/charmrun b/src/arch/gni-crayxc/charmrun
new file mode 100755 (executable)
index 0000000..5ee96e9
--- /dev/null
@@ -0,0 +1,303 @@
+#!/bin/sh
+#
+# Conv-host for MPI:
+#  Translates +pN-style conv-host options into
+#  mpirun -np N options.
+
+args=""
+pes=1
+ppn=1
+machinefile=""
+
+while [ $# -gt 0 ]
+do
+       case $1 in
+       +ppn|++ppn)
+               args=$args" +ppn "$2
+               ppn=$2
+               shift
+               ;;
+       +ppn[0-9]*)
+               args=$args" "$1
+               ppn=`echo $1 | awk '{print substr($1,5)}'`
+               ;;
+       ++ppn[0-9]*)
+               args=$args" "$1
+               ppn=`echo $1 | awk '{print substr($1,6)}'`
+               ;;
+       +p)
+               pes=$2
+               shift
+               ;;
+       +pemap)
+               args=$args" "$1" "$2
+               shift
+               ;;
+       +p[0-9]*)
+               pes=`echo $1 | awk '{print substr($1,3)}'`
+               ;;
+        -machinefile)
+               machinefile=$2
+               args=" "$1" "$2" "$args
+               shift
+               ;;
+       *) 
+               args=$args" "$1
+               ;;
+       esac
+       shift
+done
+
+rem=`expr $pes % $ppn`
+quot=`expr $pes / $ppn`
+if [ $rem -ne 0 ];
+then
+  printf "p = $pes should be a multiple of ppn = $ppn\n"
+  exit 1
+else
+  pes=$quot
+fi 
+
+printf "\nRunning on $pes processors: $args\n"
+
+
+if [ -n "$PBS_NODEFILE" ]
+then
+# we are in a job shell
+  aprun=`which aprun 2>/dev/null`
+  if test -n "$aprun"
+  then
+    echo  aprun -n $pes -d `expr $ppn + 1` $args
+    $aprun -n $pes -d `expr $ppn + 1` $args
+  else
+    mpirun_cmd=`which mpirun 2>/dev/null`
+    if test -n "$mpirun_cmd"
+    then
+      if echo $mpirun_cmd | grep 'mvapich2'  > /dev/null 2>/dev/null
+      then
+        # if daemon not started, start it
+        if ! mpdtrace > /dev/null 2>/dev/null
+        then
+          mvapich2-start-mpd
+        fi
+        mpirun -np $pes $args
+        #    mpdallexit
+      else   # normal case
+        test -z "$machinefile" && args=-machinefile" "$PBS_NODEFILE" "$args
+        echo mpirun -np $pes $args
+        mpirun -np $pes $args
+      fi
+    else
+      echo "Charmrun> cannot locate mpirun in order to run the program."
+      exit 1
+    fi
+  fi
+elif [ -n "$LSB_HOSTS" ]
+then
+# Tungsten
+  echo cmpirun -lsf -poll -no_smp -gm_long 200000 $args 
+  cmpirun -lsf -poll -no_smp -gm_long 200000 $args 
+elif [ -n "$PBS_QUEUE" -o -n "$LSF_QUEUE" ]
+then
+# Interactive mode: create, and submit a batch job
+        script="charmrun_script.$$.sh"
+        indir=`pwd`
+        output="$indir/charmrun_script.$$.stdout"
+        result="$indir/charmrun_script.$$.result"
+       rm -f $result
+# Some machine-specific settings
+       USE_LSF=0
+# 10 minutes   
+       walllimit=10
+       queue_stat=qstat
+       queue_qsub=qsub
+       queue_kill=qdel
+       hostname=`hostname`
+       case "$hostname" in
+        hopper*)
+               ncpus="#PBS -l mppwidth=$ncores"
+                ;;
+       tg-login*|honest*.ncsa.uiuc.edu)
+               # always ppn=2
+               nodes=`expr \( $pes + 1 \) / 2`
+               test $pes -eq 1 && ppns=1 || ppns=2
+               ppn='#PBS -l nodes='$nodes':ppn='$ppns
+               extra='-machinefile $PBS_NODEFILE'
+               ;;
+       co-login*.ncsa.uiuc.edu)
+               mem='#PBS -l mem=500mb'
+               ncpus="#PBS -l ncpus=$pes"
+               ;;
+       tun*)
+               USE_LSF=1
+               queue_stat=bjobs
+               queue_qsub=bsub
+               queue_kill=bkill
+               ;;
+       abe*)
+               # always ppn=2
+               nodes=`expr \( $pes + 1 \) / 2`
+               test $pes -eq 1 && ppns=1 || ppns=2
+               ppn='#PBS -l nodes='$nodes':ppn='$ppns
+               extra='-machinefile $PBS_NODEFILE'
+               ;;
+        kraken*)
+                ncores=`expr \( $pes + 11 \) / 12 \* 12`
+               ncpus="#PBS -l size=$ncores"
+               ppn=''
+               ;;
+       *)
+               ncpus="#PBS -l ncpus=$pes"
+               ;;
+       esac
+       if test $USE_LSF -eq 0
+       then
+          mpirun=`which aprun 2>/dev/null`
+          npcmd="-n "
+          if test -z "$mpirun"
+          then
+           mpirun=`which mpirun 2>/dev/null`
+            npcmd="-np "
+          fi
+          cat > $script << EOF
+#!/bin/sh
+# This is a charmrun-generated PBS batch job script.
+# The lines starting with #PBS are queuing system flags:
+#
+$ppn
+#
+$ncpus
+#
+#PBS -l walltime=$walllimit:00
+#
+$mem
+#
+#PBS -q $PBS_QUEUE
+#
+#PBS -N autobuild
+#
+#PBS -j oe
+#
+#PBS -o $output
+
+cd $indir
+
+cat \$PBS_NODEFILE
+echo $mpirun $npcmd $pes $extra $args
+$mpirun $npcmd $pes $extra $args
+
+# Save mpirun exit status
+status=\$?
+echo \$status > $result
+EOF
+       else
+#  use LSF
+         mpirun="cmpirun -lsf -poll -no_smp -gm_long 200000"
+          cat > $script << EOF
+#!/bin/sh
+# This is a charmrun-generated PBS batch job script.
+# The lines starting with #PBS are queuing system flags:
+#
+#BSUB -J autobuild
+#BSUB -W 0:$walllimit
+#BSUB -n $pes
+#BSUB -o $output
+
+cd $indir
+echo \$LSB_MCPU_HOSTS
+$mpirun $args
+# Save mpirun exit status
+status=\$?
+echo \$status > $result
+EOF
+       fi
+
+End() {
+       echo "Charmrun> $queue_kill $jobid ..."
+       $queue_kill $jobid
+       rm -f $script
+       exit $1
+}
+
+        echo "Submitting batch job for> $mpirun -np $pes $args"
+        echo " using the command> $queue_qsub $script"
+        chmod 755 $script
+       while [ -z "$jobid" ]
+       do
+         [ $USE_LSF = 0 ] && jobid=`$queue_qsub $script|tail -1`
+         [ $USE_LSF = 1 ] && jobid=`$queue_qsub < $script|tail -1|sed -e 's/[^0-9]*//g'`
+       done
+       echo "Job enqueued under job ID $jobid"
+# kill job if interrupted
+       trap 'End 1' 2 3
+       retry=0
+# Wait for the job to complete, by checking its status
+        while [ true ]
+        do
+                $queue_stat $jobid > tmp.$$
+               exitstatus=$?
+                if test -f $output
+                then
+# The job is done-- print its output
+                        rm tmp.$$
+# When job hangs, result file does not exist
+                       test -f $result && status=`cat $result` || status=1
+                       test $status -eq 0 && status=`grep 'End of program' $output > /dev/null 2>&1`
+                       cat $output
+                       rm -f $result
+                       test -f $status && rm -f $script $output
+                       exit $status
+                fi
+# The job is still queued or running-- print status and wait
+                tail -1 tmp.$$
+                rm tmp.$$
+# Job ID may not exist now
+               if test $exitstatus -ne 0
+               then
+# retry a few times when error occurs
+                       retry=`expr $retry + 1`
+                       if test $retry -gt 6
+                       then
+                               echo "Charmrun> too many errors, abort!"
+                               exit 1
+                       else
+                               sleep 15
+                       fi
+               else
+# job still in queue
+                       retry=0
+                       sleep 20
+               fi
+        done
+else
+  mpirun_cmd=`which mpirun 2>/dev/null`
+  if test -n "$mpirun_cmd"
+  then
+    [ -n "$MPI_MACHINEFILE" ] && args=" -machinefile $MPI_MACHINEFILE $args"
+    setarch_cmd=`which setarch 2>/dev/null`
+    if [ -n "$setarch_cmd" -a -x "$setarch_cmd" ]
+    then
+      # Disables randomization of the virtual address space
+      # (turns on ADDR_NO_RANDOMIZE).
+      cur_arch=`uname -m`
+      echo "charmrun>  $setarch_cmd $cur_arch -R  mpirun -np $pes $args"
+      $setarch_cmd $cur_arch -R  mpirun -np $pes $args
+    else
+      echo "charmrun> mpirun -np $pes $args"
+      mpirun -np $pes $args
+    fi
+  else
+    mpiexec_cmd=`which mpiexec 2>/dev/null`
+    if test -n "$mpiexec_cmd"
+    then
+      echo "charmrun> $mpiexec_cmd -n $pes $args"
+      echo
+      "$mpiexec_cmd" -n $pes $args
+    else
+      echo "Don't know how to run MPI program."
+      exit 1
+    fi
+  fi
+fi
+
+
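A usage sketch for the charmrun wrapper above (the binary name ./pgm and the
processor counts are illustrative): launched inside a PBS allocation, +p and
++ppn are folded into an aprun invocation, with one extra depth slot per process
presumably reserved for the communication thread:

    ./charmrun ./pgm +p48 ++ppn 12
    # inside a job shell this expands to roughly:
    #   aprun -n 4 -d 13 ./pgm +ppn 12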
diff --git a/src/arch/gni-crayxc/conv-mach-papi.h b/src/arch/gni-crayxc/conv-mach-papi.h
new file mode 100644 (file)
index 0000000..68f51a1
--- /dev/null
@@ -0,0 +1,6 @@
+// Chee Wai 3/11/2004
+// This is really stupid, instead of being able to say "#include <papi.h>"
+// here, I am forced to not say anything and wait for the configure script
+// to decide if the library exists before adding the line to conv-mach-opt.h
+//
+// If this is not the intended idiom, then please enlighten me.
diff --git a/src/arch/gni-crayxc/conv-mach-papi.sh b/src/arch/gni-crayxc/conv-mach-papi.sh
new file mode 100644 (file)
index 0000000..518f895
--- /dev/null
@@ -0,0 +1,10 @@
+CMK_USE_PAPI=true
+USE_SPP_PAPI=true
+#you should run module load papi
+PAPI_LIBDIR="/opt/cray/papi/4.3.0.1/perf_events/no-cuda/lib"
+PAPI_INCDIR="/opt/cray/papi/4.3.0.1/perf_events/no-cuda/include"
+CMK_INCDIR="$CMK_INCDIR -I$PAPI_INCDIR"
+CMK_LIBDIR="-L $PAPI_LIBDIR"
+CMK_LD="$CMK_LD -Wl,-rpath,$PAPI_LIBDIR"
+CMK_LDXX="$CMK_LDXX -Wl,-rpath,$PAPI_LIBDIR" 
+CMK_LIBS="$CMK_LIBS -lpapi"
diff --git a/src/arch/gni-crayxc/conv-mach-pxshm.h b/src/arch/gni-crayxc/conv-mach-pxshm.h
new file mode 100644 (file)
index 0000000..44d47ab
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _CONV_MACH_PXSHM_
+#define _CONV_MACH_PXSHM_
+
+#undef CMK_USE_PXSHM
+#define CMK_USE_PXSHM                  1
+
+#undef CMK_IMMEDIATE_MSG
+#define CMK_IMMEDIATE_MSG       1
+
+#undef CMK_BROADCAST_HYPERCUBE
+#define CMK_BROADCAST_HYPERCUBE                                   1
+
+#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP  0
+
+#define PXSHM_LOCK                      1
+
+#endif
diff --git a/src/arch/gni-crayxc/conv-mach-pxshm.sh b/src/arch/gni-crayxc/conv-mach-pxshm.sh
new file mode 100644 (file)
index 0000000..c3b7193
--- /dev/null
@@ -0,0 +1,5 @@
+
+if test -z "$PGCC"
+then
+CMK_LIBS="$CMK_LIBS -lrt"
+fi
diff --git a/src/arch/gni-crayxc/conv-mach-smp.h b/src/arch/gni-crayxc/conv-mach-smp.h
new file mode 100644 (file)
index 0000000..0e85cc1
--- /dev/null
@@ -0,0 +1,17 @@
+#define CMK_SMP                                            1
+
+#undef CMK_NODE_QUEUE_AVAILABLE
+#define CMK_NODE_QUEUE_AVAILABLE                           1
+
+#undef CMK_SHARED_VARS_UNAVAILABLE
+#undef CMK_SHARED_VARS_POSIX_THREADS_SMP
+#define CMK_SHARED_VARS_UNAVAILABLE                        0
+#define CMK_SHARED_VARS_POSIX_THREADS_SMP                  1
+
+#undef CMK_THREADS_USE_CONTEXT
+#define CMK_THREADS_USE_CONTEXT                            1
+
+
+#if ! CMK_GCC_X86_ASM
+#define CMK_PCQUEUE_LOCK                                   1
+#endif
diff --git a/src/arch/gni-crayxc/conv-mach-smp.sh b/src/arch/gni-crayxc/conv-mach-smp.sh
new file mode 100644 (file)
index 0000000..3b922a2
--- /dev/null
@@ -0,0 +1,11 @@
+CMK_SMP=1
+CMK_DEFS=' -D_REENTRANT '
+CMK_CPP_C="$CMK_CPP_C $CMK_DEFS"
+CMK_CC="$CMK_CC $CMK_DEFS"
+CMK_CC_RELIABLE="$CMK_CC_RELIABLE $CMK_DEFS "
+CMK_CC_FASTEST="$CMK_CC_FASTEST $CMK_DEFS "
+CMK_CXX="$CMK_CXX  $CMK_DEFS "
+CMK_CXXPP="$CMK_CXXPP $CMK_DEFS "
+CMK_LD="$CMK_LD $CMK_DEFS "
+CMK_LDXX="$CMK_LDXX $CMK_DEFS "
+CMK_LIBS="$CMK_LIBS -lpthread -lrt"
diff --git a/src/arch/gni-crayxc/conv-mach-xpmem.h b/src/arch/gni-crayxc/conv-mach-xpmem.h
new file mode 100644 (file)
index 0000000..39cc4d8
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef _CONV_MACH_XPMEM_
+#define _CONV_MACH_XPMEM_
+
+#undef CMK_USE_PXSHM
+#undef CMK_USE_XPMEM
+#define CMK_USE_XPMEM                  1
+
+#undef CMK_IMMEDIATE_MSG
+#define CMK_IMMEDIATE_MSG       0
+
+#undef CMK_BROADCAST_HYPERCUBE
+#define CMK_BROADCAST_HYPERCUBE                                   1
+
+#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP  0
+
+#define XPMEM_LOCK                      1
+
+#endif
diff --git a/src/arch/gni-crayxc/conv-mach-xpmem.sh b/src/arch/gni-crayxc/conv-mach-xpmem.sh
new file mode 100644 (file)
index 0000000..a269d3c
--- /dev/null
@@ -0,0 +1 @@
+#CMK_LIBS="$CMK_LIBS -lrt"
diff --git a/src/arch/gni-crayxc/conv-mach.h b/src/arch/gni-crayxc/conv-mach.h
new file mode 100644 (file)
index 0000000..f918ee3
--- /dev/null
@@ -0,0 +1,106 @@
+#ifndef _CONV_MACH_H
+#define _CONV_MACH_H
+
+#define CMK_CRAYXC                                         1
+
+// for Cray XC we use the known conflict-free counter set from the SPP project
+#define USE_SPP_PAPI                                       1
+
+/* 1 if the machine has a function called "getpagesize()", 0 otherwise.
+   Used in the memory files of converse. */
+#define CMK_GETPAGESIZE_AVAILABLE                          1
+#define CMK_MEMORY_PAGESIZE                                8192
+#define CMK_MEMORY_PROTECTABLE                             0
+
+/* defines which version of memory handlers should be used.
+   used in conv-core/machine.c */
+#define CMK_MALLOC_USE_GNU_MALLOC                          0
+#define CMK_MALLOC_USE_OS_BUILTIN                          1
+
+#define CMI_IO_BUFFER_EXPLICIT                             0
+#define CMI_IO_FLUSH_USER                                  0
+
+/* specifies if there is a node queue. It is used in convcore.c and it is
+   typically set to 1 in SMP versions */
+#define CMK_NODE_QUEUE_AVAILABLE                           0
+
+/* the following definitions set the type of shared variables to be used. Exactly
+   one of them must be 1, all the others 0. The different implementations are in
+   converse.h. Typically used are UNAVAILABLE for non-SMP versions and
+   POSIX_THREADS_SMP for SMP versions. The others are used only in special
+   cases: UNIPROCESSOR in sim and uth, PTHREADS in origin,
+   and NT_THREADS in windows. */
+#define CMK_SHARED_VARS_UNAVAILABLE                        1 /* non SMP versions */
+#define CMK_SHARED_VARS_POSIX_THREADS_SMP                  0 /* SMP versions */
+#define CMK_SHARED_VARS_UNIPROCESSOR                       0
+#define CMK_SHARED_VARS_NT_THREADS                         0
+
+/* the following define whether signal handlers should be used; all equal to zero
+   means that signals will not be used. At most one of the following can be 1, the
+   others must be 0. The SIGACTION variants differ in that the second (_WITH_RESTART)
+   enables retry on interrupt (a function is recalled upon interrupt and does
+   not return EINTR as in the first case). */
+#define CMK_SIGNAL_NOT_NEEDED                              1
+#define CMK_SIGNAL_USE_SIGACTION                           0
+#define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              0
+
+/* decides which is the default implementation of the threads (see threads.c).
+   Only one of the following can be 1. If none of them is selected, qthreads
+   will be used as the default. This default can be overridden at compile time
+   using -DCMK_THREADS_BUILD_"type"=1 */
+#define CMK_THREADS_USE_CONTEXT                            0
+#define CMK_THREADS_USE_JCONTEXT                           0
+#define CMK_THREADS_USE_PTHREADS                           0
+
+#define CMK_USE_SPINLOCK                                   1
+
+/* Specifies what kind of timer to use, and the corresponding headers will be
+   included in convcore.c. If none is selected, then the machine.c file needs to
+   implement the timer primitives. */
+#define CMK_TIMER_USE_RTC                                  0
+#define CMK_TIMER_USE_RDTSC                                0
+#define CMK_TIMER_USE_GETRUSAGE                            0
+#define CMK_TIMER_USE_SPECIAL                              1
+#define CMK_TIMER_USE_TIMES                                0
+#define CMK_TIMER_USE_BLUEGENEL                            0
+
+
+/* In order to have a type with a fixed length across machines, these define the
+   different size integers, unsigned integers, and floats as the machine
+   specific types corresponding to the given sizes (2, 4, 8 bytes) */
+#define CMK_TYPEDEF_INT2 short
+#define CMK_TYPEDEF_INT4 int
+#define CMK_TYPEDEF_INT8 long
+#define CMK_TYPEDEF_UINT2 unsigned short
+#define CMK_TYPEDEF_UINT4 unsigned int
+#define CMK_TYPEDEF_UINT8 unsigned long long
+#define CMK_TYPEDEF_FLOAT4 float
+#define CMK_TYPEDEF_FLOAT8 double
+
+/* Specifies what the processor will do when it is idle: either sleep (1) or go
+   into busy-waiting mode (0). In convcore.c a few extra files are included in
+   sleeping mode, but the real distinct implementation is in the machine.c
+   file. */
+#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
+
+/* specifies whether there is a web server collecting utilization statistics (1)
+   or not (0) */
+#define CMK_WEB_MODE                                       1
+
+#define CMK_DEBUG_MODE                                     0
+
+/* enables the load balancer framework. set to 1 for almost all the machines */
+#define CMK_LBDB_ON                                       1
+
+#define CMK_64BIT                                         1
+#define CMK_AMD64                                         1
+
+/* Other possible definitions:
+
+In fault-tolerant architectures, CK_MEM_CHECKPOINT can be set. In this case the
+extended header must also contain another field called "pn" (phase number).
+
+*/
+
+#endif
diff --git a/src/arch/gni-crayxc/conv-mach.sh b/src/arch/gni-crayxc/conv-mach.sh
new file mode 100644 (file)
index 0000000..4e4c876
--- /dev/null
@@ -0,0 +1,57 @@
+GNI_CRAYXC=1
+PMI_CFLAGS=`pkg-config --cflags cray-pmi`
+PMI_LIBS=`pkg-config --libs cray-pmi`
+UGNI_CFLAGS=`pkg-config --cflags cray-ugni`
+UGNI_LIBS=`pkg-config --libs cray-ugni`
+
+PGCC=`CC -V 2>&1 | grep pgCC`
+ICPC=`CC -V 2>&1 | grep Intel`
+
+CMK_CPP_CHARM='/lib/cpp -P'
+CMK_CPP_C="cc -E"
+CMK_CC="cc $PMI_CFLAGS $UGNI_CFLAGS "
+CMK_CXX="CC $PMI_CFLAGS $UGNI_CFLAGS"
+CMK_CXXPP="$CMK_CXX -x c++ -E  "
+CMK_LD="eval $CMK_CC "
+CMK_LIBS='-lckqt'
+CMK_LD_LIBRARY_PATH="-rpath $CHARMLIBSO/ $PMI_LIBS $UGNI_LIBS"
+
+CMK_QT="generic64"
+
+# compiler for compiling sequential programs
+if test -n "$PGCC"
+then
+CMK_CC="$CMK_CC -DCMK_FIND_FIRST_OF_PREDICATE=1 "
+CMK_CXX="$CMK_CXX -DCMK_FIND_FIRST_OF_PREDICATE=1 --no_using_std "
+# gcc is needed for building QT
+CMK_SEQ_CC="gcc "
+CMK_SEQ_CXX="pgCC  --no_using_std "
+elif test -n "$ICPC"
+then
+CMK_SEQ_CC="cc -fPIC "
+CMK_SEQ_CXX="CC -fPIC "
+else
+CMK_SEQ_CC="gcc "
+CMK_SEQ_CXX="g++ "
+fi
+CMK_SEQ_LD="$CMK_SEQ_CC "
+CMK_SEQ_LDXX="$CMK_SEQ_CXX "
+CMK_SEQ_LIBS=""
+
+# compiler for native programs
+CMK_NATIVE_CC="gcc "
+CMK_NATIVE_LD="gcc "
+CMK_NATIVE_CXX="g++ "
+CMK_NATIVE_LDXX="g++ "
+CMK_NATIVE_LIBS=""
+
+CMK_RANLIB="ranlib"
+
+# for F90 compiler
+CMK_CF77="ftn "
+CMK_CF90="ftn "
+CMK_F90LIBS=""
+CMK_F90_USE_MODDIR=1
+CMK_F90_MODINC="-I"
+CMK_MOD_EXT="mod"
+
diff --git a/src/arch/gni/Makefile.machine b/src/arch/gni/Makefile.machine
new file mode 100644 (file)
index 0000000..699aa9a
--- /dev/null
@@ -0,0 +1,5 @@
+$(L)/libconv-cplus-n.a: machine.h machine.c machine-common-core.c machine-broadcast.c machine-lrts.h machine-pxshm.c machine-xpmem.c machine-persistent.c machine-commthd-util.c machine-smp.c pcqueue.h  $(L)/cray_tlbhack.o
+
+$(L)/cray_tlbhack.o: cray_tlbhack.c
+       $(CHARMC) -o $@ cray_tlbhack.c
+
diff --git a/src/arch/gni/README b/src/arch/gni/README
new file mode 100644 (file)
index 0000000..b2106a2
--- /dev/null
@@ -0,0 +1,76 @@
+This directory contains a template machine layer, with a description of what
+each file should contain.
+
+In order to create a new machine layer, create a new directory with the name of
+the architecture. Copy the four files contained in this directory, and edit them
+to implement the desired functions. To implement all the functions, other files
+may be created, taking care that none of them collides with files already in
+charm.
+
+Instead, in order to add a new suboption to an existing machine, for example
+adding a new option "smp" to the architecture "elan", create two new files
+inside the directory elan called "conv-mach-smp.h" and "conv-mach-smp.sh" with
+the same meaning as the two conv-mach files. In this case, the normal
+"conv-mach.h" file will be included into the code, followed by the specific
+"conv-mach-smp.h" file. Thus, this second file need to take care of the
+definitions already made by the former one.
+
+The decision whether to implement a fully new machine.c, or to base it on an
+already existing one, should be made primarily on what will start the job on the
+target machine: for example, the program directly (charmrun in net- versions),
+mpirun in mpi- versions, or qsub/psub or any other scheduler. In particular, if
+it is required that the program itself starts the job, then a net- version is a
+good option.
+
+The files contain the following information:
+
+* conv-mach.sh
+
+       contains the definitions of the compilers available in the build
+system, and the additional flags and options to pass to the commands. This file
+is read by the build shell script.
+
+
+* conv-common.h
+
+       contains the common definitions of the architecture, specified as
+"#define"s. The main options are described in the template. This file will be
+included into the code by conv-config.h, which is itself included into converse.h
+(together with conv-mach.h and the other generated configurations).
+
+
+* conv-mach.h
+
+       this file is similar in structure to conv-common.h, but it allows
+specifying subarchitectures (like 'gm' in "./build charm++ net-linux gm"). In this
+way, a single conv-common.h will contain the common definitions for all the
+architectures, a single machine.c will contain the common code, and the flags in
+this file can select the code to be compiled.
+
+
+* machine.c
+
+       this file contains the implementation of the architecture in its
+totality, meaning that subarchitectures (specified with "conv-mach-option.h"
+files) will all link to this file, and it will need to have switches to
+selectively compile the code. These are typically in the form of:
+ "#if CMK_OPTION_IN_CONV_MACH_H". The template file contains the description of
+the methods that should be implemented, also considering the values set in the
+two header files (conv-common/conv-mach).
+
+
+Three other files are needed during the charm build: conv-mach-pre.h,
+conv-mach-opt.h and conv-mach-opt.sh, which are automatically generated by the
+build script from the properties retrieved from the system and the compile-time
+options specified.
+
+
+*** A more detailed description of which files are linked into charm. ***
+
+When charm builds, the tmp directory is created and all source files are linked
+inside it. Regarding the arch files, only two directories are linked: the base
+directory and the architecture directory. For example, if compiling
+"net-linux", the base directory will be "net" and the architecture directory
+"net-linux". This implies that the machine.c file present in the "net" directory
+will be linked (together with all the conv-mach* files and conv-common.h), but
+any files present in "net-linux" will override them.
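As a minimal sketch of the suboption mechanism described in the README above
(the option name "myopt" and the flags it sets are purely illustrative, not part
of this commit), a conv-mach-myopt.sh placed next to conv-mach.sh would only
append to the definitions already made there, in the same way the
conv-mach-smp.sh files added by this commit do:

    # conv-mach-myopt.sh -- sourced by the build script after conv-mach.sh
    CMK_DEFS="$CMK_DEFS -DCMK_MY_OPT=1"   # extra preprocessor flag for this option
    CMK_CC="$CMK_CC $CMK_DEFS"            # propagate it to the C compiler
    CMK_CXX="$CMK_CXX $CMK_DEFS"          # ...and to the C++ compiler
    CMK_LIBS="$CMK_LIBS -lrt"             # extra libraries, if the option needs any

The matching conv-mach-myopt.h would then #undef/#define whatever conv-mach.h
already set, following the pattern of the conv-mach-smp.h and conv-mach-pxshm.h
files in this commit.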
diff --git a/src/arch/gni/conv-common.h b/src/arch/gni/conv-common.h
new file mode 100644 (file)
index 0000000..43bf737
--- /dev/null
@@ -0,0 +1,63 @@
+
+#define CMK_USE_LRTS                                      1
+
+#define CMK_HAS_PARTITION                                  1
+
+#define CMK_CONVERSE_UGNI                                  1
+
+#define CMK_CMIDELIVERS_USE_COMMON_CODE                    1
+
+#define CMK_CMIPRINTF_IS_A_BUILTIN                         0
+
+#define CMI_MPI_TRACE_USEREVENTS                           0
+
+#define  CMK_DIRECT                                             0
+
+//#define  DELTA_COMPRESS                                     1
+
+#define CMK_HANDLE_SIGUSR                                  0
+
+#if DELTA_COMPRESS
+#if CMK_ERROR_CHECKING
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt2 seq; unsigned char cksum, magic; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; CmiUInt4 compressStart; CmiUInt2 compress_flag,xxhdl; CmiUInt8 persistRecvHandler; 
+#else
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; CmiUInt4 compressStart; CmiUInt2 compress_flag,xxhdl; CmiUInt8 persistRecvHandler; 
+#endif
+#else 
+#if CMK_ERROR_CHECKING
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt2 seq; unsigned char cksum, magic; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root;  
+#else
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root;  
+#endif
+#endif
+
+#define CMK_MSG_HEADER_BASIC  CMK_MSG_HEADER_EXT
+#define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
+#define CMK_MSG_HEADER_BIGSIM_    { CMK_MSG_HEADER_EXT_ CMK_BIGSIM_FIELDS }
+
+#define CMK_MULTICAST_GROUP_TYPE                struct { unsigned pe, id; }
+#define CMK_MULTICAST_DEF_USE_COMMON_CODE                  1
+#define CMK_MULTICAST_LIST_USE_COMMON_CODE                 1
+#define CMK_MULTICAST_GROUP_USE_COMMON_CODE                1
+
+#define CMK_RSH_IS_A_COMMAND                               0
+#define CMK_RSH_NOT_NEEDED                                 1
+#define CMK_RSH_USE_REMSH                                  0
+
+#define CMK_SPANTREE_MAXSPAN                               4
+#define CMK_SPANTREE_USE_COMMON_CODE                       1
+
+#define CMK_VECTOR_SEND_USES_COMMON_CODE                   1
+
+#define CMK_CCS_AVAILABLE                                  1
+
+#define NODE_0_IS_CONVHOST                                 1
+
+#define CMK_USE_OOB                                        0
+
+#define CMK_IMMEDIATE_MSG                                 1
+#define CMK_MACHINE_PROGRESS_DEFINED                       1
+
+#define CMK_LB_CPUTIMER                                           0
+
+
diff --git a/src/arch/gni/conv-common.sh b/src/arch/gni/conv-common.sh
new file mode 100644 (file)
index 0000000..5dd13cd
--- /dev/null
@@ -0,0 +1,3 @@
+
+CMK_BUILD_CRAY=1
+
diff --git a/src/arch/gni/conv-mach-hugepages.h b/src/arch/gni/conv-mach-hugepages.h
new file mode 100644 (file)
index 0000000..a71bdf7
--- /dev/null
@@ -0,0 +1 @@
+#define LARGEPAGE 1
diff --git a/src/arch/gni/conv-mach-hugepages.sh b/src/arch/gni/conv-mach-hugepages.sh
new file mode 100644 (file)
index 0000000..a6513c0
--- /dev/null
@@ -0,0 +1,9 @@
+##!/bin/sh -l
+
+# Check that some hugepages module is loaded
+if echo $LOADEDMODULES | grep -q craype-hugepages; then
+    true
+else
+    echo 'Must have a craype-hugepages module loaded (e.g. module load craype-hugepages8M)' >&2
+    exit 1
+fi
diff --git a/src/arch/gni/conv-mach-mlogft.h b/src/arch/gni/conv-mach-mlogft.h
new file mode 100644 (file)
index 0000000..b0e8305
--- /dev/null
@@ -0,0 +1,4 @@
+#define __FAULT__              1
+#define _FAULT_MLOG_           1
+#define CMK_CHARE_USE_PTR      1
+#define CMK_MESSAGE_LOGGING 1
diff --git a/src/arch/gni/conv-mach-mlogft.sh b/src/arch/gni/conv-mach-mlogft.sh
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/arch/gni/conv-mach-syncft.h b/src/arch/gni/conv-mach-syncft.h
new file mode 100644 (file)
index 0000000..5cb052b
--- /dev/null
@@ -0,0 +1,15 @@
+
+#undef CMK_MSG_HEADER_EXT_
+//#undef CMK_MSG_HEADER_EXT
+//#undef CMK_MSG_HEADER_BIGSIM_
+/* expand the header to store the restart phase counter(pn) */
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID,pn,d9; CmiInt4 root; 
+//#define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
+//#define CMK_MSG_HEADER_BIGSIM_    { CmiUInt2 d0,d1,d2,d3,d4,d5,hdl,xhdl,pn,info; int nd, n; double rt; CmiInt2 tID; CmiUInt2 hID; char t; int msgID; int srcPe;}
+//#define CMK_MSG_HEADER_BIGSIM_  { CMK_MSG_HEADER_EXT_ CMK_BIGSIM_FIELDS }
+
+#define CmiGetRestartPhase(m)       ((((CmiMsgHeaderExt*)m)->pn))
+
+#define __FAULT__                                         1
+
+#define CMK_MEM_CHECKPOINT                                1
diff --git a/src/arch/gni/conv-mach-syncft.sh b/src/arch/gni/conv-mach-syncft.sh
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/arch/gni/cray_tlbhack.c b/src/arch/gni/cray_tlbhack.c
new file mode 100644 (file)
index 0000000..d19c23b
--- /dev/null
@@ -0,0 +1,6 @@
+#include <unistd.h>
+
+int gethugepagesize()
+{
+    return getpagesize();
+}
diff --git a/src/arch/gni/machine-cmidirect.c b/src/arch/gni/machine-cmidirect.c
new file mode 100644 (file)
index 0000000..d7d4b31
--- /dev/null
@@ -0,0 +1,255 @@
+/** @file
+ * uGNI cmiDirect communication
+ * @ingroup Machine
+*/
+
+/*
+  included in machine.c
+  Yanhua Sun, 2/5/2012
+*/
+
+#define     CMI_DIRECT_DEBUG    0
+#include "cmidirect.h"
+CmiDirectMemoryHandler CmiDirect_registerMemory(void *buff, int size)
+{
+    CmiDirectMemoryHandler mem_hndl; 
+    gni_return_t        status;
+    status = registerMessage(buff, size, 0, &mem_hndl); 
+    //MEMORY_REGISTER(onesided_hnd, nic_hndl, buff, size, &mem_hndl, &omdh, status);
+    GNI_RC_CHECK("cmidirect register memory fails\n", status);
+    return mem_hndl;
+}
+static void printHandle(CmiDirectUserHandle *userHandle, char *s)
+{
+    CmiPrintf( "[%d]%s(%p)(%p,%p,%p)==>(%p,%p,%p)(%d)(%p,%p)\n", CmiMyPe(), s, userHandle, userHandle->localBuf, userHandle->localMdh.qword1, userHandle->localMdh.qword2, 
+        userHandle->remoteBuf, userHandle->remoteMdh.qword1, userHandle->remoteMdh.qword2, userHandle->transSize, userHandle->callbackFnPtr, userHandle->callbackData );
+}
+
+struct infiDirectUserHandle CmiDirect_createHandle_mem(CmiDirectMemoryHandler *mem_hndl, void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData)
+{
+    gni_return_t            status = GNI_RC_SUCCESS;
+    CmiDirectUserHandle userHandle;
+    userHandle.handle=1; 
+    userHandle.remoteNode= CmiMyNode();
+    userHandle.remoteRank = CmiMyRank();
+    userHandle.transSize=recvBufSize;
+    userHandle.remoteBuf=recvBuf;
+    userHandle.callbackFnPtr=callbackFnPtr;
+    userHandle.callbackData=callbackData;
+    userHandle.remoteMdh = *mem_hndl;
+    userHandle.initialValue=0;
+#if CMI_DIRECT_DEBUG
+    //printHandle(&userHandle, "Create Handler");
+#endif
+    return userHandle;
+
+}
+/**
+ To be called on the receiver to create a handle and return its number
+**/
+CmiDirectUserHandle CmiDirect_createHandle(int localNode,void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData,double initialValue) {
+
+    gni_return_t            status = GNI_RC_SUCCESS;
+    CmiDirectUserHandle userHandle;
+    userHandle.handle=1; 
+    userHandle.localNode=localNode;
+    userHandle.remoteNode= CmiMyNode();
+    userHandle.transSize=recvBufSize;
+    userHandle.remoteBuf=recvBuf;
+    userHandle.initialValue=initialValue;
+    userHandle.callbackFnPtr=callbackFnPtr;
+    userHandle.callbackData=callbackData;
+    if(recvBufSize <= SMSG_MAX_MSG)
+    {
+        status = registerMessage(userHandle.remoteBuf, recvBufSize, 0, &userHandle.remoteMdh); 
+        //MEMORY_REGISTER(onesided_hnd, nic_hndl, userHandle.remoteBuf, recvBufSize, &(userHandle.remoteMdh), &omdh, status);
+    }
+    else if(IsMemHndlZero((GetMemHndl(userHandle.remoteBuf)))){
+        //status = registerMempool(userHandle.remoteBuf);
+        userHandle.remoteMdh = GetMemHndl(userHandle.remoteBuf);
+    } else
+        userHandle.remoteMdh = GetMemHndl(userHandle.remoteBuf);
+    if(status != GNI_RC_SUCCESS) {
+        userHandle.remoteMdh.qword1 = 0;
+        userHandle.remoteMdh.qword2 = 0;
+    }
+
+#if REMOTE_EVENT
+    userHandle.ack_index =  IndexPool_getslot(&ackPool, userHandle.remoteBuf, 1);
+#endif
+#if CMI_DIRECT_DEBUG
+    //printHandle(&userHandle, "Create Handler");
+#endif
+    return userHandle;
+}
+
+void CmiDirect_saveHandler(CmiDirectUserHandle* h, void *ptr)
+{
+    h->remoteHandler = ptr;
+}
+
+void CmiDirect_assocLocalBuffer_mem(CmiDirectUserHandle *userHandle, CmiDirectMemoryHandler *mem_hndl, void *sendBuf,int sendBufSize) {
+    gni_return_t            status = GNI_RC_SUCCESS;
+    
+    userHandle->localNode=CmiMyNode();
+    userHandle->localBuf=sendBuf;
+
+    userHandle->localMdh = *mem_hndl;
+#if CMI_DIRECT_DEBUG
+    printHandle(userHandle, "Associate Handler");
+#endif
+}
+/****
+ To be called on the local node to attach the local buffer to this handle
+******/
+
+void CmiDirect_assocLocalBuffer(CmiDirectUserHandle *userHandle,void *sendBuf,int sendBufSize) {
+
+    /* one-sided primitives would require registration of memory */
+    gni_return_t            status = GNI_RC_SUCCESS;
+    
+    userHandle->localNode=CmiMyNode();
+    userHandle->localBuf=sendBuf;
+
+    if(userHandle->transSize <= SMSG_MAX_MSG)
+    {
+        status = registerMessage(userHandle->localBuf, userHandle->transSize, 0, &(userHandle->localMdh)); 
+        //MEMORY_REGISTER(onesided_hnd, nic_hndl, userHandle->localBuf, userHandle->transSize, &userHandle->localMdh, &omdh, status);
+    }
+    else if(IsMemHndlZero((GetMemHndl(userHandle->localBuf)))){
+        //status = registerMempool(userHandle->localBuf);
+        userHandle->localMdh = GetMemHndl(userHandle->localBuf);
+    } else
+        userHandle->localMdh = GetMemHndl(userHandle->localBuf);
+   
+    if(status != GNI_RC_SUCCESS) {
+        userHandle->localMdh.qword1 = 0;
+        userHandle->localMdh.qword2 = 0;
+    }
+
+#if CMI_DIRECT_DEBUG
+    printHandle(userHandle, "Associate Handler");
+#endif
+}
+
+/****
+To be called on the local node to do the actual data transfer
+******/
+void CmiDirect_put(CmiDirectUserHandle *userHandle) {
+
+    gni_post_descriptor_t *pd;
+
+#if USE_LRTS_MEMPOOL
+    if (userHandle->remoteNode== CmiMyNode()) {
+        CmiMemcpy(userHandle->remoteBuf,userHandle->localBuf,userHandle->transSize);
+        (*(userHandle->callbackFnPtr))(userHandle->callbackData);
+    } else {
+        gni_return_t status;
+        RDMA_REQUEST        *rdma_request_msg;
+        MallocPostDesc(pd);
+        if(userHandle->transSize <= LRTS_GNI_RDMA_THRESHOLD)
+            pd->type            = GNI_POST_FMA_PUT;
+        else
+            pd->type            = GNI_POST_RDMA_PUT;
+        pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+        pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+        pd->length          = userHandle->transSize;
+        pd->local_addr      = (uint64_t) (userHandle->localBuf);
+        pd->local_mem_hndl  = userHandle->localMdh; 
+        pd->remote_addr     = (uint64_t)(userHandle->remoteBuf);
+        pd->remote_mem_hndl = userHandle->remoteMdh;
+        pd->src_cq_hndl     = 0;
+        pd->rdma_mode       = 0;
+        pd->first_operand   = (uint64_t)(userHandle->remoteHandler);
+        pd->amo_cmd         = 1;
+        pd->cqwrite_value   = DIRECT_SEQ;
+#if REMOTE_EVENT
+        bufferRdmaMsg(sendRdmaBuf, CmiGetNodeGlobal(userHandle->remoteNode,CmiMyPartition()), pd, userHandle->ack_index); 
+#else
+        bufferRdmaMsg(sendRdmaBuf, CmiGetNodeGlobal(userHandle->remoteNode,CmiMyPartition()), pd, -1); 
+#endif
+#if CMI_DIRECT_DEBUG
+        printHandle(userHandle, "After Direct_put");
+        CmiPrintf("[%d] RDMA put %d,%d bytes addr %p to remoteNode %d:%p \n\n",CmiMyPe(), userHandle->transSize, pd->length, (void*)(pd->local_addr), userHandle->remoteNode, (void*) (pd->remote_addr));
+#endif
+    }
+#else
+    CmiPrintf("Normal Send in CmiDirect Put\n");
+    CmiAbort("");
+#endif
+
+
+}
+
+// needs to figure out what is local/remote
+void CmiDirect_get(CmiDirectUserHandle *userHandle) {
+
+    gni_post_descriptor_t *pd;
+
+#if USE_LRTS_MEMPOOL
+    if (userHandle->remoteNode== CmiMyNode()) {
+        CmiMemcpy(userHandle->remoteBuf,userHandle->localBuf,userHandle->transSize);
+        (*(userHandle->callbackFnPtr))(userHandle->callbackData);
+    } else {
+        gni_return_t status;
+        RDMA_REQUEST        *rdma_request_msg;
+        MallocPostDesc(pd);
+        if(userHandle->transSize <= LRTS_GNI_RDMA_THRESHOLD)
+            pd->type            = GNI_POST_FMA_GET;
+        else
+            pd->type            = GNI_POST_RDMA_GET;
+        pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+        pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+        pd->length          = userHandle->transSize;
+        pd->local_addr      = (uint64_t) (userHandle->localBuf);
+        pd->local_mem_hndl  = userHandle->localMdh; 
+        pd->remote_addr     = (uint64_t)(userHandle->remoteBuf);
+        pd->remote_mem_hndl = userHandle->remoteMdh;
+        pd->src_cq_hndl     = 0;
+        pd->rdma_mode       = 0;
+        pd->first_operand   = (uint64_t) (userHandle->callbackFnPtr);
+        pd->second_operand  = (uint64_t) (userHandle->callbackData);
+        pd->amo_cmd         = 2;
+        pd->cqwrite_value   = DIRECT_SEQ;
+#if REMOTE_EVENT
+        bufferRdmaMsg(sendRdmaBuf, CmiGetNodeGlobal(userHandle->remoteNode,CmiMyPartition()), pd, userHandle->ack_index); 
+#else
+        bufferRdmaMsg(sendRdmaBuf, CmiGetNodeGlobal(userHandle->remoteNode,CmiMyPartition()), pd, -1);
+#endif
+#if CMI_DIRECT_DEBUG
+    CmiPrintf("[%d] RDMA get %d,%d bytes addr %p to remoteNode %d:%p \n\n",CmiMyPe(), userHandle->transSize, pd->length, (void*)(pd->local_addr), userHandle->remoteNode, (void*) (pd->remote_addr));
+#endif
+    }
+#else
+    CmiPrintf("Normal Send in CmiDirect Get\n");
+    CmiAbort("");
+#endif
+
+
+
+}
+
+/**** up to the user to safely call this */
+void CmiDirect_deassocLocalBuffer(CmiDirectUserHandle *userHandle) {
+
+
+}
+
+/**** up to the user to safely call this */
+void CmiDirect_destroyHandle(CmiDirectUserHandle *userHandle) {
+    free(userHandle);
+}
+
+/**** Should not be called the first time *********/
+void CmiDirect_ready(CmiDirectUserHandle *userHandle) {
+}
+
+/**** Should not be called the first time *********/
+void CmiDirect_readyPollQ(CmiDirectUserHandle *userHandle) {
+}
+
+/**** Should not be called the first time *********/
+void CmiDirect_readyMark(CmiDirectUserHandle *userHandle) {
+}
+
diff --git a/src/arch/gni/machine-persistent.c b/src/arch/gni/machine-persistent.c
new file mode 100644 (file)
index 0000000..90957cd
--- /dev/null
@@ -0,0 +1,405 @@
+/** @file
+ * uGNI persistent communication
+ * @ingroup Machine
+*/
+
+/*
+  included in machine.c
+  Gengbin Zheng, 9/6/2011
+*/
+
+/*
+  machine specific persistent comm functions:
+  * LrtsSendPersistentMsg
+  * CmiSyncSendPersistent
+  * PumpPersistent
+  * PerAlloc PerFree      // persistent message memory allocation/free functions
+  * persist_machine_init  // machine specific initialization call
+*/
+
+#if   PERSISTENT_GET_BASE
+void LrtsSendPersistentMsg(PersistentHandle h, int destNode, int size, void *msg)
+{
+    PersistentSendsTable *slot = (PersistentSendsTable *)h;
+    int         destIndex; 
+    uint8_t tag = LMSG_PERSISTENT_INIT_TAG;
+    SMSG_QUEUE *queue = &smsg_queue;
+
+    if (size > slot->sizeMax) {
+        CmiPrintf("size: %d sizeMax: %d mype=%d destPe=%d\n", size, slot->sizeMax, CmiMyPe(), destNode);
+        CmiAbort("Abort: Invalid size\n");
+    }
+
+    destIndex = slot->addrIndex;
+    if (slot->destBuf[destIndex].destAddress) {
+        slot->addrIndex = (destIndex+1)%PERSIST_BUFFERS_NUM;
+#if  DELTA_COMPRESS
+        if(slot->compressFlag)
+            size = ALIGN64(CompressPersistentMsg(h, size, msg));
+#endif
+        LrtsPrepareEnvelope(msg, size);
+        CONTROL_MSG *control_msg_tmp =  construct_control_msg(size, msg, -1);
+        control_msg_tmp -> dest_addr = (uint64_t)slot->destBuf[destIndex].destAddress;
+        control_msg_tmp -> dest_mem_hndl = slot->destBuf[destIndex].mem_hndl;
+        registerMessage(msg, size, 0, &(control_msg_tmp -> source_mem_hndl));
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, tag);
+
+        MACHSTATE4(8, "[%d==%d]LrtsPersistent Sending %lld=>%lld\n", CmiMyNode(), destNode, msg, control_msg_tmp -> dest_addr);
+    }
+    else
+    {
+#if 1
+        if (slot->messageBuf != NULL) {
+            CmiPrintf("Unexpected message in buffer on %d\n", CmiMyPe());
+            CmiAbort("");
+        }
+        slot->messageBuf = msg;
+        slot->messageSize = size;
+#else
+        /* normal send */
+        PersistentHandle  *phs_tmp = phs;
+        int phsSize_tmp = phsSize;
+        phs = NULL; phsSize = 0;
+        CmiPrintf("[%d]Slot sending message directly\n", CmiMyPe());
+        CmiSyncSendAndFree(slot->destPE, size, msg);
+        phs = phs_tmp; phsSize = phsSize_tmp;
+#endif
+    }
+}
+#else
+void LrtsSendPersistentMsg(PersistentHandle h, int destNode, int size, void *m)
+{
+    gni_post_descriptor_t *pd;
+    gni_return_t status;
+    RDMA_REQUEST        *rdma_request_msg;
+    int         destIndex; 
+    PersistentSendsTable *slot = (PersistentSendsTable *)h;
+    if (h==NULL) {
+        printf("[%d] LrtsSendPersistentMsg: handle from node %d to node %d is NULL. \n", CmiMyPe(), myrank, destNode);
+        CmiAbort("LrtsSendPersistentMsg: not a valid PersistentHandle");
+    }
+    if (size > slot->sizeMax) {
+        CmiPrintf("size: %d sizeMax: %d mype=%d destPe=%d\n", size, slot->sizeMax, CmiMyPe(), destNode);
+        CmiAbort("Abort: Invalid size\n");
+    }
+
+    destIndex = slot->addrIndex;
+    if (slot->destBuf[destIndex].destAddress) {
+         //CmiPrintf("[%d===%d] LrtsSendPersistentMsg h=%p hdl=%d destNode=%d destAddress=%p size=%d\n", CmiMyPe(), destNode, h, CmiGetHandler(m), destNode, slot->destBuf[0].destAddress, size);
+
+        // uGNI part
+    
+        slot->addrIndex = (destIndex+1)%PERSIST_BUFFERS_NUM;
+#if  DELTA_COMPRESS
+        if(slot->compressFlag)
+            size = ALIGN64(CompressPersistentMsg(h, size, m));
+#endif
+        MallocPostDesc(pd);
+        if(size <= LRTS_GNI_RDMA_THRESHOLD) {
+            pd->type            = GNI_POST_FMA_PUT;
+        }
+        else
+        {
+            pd->type            = GNI_POST_RDMA_PUT;
+        }
+        pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+        pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+        pd->length          = ALIGN64(size);
+        pd->local_addr      = (uint64_t) m;
+       
+        pd->remote_addr     = (uint64_t)slot->destBuf[destIndex].destAddress;
+        pd->remote_mem_hndl = slot->destBuf[destIndex].mem_hndl;
+#if MULTI_THREAD_SEND
+        pd->src_cq_hndl     = rdma_tx_cqh;
+#else
+        pd->src_cq_hndl     = 0;
+#endif
+        pd->rdma_mode       = 0;
+        pd->cqwrite_value   = PERSIST_SEQ;
+        pd->amo_cmd         = 0;
+
+#if CMK_WITH_STATS 
+        pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
+        SetMemHndlZero(pd->local_mem_hndl);
+
+        //CmiPrintf("[%d] sending   %p  with handler=%p\n", CmiMyPe(), m, ((CmiMsgHeaderExt*)m)-> persistRecvHandler);
+        //TRACE_COMM_CREATION(CpvAccess(projTraceStart), (void*)pd->local_addr);
+         /* always buffer */
+#if CMK_SMP || 1
+#if REMOTE_EVENT
+        bufferRdmaMsg(sendHighPriorBuf, destNode, pd, (int)(size_t)(slot->destHandle));
+#else
+        bufferRdmaMsg(sendHighPriorBuf, destNode, pd, -1);
+#endif
+
+#else                      /* non smp */
+
+#if REMOTE_EVENT
+        pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+        int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, PERSIST_EVENT((int)(size_t)(slot->destHandle)));
+        GNI_RC_CHECK("GNI_EpSetEventData", sts);
+#endif
+        status = registerMessage((void*)(pd->local_addr), pd->length, pd->cqwrite_value, &pd->local_mem_hndl);
+        if (status == GNI_RC_SUCCESS) 
+        {
+#if CMK_WITH_STATS
+            RDMA_TRY_SEND(pd->type)
+#endif
+            if(pd->type == GNI_POST_RDMA_PUT) 
+                status = GNI_PostRdma(ep_hndl_array[destNode], pd);
+            else
+                status = GNI_PostFma(ep_hndl_array[destNode],  pd);
+        }
+        else
+            status = GNI_RC_ERROR_RESOURCE;
+        if(status == GNI_RC_ERROR_RESOURCE|| status == GNI_RC_ERROR_NOMEM )
+        {
+#if REMOTE_EVENT
+            bufferRdmaMsg(sendRdmaBuf, destNode, pd, (int)(size_t)(slot->destHandle));
+#else
+            bufferRdmaMsg(sendRdmaBuf, destNode, pd, -1);
+#endif
+        }
+        else {
+            GNI_RC_CHECK("AFter posting", status);
+#if  CMK_WITH_STATS
+            pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+            RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
+        }
+#endif
+  }
+  else {
+#if 1
+    if (slot->messageBuf != NULL) {
+      CmiPrintf("Unexpected message in buffer on %d\n", CmiMyPe());
+      CmiAbort("");
+    }
+    slot->messageBuf = m;
+    slot->messageSize = size;
+#else
+    /* normal send */
+    PersistentHandle  *phs_tmp = phs;
+    int phsSize_tmp = phsSize;
+    phs = NULL; phsSize = 0;
+    CmiPrintf("[%d]Slot sending message directly\n", CmiMyPe());
+    CmiSyncSendAndFree(slot->destPE, size, m);
+    phs = phs_tmp; phsSize = phsSize_tmp;
+#endif
+  }
+}
+
+#endif
+
+#if 0
+void CmiSyncSendPersistent(int destPE, int size, char *msg, PersistentHandle h)
+{
+  CmiState cs = CmiGetState();
+  char *dupmsg = (char *) CmiAlloc(size);
+  memcpy(dupmsg, msg, size);
+
+  /*  CmiPrintf("Setting root to %d\n", 0); */
+  CMI_SET_BROADCAST_ROOT(dupmsg, 0);
+
+  if (cs->pe==destPE) {
+    CQdCreate(CpvAccess(cQdState), 1);
+    CdsFifo_Enqueue(CpvAccess(CmiLocalQueue),dupmsg);
+  }
+  else
+    LrtsSendPersistentMsg(h, destPE, size, dupmsg);
+}
+#endif
+
+extern void CmiReference(void *blk);
+
+#if 0
+
+/* called in PumpMsgs */
+int PumpPersistent()
+{
+  int status = 0;
+  PersistentReceivesTable *slot = persistentReceivesTableHead;
+  while (slot) {
+    char *msg = slot->messagePtr[0];
+    int size = *(slot->recvSizePtr[0]);
+    if (size)
+    {
+      int *footer = (int*)(msg + size);
+      if (footer[0] == size && footer[1] == 1) {
+/*CmiPrintf("[%d] PumpPersistent messagePtr=%p size:%d\n", CmiMyPe(), slot->messagePtr, size);*/
+
+#if 0
+      void *dupmsg;
+      dupmsg = CmiAlloc(size);
+                                                                                
+      _MEMCHECK(dupmsg);
+      memcpy(dupmsg, msg, size);
+      memset(msg, 0, size+2*sizeof(int));
+      msg = dupmsg;
+#else
+      /* return messagePtr directly and user MUST make sure not to delete it. */
+      /*CmiPrintf("[%d] %p size:%d rank:%d root:%d\n", CmiMyPe(), msg, size, CMI_DEST_RANK(msg), CMI_BROADCAST_ROOT(msg));*/
+
+      CmiReference(msg);
+      swapRecvSlotBuffers(slot);
+#endif
+
+      CmiPushPE(CMI_DEST_RANK(msg), msg);
+#if CMK_BROADCAST_SPANNING_TREE
+      if (CMI_BROADCAST_ROOT(msg))
+          SendSpanningChildren(size, msg);
+#endif
+      /* clear footer after message used */
+      *(slot->recvSizePtr[0]) = 0;
+      footer[0] = footer[1] = 0;
+
+#if 0
+      /* not safe at all! */
+      /* instead of clear before use, do it earlier */
+      msg=slot->messagePtr[0];
+      size = *(slot->recvSizePtr[0]);
+      footer = (int*)(msg + size);
+      *(slot->recvSizePtr[0]) = 0;
+      footer[0] = footer[1] = 0;
+#endif
+      status = 1;
+      }
+    }
+    slot = slot->next;
+  }
+  return status;
+}
+
+#endif
+
+#if ! LARGEPAGE
+#error "Persistent communication must be compiled with LARGEPAGE on"
+#endif
+
+void *PerAlloc(int size)
+{
+#if CMK_PERSISTENT_COMM_PUT
+//  return CmiAlloc(size);
+  gni_return_t status;
+  void *res = NULL;
+  char *ptr;
+  size = ALIGN64(size + sizeof(CmiChunkHeader));
+  //printf("[%d] PerAlloc %p %p %d. \n", myrank, res, ptr, size);
+  res = mempool_malloc(CpvAccess(persistent_mempool), ALIGNBUF+size-sizeof(mempool_header), 1);
+  if (res) ptr = (char*)res - sizeof(mempool_header) + ALIGNBUF;
+  SIZEFIELD(ptr)=size;
+  REFFIELD(ptr)= PERSIST_SEQ;
+  return ptr;
+#else
+  char *ptr = CmiAlloc(size);
+  return ptr;
+#endif
+}
+                                                                                
+void PerFree(char *msg)
+{
+#if CMK_SMP
+  mempool_free_thread((char*)msg - ALIGNBUF + sizeof(mempool_header));
+#else
+  mempool_free(CpvAccess(persistent_mempool), (char*)msg - ALIGNBUF + sizeof(mempool_header));
+#endif
+}
+
+/* machine dependent init call */
+void persist_machine_init(void)
+{
+}
+
+void initSendSlot(PersistentSendsTable *slot)
+{
+  int i;
+  slot->destPE = -1;
+  slot->sizeMax = 0;
+  slot->destHandle = 0; 
+#if 0
+  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
+    slot->destAddress[i] = NULL;
+    slot->destSizeAddress[i] = NULL;
+  }
+#endif
+  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);
+  slot->messageBuf = 0;
+  slot->messageSize = 0;
+  slot->prev = slot->next = NULL;
+}
+
+void initRecvSlot(PersistentReceivesTable *slot)
+{
+  int i;
+#if 0
+  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
+    slot->messagePtr[i] = NULL;
+    slot->recvSizePtr[i] = NULL;
+  }
+#endif
+  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);
+  slot->sizeMax = 0;
+  slot->index = -1;
+  slot->prev = slot->next = NULL;
+}
+
+void setupRecvSlot(PersistentReceivesTable *slot, int maxBytes)
+{
+  int i;
+  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
+      char *buf = PerAlloc(maxBytes+sizeof(int)*2);
+      _MEMCHECK(buf);
+      memset(buf, 0, maxBytes+sizeof(int)*2);
+      /* used large page and from mempool, memory always registered */
+    slot->destBuf[i].mem_hndl = GetMemHndl(buf);
+    slot->destBuf[i].destAddress = buf;
+      /* note: assume first integer in elan converse header is the msg size */
+    slot->destBuf[i].destSizeAddress = (unsigned int*)buf;
+    memset(buf, 0, maxBytes+sizeof(int)*2);
+  }
+  slot->sizeMax = maxBytes;
+  slot->addrIndex = 0;
+#if CMK_PERSISTENT_COMM_PUT
+#if REMOTE_EVENT
+#if !MULTI_THREAD_SEND
+  CmiLock(persistPool.lock);    /* lock in function */
+#endif
+  slot->index = IndexPool_getslot(&persistPool, slot, 2);
+#if !MULTI_THREAD_SEND
+  CmiUnlock(persistPool.lock);
+#endif
+#endif
+#endif
+}
+
+void clearRecvSlot(PersistentReceivesTable *slot)
+{
+#if CMK_PERSISTENT_COMM_PUT
+#if REMOTE_EVENT
+#if !MULTI_THREAD_SEND
+  CmiLock(persistPool.lock);
+#endif
+  IndexPool_freeslot(&persistPool, slot->index);
+#if !MULTI_THREAD_SEND
+  CmiUnlock(persistPool.lock);
+#endif
+#endif
+#endif
+}
+
+PersistentHandle getPersistentHandle(PersistentHandle h, int toindex)
+{
+#if REMOTE_EVENT
+  if (toindex)
+    return (PersistentHandle)(((PersistentReceivesTable*)h)->index);
+  else {
+    CmiLock(persistPool.lock);
+    PersistentHandle ret = (PersistentHandle)GetIndexAddress(persistPool, (int)(size_t)h);
+    CmiUnlock(persistPool.lock);
+    return ret;
+  }
+#else
+  return h;
+#endif
+}
diff --git a/src/arch/gni/machine-persistent.h b/src/arch/gni/machine-persistent.h
new file mode 100644 (file)
index 0000000..218d72b
--- /dev/null
@@ -0,0 +1,85 @@
+/** @file
+ * General implementation of persistent communication support
+ * @ingroup Machine
+ */
+
+/**
+ * \addtogroup Machine
+*/
+/*@{*/
+
+#include "gni_pub.h"
+#define PERSIST_MIN_SIZE                SMSG_MAX_MSG
+//#define COPY_HISTORY                          1
+// one buffer is for receiving, the other stores the previous msg
+#if DELTA_COMPRESS
+#if COPY_HISTORY 
+#define PERSIST_BUFFERS_NUM             1
+#else
+#define PERSIST_BUFFERS_NUM             2
+#endif
+#else
+#define PERSIST_BUFFERS_NUM             1
+#endif
+
+#define PERSIST_SEQ                     0xFFFFFFF
+
+#define IS_PERSISTENT_MEMORY(ptr)          (REFFIELD(msg) > PERSIST_SEQ/2)
+
+typedef struct  _PersistentBuf {
+  void *destAddress;
+  void *destSizeAddress;
+  gni_mem_handle_t    mem_hndl;
+} PersistentBuf;
+
+typedef struct _PersistentSendsTable {
+  int destPE;
+  int sizeMax;
+  PersistentHandle   destHandle; 
+  PersistentBuf     destBuf[PERSIST_BUFFERS_NUM];
+  void *messageBuf;
+  int messageSize;
+  struct _PersistentSendsTable *prev, *next;
+#if DELTA_COMPRESS
+  PersistentHandle destDataHandle;
+  void  *previousMsg;
+  int   previousSize;
+  int   compressStart;
+  int   compressSize;
+  int  dataType;
+  int   compressFlag;
+#endif
+  int addrIndex;
+} PersistentSendsTable;
+
+typedef struct _PersistentReceivesTable {
+  PersistentBuf     destBuf[PERSIST_BUFFERS_NUM];
+  int sizeMax;
+  size_t               index;
+  struct _PersistentReceivesTable *prev, *next;
+  int           addrIndex;
+#if DELTA_COMPRESS
+  int   compressStart;
+  int  dataType;
+  void  *history;
+#endif
+} PersistentReceivesTable;
+
+CpvExtern(PersistentReceivesTable *, persistentReceivesTableHead);
+CpvExtern(PersistentReceivesTable *, persistentReceivesTableTail);
+
+CpvExtern(PersistentHandle *, phs);
+CpvExtern(int, phsSize);
+CpvExtern(int, curphs);
+
+PersistentHandle getPersistentHandle(PersistentHandle h, int toindex);
+void *PerAlloc(int size);
+void PerFree(char *msg);
+int PumpPersistent();
+void swapSendSlotBuffers(PersistentSendsTable *slot);
+void swapRecvSlotBuffers(PersistentReceivesTable *slot);
+void setupRecvSlot(PersistentReceivesTable *slot, int maxBytes);
+void clearRecvSlot(PersistentReceivesTable *slot);
+
+/*@}*/
+
diff --git a/src/arch/gni/machine.c b/src/arch/gni/machine.c
new file mode 100644 (file)
index 0000000..652bb89
--- /dev/null
@@ -0,0 +1,4242 @@
+
+/** @file
+ * GNI machine layer
+ *
+ * Author:   Yanhua Sun
+             Gengbin Zheng
+ * Date:   07-01-2011
+ *
+ *  Flow control by mem pool using environment variables:
+
+    # CHARM_UGNI_MEMPOOL_MAX can be maximum_register_mem/number_of_processes
+    # CHARM_UGNI_SEND_MAX can be half of CHARM_UGNI_MEMPOOL_MAX
+    export CHARM_UGNI_MEMPOOL_INIT_SIZE=8M
+    export CHARM_UGNI_MEMPOOL_MAX=20M
+    export CHARM_UGNI_SEND_MAX=10M
+
+    # limit on the total mempool size allocated; this prevents the mempool
+    # from using too much memory
+    export CHARM_UGNI_MEMPOOL_SIZE_LIMIT=512M 
+
+    other environment variables:
+
+    export CHARM_UGNI_NO_DEADLOCK_CHECK=yes    # disable deadlock checking
+    export CHARM_UGNI_MAX_MEMORY_ON_NODE=0.8G  # max memory per node for mempool
+    export CHARM_UGNI_BIG_MSG_SIZE=4M          # threshold for the big-message protocol
+    export CHARM_UGNI_BIG_MSG_PIPELINE_LEN=4   # big-message pipeline length
+    export CHARM_UGNI_RDMA_MAX=100             # max pending RDMA operations
+ */
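+
+/* Worked example (illustrative only, not from the original sources): following
+ * the guideline above, a node with roughly 32G of registrable memory and 16
+ * processes per node would use approximately
+ *     CHARM_UGNI_MEMPOOL_MAX ~ 32G / 16 = 2G
+ *     CHARM_UGNI_SEND_MAX    ~ 2G / 2   = 1G
+ * The appropriate limits depend on the system's actual registration resources.
+ */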
+/*@{*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/dir.h>
+#include <sys/stat.h>
+#include <gni_pub.h>
+#include <pmi.h>
+//#include <numatoolkit.h>
+
+#include "converse.h"
+
+#if CMK_DIRECT
+#define DIRECT_SEQ 0xFFFFFFE 
+#include "cmidirect.h"
+#endif
+
+#if !defined(LARGEPAGE)
+#define     LARGEPAGE              0
+#endif
+
+#if CMK_SMP
+#define MULTI_THREAD_SEND          0
+#define COMM_THREAD_SEND           (!MULTI_THREAD_SEND)
+#endif
+
+#if MULTI_THREAD_SEND
+#define CMK_WORKER_SINGLE_TASK     1
+#endif
+
+#define REMOTE_EVENT               1
+#define CQWRITE                    0
+
+#define CMI_EXERT_SEND_LARGE_CAP   0
+#define CMI_EXERT_RECV_RDMA_CAP    0
+
+
+#define CMI_SENDBUFFERSMSG_CAP            0
+#define CMI_PUMPNETWORKSMSG_CAP           0
+#define CMI_PUMPREMOTETRANSACTIONS_CAP    0
+#define CMI_PUMPLOCALTRANSACTIONS_CAP     0
+
+#if CMI_SENDBUFFERSMSG_CAP
+int     SendBufferMsg_cap  = 20;
+#endif
+
+#if CMI_PUMPNETWORKSMSG_CAP
+int     PumpNetworkSmsg_cap = 20;
+#endif
+
+#if CMI_PUMPREMOTETRANSACTIONS_CAP
+int     PumpRemoteTransactions_cap = 20;
+#endif
+
+#if CMI_PUMPLOCALTRANSACTIONS_CAP
+int     PumpLocalTransactions_cap = 15;
+#endif
+
+#if CMI_EXERT_SEND_LARGE_CAP
+static int SEND_large_cap = 20;
+static int SEND_large_pending = 0;
+#endif
+
+#if CMI_EXERT_RECV_RDMA_CAP
+static int   RDMA_cap =   10;
+static int   RDMA_pending = 0;
+#endif
+
+#define USE_LRTS_MEMPOOL                  1
+
+#define PRINT_SYH                         0
+
+// Trace communication thread
+#if CMK_TRACE_ENABLED && CMK_SMP_TRACE_COMMTHREAD
+#define TRACE_THRESHOLD     0.00001
+#define CMI_MPI_TRACE_MOREDETAILED 0
+#undef CMI_MPI_TRACE_USEREVENTS
+#define CMI_MPI_TRACE_USEREVENTS 1
+#else
+#undef CMK_SMP_TRACE_COMMTHREAD
+#define CMK_SMP_TRACE_COMMTHREAD 0
+#endif
+
+#define CMK_TRACE_COMMOVERHEAD 0
+#if CMK_TRACE_ENABLED && CMK_TRACE_COMMOVERHEAD
+#undef CMI_MPI_TRACE_USEREVENTS
+#define CMI_MPI_TRACE_USEREVENTS 1
+#else
+#undef CMK_TRACE_COMMOVERHEAD
+#define CMK_TRACE_COMMOVERHEAD 0
+#endif
+
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && ! CMK_TRACE_IN_CHARM
+CpvStaticDeclare(double, projTraceStart);
+#define  START_EVENT()  CpvAccess(projTraceStart) = CmiWallTimer();
+#define  END_EVENT(x)   traceUserBracketEvent(x, CpvAccess(projTraceStart), CmiWallTimer());
+#define  EVENT_TIME()   CpvAccess(projTraceStart)
+#else
+#define  START_EVENT()
+#define  END_EVENT(x)
+#define  EVENT_TIME()   (0.0)
+#endif
+
+#if USE_LRTS_MEMPOOL
+
+#define oneMB (1024ll*1024)
+#define oneGB (1024ll*1024*1024)
+
+static CmiInt8 _mempool_size = 8*oneMB;
+static CmiInt8 _expand_mem =  4*oneMB;
+static CmiInt8 _mempool_size_limit = 0;
+
+static CmiInt8 _totalmem = 0.8*oneGB;
+
+#if LARGEPAGE
+static CmiInt8 BIG_MSG  =  16*oneMB;
+static CmiInt8 ONE_SEG  =  4*oneMB;
+#else
+static CmiInt8 BIG_MSG  =  4*oneMB;
+static CmiInt8 ONE_SEG  =  2*oneMB;
+#endif
+#if MULTI_THREAD_SEND
+static int BIG_MSG_PIPELINE = 1;
+#else
+static int BIG_MSG_PIPELINE = 4;
+#endif
+
+// dynamic flow control
+static CmiInt8 buffered_send_msg = 0;
+static CmiInt8 register_memory_size = 0;
+
+#if LARGEPAGE
+static CmiInt8  MAX_BUFF_SEND  =  100000*oneMB;
+static CmiInt8  MAX_REG_MEM    =  200000*oneMB;
+static CmiInt8 register_count = 0;
+#else
+#if CMK_SMP && COMM_THREAD_SEND 
+static CmiInt8  MAX_BUFF_SEND  =  100*oneMB;
+static CmiInt8  MAX_REG_MEM    =  200*oneMB;
+#else
+static CmiInt8  MAX_BUFF_SEND  =  16*oneMB;
+static CmiInt8  MAX_REG_MEM    =  25*oneMB;
+#endif
+
+
+#endif
+
+#endif     /* end USE_LRTS_MEMPOOL */
+
+#if MULTI_THREAD_SEND 
+#define     CMI_GNI_LOCK(x)       CmiLock(x);
+#define     CMI_GNI_TRYLOCK(x)       CmiTryLock(x)
+#define     CMI_GNI_UNLOCK(x)        CmiUnlock(x);
+#define     CMI_PCQUEUEPOP_LOCK(Q)   CmiLock((Q)->lock);
+#define     CMI_PCQUEUEPOP_UNLOCK(Q)    CmiUnlock((Q)->lock);
+#else
+#define     CMI_GNI_LOCK(x)
+#define     CMI_GNI_TRYLOCK(x)         (0)
+#define     CMI_GNI_UNLOCK(x)
+#define     CMI_PCQUEUEPOP_LOCK(Q)   
+#define     CMI_PCQUEUEPOP_UNLOCK(Q)
+#endif
+
+static int _tlbpagesize = 4096;
+
+//static int _smpd_count  = 0;
+
+static int   user_set_flag  = 0;
+
+static int _checkProgress = 1;             /* check deadlock */
+static int _detected_hang = 0;
+
+#define             SMSG_ATTR_SIZE      sizeof(gni_smsg_attr_t)
+
+// dynamic SMSG
+static int useDynamicSMSG = 0;               /* dynamic smsgs setup */
+
+static int avg_smsg_connection = 32;
+static int                 *smsg_connected_flag= 0;
+static gni_smsg_attr_t     **smsg_attr_vector_local;
+static gni_smsg_attr_t     **smsg_attr_vector_remote;
+static gni_ep_handle_t     ep_hndl_unbound;
+static gni_smsg_attr_t     send_smsg_attr;
+static gni_smsg_attr_t     recv_smsg_attr;
+
+typedef struct _dynamic_smsg_mailbox{
+   void     *mailbox_base;
+   int      size;
+   int      offset;
+   gni_mem_handle_t  mem_hndl;
+   struct      _dynamic_smsg_mailbox  *next;
+}dynamic_smsg_mailbox_t;
+
+static dynamic_smsg_mailbox_t  *mailbox_list;
+
+static CmiUInt8  smsg_send_count = 0,  last_smsg_send_count = 0;
+static CmiUInt8  smsg_recv_count = 0,  last_smsg_recv_count = 0;
+
+#if PRINT_SYH
+int         lrts_send_msg_id = 0;
+int         lrts_local_done_msg = 0;
+int         lrts_send_rdma_success = 0;
+#endif
+
+#include "machine.h"
+
+#include "pcqueue.h"
+
+#include "mempool.h"
+
+#if CMK_PERSISTENT_COMM
+#define PERSISTENT_GET_BASE 0 
+#if !PERSISTENT_GET_BASE
+#define CMK_PERSISTENT_COMM_PUT 1 
+#endif
+#include "machine-persistent.h"
+#define  POST_HIGHPRIORITY_RDMA    STATS_SENDRDMAMSG_TIME(SendRdmaMsg(sendHighPriorBuf));
+#else  
+#define  POST_HIGHPRIORITY_RDMA   
+#endif
+
+#if REMOTE_EVENT && (CMK_USE_OOB || CMK_PERSISTENT_COMM_PUT) 
+#define  PUMP_REMOTE_HIGHPRIORITY    STATS_PUMPREMOTETRANSACTIONS_TIME(PumpRemoteTransactions(highpriority_rx_cqh) );
+#else
+#define  PUMP_REMOTE_HIGHPRIORITY
+#endif
+
+//#define  USE_ONESIDED 1
+#ifdef USE_ONESIDED
+// the onesided implementation is incorrect, since there is no place to restore omdh
+#include "onesided.h"
+onesided_hnd_t   onesided_hnd;
+onesided_md_t    omdh;
+#define MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh)  omdh. onesided_mem_register(handler, (uint64_t)msg, size, 0, myomdh) 
+
+#define MEMORY_DEREGISTER(handler, nic_hndl, mem_hndl, myomdh) onesided_mem_deregister(handler, myomdh)
+
+#else
+uint8_t   onesided_hnd, omdh;
+
+#if REMOTE_EVENT || CQWRITE 
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdhh, cqh, status) \
+    if(register_memory_size+size>= MAX_REG_MEM) { \
+        status = GNI_RC_ERROR_NOMEM;} \
+    else {status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, cqh,  GNI_MEM_READWRITE, -1, mem_hndl); \
+        if(status == GNI_RC_SUCCESS) register_memory_size += size; }  
+#else
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, cqh, status ) \
+        if (register_memory_size + size >= MAX_REG_MEM) { \
+            status = GNI_RC_ERROR_NOMEM; \
+        } else { status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, NULL,  GNI_MEM_READWRITE, -1, mem_hndl); \
+            if(status == GNI_RC_SUCCESS) register_memory_size += size; } 
+#endif
+
+#define  MEMORY_DEREGISTER(handler, nic_hndl, mem_hndl, myomdh, size)  \
+    do { if (GNI_MemDeregister(nic_hndl, (mem_hndl) ) == GNI_RC_SUCCESS) \
+             register_memory_size -= size; \
+         else CmiAbort("MEM_DEregister");  \
+    } while (0)
+#endif
+
+#define   GetMempoolBlockPtr(x)   MEMPOOL_GetBlockPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMempoolPtr(x)        MEMPOOL_GetMempoolPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMempoolsize(x)       MEMPOOL_GetSize(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMemHndl(x)           MEMPOOL_GetMemHndl(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   IncreaseMsgInRecv(x)    MEMPOOL_IncMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   DecreaseMsgInRecv(x)    MEMPOOL_DecMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   IncreaseMsgInSend(x)    MEMPOOL_IncMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   DecreaseMsgInSend(x)    MEMPOOL_DecMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   NoMsgInSend(x)          MEMPOOL_GetMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) == 0
+#define   NoMsgInRecv(x)          MEMPOOL_GetMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) == 0
+#define   NoMsgInFlight(x)        (NoMsgInSend(x) && NoMsgInRecv(x))
+#define   IsMemHndlZero(x)        ((x).qword1 == 0 && (x).qword2 == 0)
+#define   SetMemHndlZero(x)       do {(x).qword1 = 0;(x).qword2 = 0;} while (0)
+#define   NotRegistered(x)        IsMemHndlZero(GetMemHndl(x))
+
+#define   GetMemHndlFromBlockHeader(x) MEMPOOL_GetBlockMemHndl(x)
+#define   GetSizeFromBlockHeader(x)    MEMPOOL_GetBlockSize(x)
+
+#define CmiGetMsgSize(m)     ((CmiMsgHeaderExt*)m)->size
+#define CmiSetMsgSize(m,s)   ((((CmiMsgHeaderExt*)m)->size)=(s))
+#define CmiGetMsgSeq(m)      ((CmiMsgHeaderExt*)m)->seq
+#define CmiSetMsgSeq(m, s)   ((((CmiMsgHeaderExt*)m)->seq) = (s))
+
+#define ALIGNBUF                64
+
+/* =======Beginning of Definitions of Performance-Specific Macros =======*/
+/* If SMSG is not used */
+
+#define FMA_PER_CORE  1024
+#define FMA_BUFFER_SIZE 1024
+
+/* If SMSG is used */
+static int  SMSG_MAX_MSG = 1024;
+#define SMSG_MAX_CREDIT    72
+
+#define MSGQ_MAXSIZE       2048
+
+/* large message transfer with FMA or BTE */
+#if ! REMOTE_EVENT
+#define LRTS_GNI_RDMA_THRESHOLD  1024 
+#else
+   /* remote events only work with RDMA */
+#define LRTS_GNI_RDMA_THRESHOLD  0 
+#endif
+
+#if CMK_SMP
+static int  REMOTE_QUEUE_ENTRIES=163840; 
+static int LOCAL_QUEUE_ENTRIES=163840; 
+#else
+static int  REMOTE_QUEUE_ENTRIES=20480;
+static int LOCAL_QUEUE_ENTRIES=20480; 
+#endif
+
+#define BIG_MSG_TAG             0x26
+#define PUT_DONE_TAG            0x28
+#define DIRECT_PUT_DONE_TAG     0x29
+#define ACK_TAG                 0x30
+/* SMSG is data message */
+#define SMALL_DATA_TAG          0x31
+/* SMSG is a control message to initialize a BTE */
+#define LMSG_INIT_TAG           0x33 
+#define LMSG_PERSISTENT_INIT_TAG           0x34 
+#define LMSG_OOB_INIT_TAG       0x35
+
+#define DEBUG
+#ifdef GNI_RC_CHECK
+#undef GNI_RC_CHECK
+#endif
+#ifdef DEBUG
+#define GNI_RC_CHECK(msg,rc) do { if(rc != GNI_RC_SUCCESS) {           printf("[%d] %s; err=%s\n",CmiMyPe(),msg,gni_err_str[rc]); fflush(stdout); CmiAbort("GNI_RC_CHECK"); } } while(0)
+#else
+#define GNI_RC_CHECK(msg,rc)
+#endif
+
+#define ALIGN64(x)       (size_t)((~63)&((x)+63))
+//#define ALIGN4(x)        (size_t)((~3)&((x)+3)) 
+#define ALIGNHUGEPAGE(x)   (size_t)((~(_tlbpagesize-1))&((x)+_tlbpagesize-1))
+
+static int useStaticMSGQ = 0;
+static int useStaticFMA = 0;
+static int mysize, myrank;
+static gni_nic_handle_t   nic_hndl;
+
+typedef struct {
+    gni_mem_handle_t mdh;
+    uint64_t addr;
+} mdh_addr_t ;
+// this is related to dynamic SMSG
+
+typedef struct mdh_addr_list{
+    gni_mem_handle_t mdh;
+   void *addr;
+    struct mdh_addr_list *next;
+}mdh_addr_list_t;
+
+static unsigned int         smsg_memlen;
+gni_smsg_attr_t    **smsg_local_attr_vec = 0;
+mdh_addr_t          setup_mem;
+mdh_addr_t          *smsg_connection_vec = 0;
+gni_mem_handle_t    smsg_connection_memhndl;
+static int          smsg_expand_slots = 10;
+static int          smsg_available_slot = 0;
+static void         *smsg_mailbox_mempool = 0;
+mdh_addr_list_t     *smsg_dynamic_list = 0;
+
+static void             *smsg_mailbox_base;
+gni_msgq_attr_t         msgq_attrs;
+gni_msgq_handle_t       msgq_handle;
+gni_msgq_ep_attr_t      msgq_ep_attrs;
+gni_msgq_ep_attr_t      msgq_ep_attrs_size;
+
+/* =====Beginning of Declarations of Machine Specific Variables===== */
+static int cookie;
+static int modes = 0;
+static gni_cq_handle_t       smsg_rx_cqh = NULL;      // smsg send
+static gni_cq_handle_t       default_tx_cqh = NULL;   // bind to endpoint
+static gni_cq_handle_t       rdma_tx_cqh = NULL;      // rdma - local event
+static gni_cq_handle_t       highprior_rdma_tx_cqh = NULL;      // rdma - local event
+static gni_cq_handle_t       rdma_rx_cqh = NULL;      // mempool - remote event
+static gni_cq_handle_t       highpriority_rx_cqh = NULL;      // mempool - remote event
+static gni_ep_handle_t       *ep_hndl_array;
+
+static CmiNodeLock           *ep_lock_array;
+static CmiNodeLock           default_tx_cq_lock; 
+static CmiNodeLock           rdma_tx_cq_lock; 
+static CmiNodeLock           global_gni_lock; 
+static CmiNodeLock           rx_cq_lock;
+static CmiNodeLock           smsg_mailbox_lock;
+static CmiNodeLock           smsg_rx_cq_lock;
+static CmiNodeLock           *mempool_lock;
+//#define     CMK_WITH_STATS      1
+typedef struct msg_list
+{
+    uint32_t destNode;
+    uint32_t size;
+    void *msg;
+    uint8_t tag;
+#if CMK_WITH_STATS
+    double  creation_time;
+#endif
+}MSG_LIST;
+
+
+typedef struct control_msg
+{
+    uint64_t            source_addr;    /* address from the start of buffer  */
+    uint64_t            dest_addr;      /* address from the start of buffer */
+    int                 total_length;   /* total length */
+    int                 length;         /* length of this packet */
+#if REMOTE_EVENT
+    int                 ack_index;      /* index from integer to address */
+#endif
+    int     seq_id;         // big-message sequence number; 0 means a single (non-segmented) message
+    gni_mem_handle_t    source_mem_hndl;
+#if PERSISTENT_GET_BASE
+    gni_mem_handle_t    dest_mem_hndl;
+#endif
+    struct control_msg  *next;
+} CONTROL_MSG;
+
+#define CONTROL_MSG_SIZE       (sizeof(CONTROL_MSG)-sizeof(void*))
+
+typedef struct ack_msg
+{
+    uint64_t            source_addr;    /* address from the start of buffer  */
+#if ! USE_LRTS_MEMPOOL
+    gni_mem_handle_t    source_mem_hndl;
+    int                 length;          /* total length */
+#endif
+    struct ack_msg     *next;
+} ACK_MSG;
+
+#define ACK_MSG_SIZE       (sizeof(ACK_MSG)-sizeof(void*))
+
+#if CMK_DIRECT
+typedef struct{
+    uint64_t    handler_addr;
+}CMK_DIRECT_HEADER;
+
+typedef struct {
+    char core[CmiMsgHeaderSizeBytes];
+    uint64_t handler;
+}cmidirectMsg;
+
+//SYH
+CpvDeclare(int, CmiHandleDirectIdx);
+void CmiHandleDirectMsg(cmidirectMsg* msg)
+{
+
+    CmiDirectUserHandle *_handle= (CmiDirectUserHandle*)(msg->handler);
+   (*(_handle->callbackFnPtr))(_handle->callbackData);
+   CmiFree(msg);
+}
+
+void CmiDirectInit()
+{
+    CpvInitialize(int,  CmiHandleDirectIdx);
+    CpvAccess(CmiHandleDirectIdx) = CmiRegisterHandler( (CmiHandler) CmiHandleDirectMsg);
+}
+
+#endif
+typedef struct  rmda_msg
+{
+    int                   destNode;
+#if REMOTE_EVENT
+    int                   ack_index;
+#endif
+    gni_post_descriptor_t *pd;
+}RDMA_REQUEST;
+
+
+#define SMP_LOCKS                       1
+#define ONE_SEND_QUEUE                  0
+typedef PCQueue BufferList;
+typedef struct  msg_list_index
+{
+    PCQueue       sendSmsgBuf;
+#if  SMP_LOCKS
+    CmiNodeLock   lock;
+    int           pushed;
+    int           destpe;
+#endif
+} MSG_LIST_INDEX;
+char                *destpe_avail;
+PCQueue sendRdmaBuf;
+PCQueue sendHighPriorBuf;
+// buffered send queue
+#if ! ONE_SEND_QUEUE
+typedef struct smsg_queue
+{
+    MSG_LIST_INDEX   *smsg_msglist_index;
+    int               smsg_head_index;
+#if  SMP_LOCKS
+    PCQueue     nonEmptyQueues;
+#endif
+} SMSG_QUEUE;
+#else
+typedef struct smsg_queue
+{
+    PCQueue       sendMsgBuf;
+}  SMSG_QUEUE;
+#endif
+
+SMSG_QUEUE                  smsg_queue;
+#if CMK_USE_OOB
+SMSG_QUEUE                  smsg_oob_queue;
+#define SEND_OOB_SMSG(x)            SendBufferMsg(&x, NULL);
+#define PUMP_LOCAL_HIGHPRIORITY    STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(PumpLocalTransactions(highprior_rdma_tx_cqh,  rdma_tx_cq_lock)); 
+#else
+#define SEND_OOB_SMSG(x)            
+#define PUMP_LOCAL_HIGHPRIORITY     
+#endif
+
+#define FreeMsgList(d)   free(d);
+#define MallocMsgList(d)  d = ((MSG_LIST*)malloc(sizeof(MSG_LIST)));
+
+#define FreeControlMsg(d)      free(d);
+#define MallocControlMsg(d)    d = ((CONTROL_MSG*)malloc(sizeof(CONTROL_MSG)));
+
+#define FreeAckMsg(d)      free(d);
+#define MallocAckMsg(d)    d = ((ACK_MSG*)malloc(sizeof(ACK_MSG)));
+
+#define FreeRdmaRequest(d)       free(d);
+#define MallocRdmaRequest(d)     d = ((RDMA_REQUEST*)malloc(sizeof(RDMA_REQUEST)));   
+/* reuse gni_post_descriptor_t */
+static gni_post_descriptor_t *post_freelist=0;
+
+#define FreePostDesc(d)     free(d);
+#define MallocPostDesc(d)   d = ((gni_post_descriptor_t*)malloc(sizeof(gni_post_descriptor_t))); _MEMCHECK(d);
+
+
+/* LrtsSend was called but the message could not be sent by SmsgSend because the mailbox is full or there is no credit */
+static int      buffered_smsg_counter = 0;
+
+/* SmsgSend returned success but the sent message has not yet been confirmed by the remote side */
+static MSG_LIST *buffered_fma_head = 0;
+static MSG_LIST *buffered_fma_tail = 0;
+
+/* functions  */
+#define IsFree(a,ind)  !( a& (1<<(ind) ))
+#define SET_BITS(a,ind) a = ( a | (1<<(ind )) )
+#define Reset(a,ind) a = ( a & (~(1<<(ind))) )
+
+CpvDeclare(mempool_type*, mempool);
+
+#if CMK_PERSISTENT_COMM_PUT
+CpvDeclare(mempool_type*, persistent_mempool);
+#endif
+
+#if REMOTE_EVENT
+/* ack pool for remote events */
+
+static int  SHIFT   =           18;
+#define INDEX_MASK              ((1<<(32-SHIFT-1)) - 1)
+#define RANK_MASK               ((1<<SHIFT) - 1)
+#define ACK_EVENT(idx)          ((((idx) & INDEX_MASK)<<SHIFT) | myrank)
+
+#define GET_TYPE(evt)           (((evt) >> 31) & 1)
+#define GET_RANK(evt)           ((evt) & RANK_MASK)
+#define GET_INDEX(evt)          (((evt) >> SHIFT) & INDEX_MASK)
+
+#define PERSIST_EVENT(idx)      ( (1<<31) | (((idx) & INDEX_MASK)<<SHIFT) | myrank)
+#define DIRECT_EVENT(idx)      ( (1<<31) | (((idx) & INDEX_MASK)<<SHIFT) | myrank)
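+
+/* Illustrative example (added comment, not original code): with SHIFT == 18,
+ * the low 18 bits of a remote event value carry the sender's rank, the next
+ * 13 bits carry the pool index, and bit 31 marks persistent/direct events.
+ * On rank 5 with pool index 3:
+ *     ACK_EVENT(3)              == (3 << 18) | 5
+ *     GET_RANK(ACK_EVENT(3))    == 5
+ *     GET_INDEX(ACK_EVENT(3))   == 3
+ *     GET_TYPE(ACK_EVENT(3))    == 0    (PERSIST_EVENT/DIRECT_EVENT yield 1)
+ */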
+
+#if CMK_SMP
+#define INIT_SIZE                4096
+#else
+#define INIT_SIZE                1024
+#endif
+
+struct IndexStruct {
+void *addr;
+int next;
+int type;     // 1: ACK   2: Persistent
+};
+
+typedef struct IndexPool {
+    struct IndexStruct   *indexes;
+    int                   size;
+    int                   freehead;
+    CmiNodeLock           lock;
+} IndexPool;
+
+static IndexPool  ackPool;
+#if CMK_PERSISTENT_COMM_PUT
+static IndexPool  persistPool;
+#else
+#define persistPool ackPool 
+#endif
+
+#define  GetIndexType(pool, s)             (pool.indexes[s].type)
+#define  GetIndexAddress(pool, s)          (pool.indexes[s].addr)
+
+static void IndexPool_init(IndexPool *pool)
+{
+    int i;
+    if ((1<<SHIFT) < mysize) 
+        CmiAbort("Charm++ Error: Remote event's rank field overflow.");
+    pool->size = INIT_SIZE;
+    if ( (1<<(31-SHIFT)) < pool->size) CmiAbort("IndexPool_init: pool initial size is too big.");
+    pool->indexes = (struct IndexStruct *)malloc(pool->size*sizeof(struct IndexStruct));
+    for (i=0; i<pool->size-1; i++) {
+        pool->indexes[i].next = i+1;
+        pool->indexes[i].type = 0;
+    }
+    pool->indexes[i].next = -1;
+    pool->indexes[i].type = 0;
+    pool->freehead = 0;
+#if MULTI_THREAD_SEND || CMK_PERSISTENT_COMM_PUT
+    pool->lock  = CmiCreateLock();
+#else
+    pool->lock  = 0;
+#endif
+}
+
+static
+inline int IndexPool_getslot(IndexPool *pool, void *addr, int type)
+{
+    int s, i;
+#if MULTI_THREAD_SEND  
+    CmiLock(pool->lock);
+#endif
+    CmiAssert(type == 1 || type == 2);
+    s = pool->freehead;
+    if (s == -1) {
+        int newsize = pool->size * 2;
+        //printf("[%d] IndexPool_getslot %p expand to: %d\n", myrank, pool, newsize);
+        if (newsize > (1<<(32-SHIFT-1))) {
+            static int warned = 0;
+            if (!warned)
+              printf("[%d] Warning: IndexPool_getslot %p overflow when expanding to: %d\n", myrank, pool, newsize);
+            warned = 1;
+            return -1;
+            CmiAbort("IndexPool for remote events overflows, try compile Charm++ with remote event disabled.");
+        }
+        struct IndexStruct *old_ackpool = pool->indexes;
+        pool->indexes = (struct IndexStruct *)malloc(newsize*sizeof(struct IndexStruct));
+        memcpy(pool->indexes, old_ackpool, pool->size*sizeof(struct IndexStruct));
+        for (i=pool->size; i<newsize-1; i++) {
+            pool->indexes[i].next = i+1;
+            pool->indexes[i].type = 0;
+        }
+        pool->indexes[i].next = -1;
+        pool->indexes[i].type = 0;
+        pool->freehead = pool->size;
+        s = pool->size;
+        pool->size = newsize;
+        free(old_ackpool);
+    }
+    pool->freehead = pool->indexes[s].next;
+    pool->indexes[s].addr = addr;
+    CmiAssert(pool->indexes[s].type == 0);
+    pool->indexes[s].type = type;
+#if MULTI_THREAD_SEND
+    CmiUnlock(pool->lock);
+#endif
+    return s;
+}
+
+static
+inline  void IndexPool_freeslot(IndexPool *pool, int s)
+{
+    CmiAssert(s>=0 && s<pool->size);
+#if MULTI_THREAD_SEND
+    CmiLock(pool->lock);
+#endif
+    pool->indexes[s].next = pool->freehead;
+    pool->indexes[s].type = 0;
+    pool->freehead = s;
+#if MULTI_THREAD_SEND
+    CmiUnlock(pool->lock);
+#endif
+}
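+
+/* Illustrative usage sketch (added comment; mirrors the ackPool use in
+ * send_smsg_message later in this file and the persistPool use in
+ * machine-persistent.c):
+ *     int idx = IndexPool_getslot(&ackPool, addr, 1);   // reserve a slot, type 1 = ACK
+ *     ... ACK_EVENT(idx) travels in the remote completion event ...
+ *     void *a = GetIndexAddress(ackPool, idx);          // look the address up on completion
+ *     IndexPool_freeslot(&ackPool, idx);                // release the slot when done
+ */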
+
+
+#endif
+
+/* =====Beginning of Definitions of Message-Corruption Related Macros=====*/
+#define CMI_MAGIC(msg)                   ((CmiMsgHeaderBasic *)msg)->magic
+#define CHARM_MAGIC_NUMBER               126
+
+#if CMK_ERROR_CHECKING
+extern unsigned char computeCheckSum(unsigned char *data, int len);
+static int checksum_flag = 0;
+#define CMI_SET_CHECKSUM(msg, len)      \
+        if (checksum_flag)  {   \
+          ((CmiMsgHeaderBasic *)msg)->cksum = 0;        \
+          ((CmiMsgHeaderBasic *)msg)->cksum = computeCheckSum((unsigned char*)msg, len);        \
+        }
+#define CMI_CHECK_CHECKSUM(msg, len)    \
+        if (checksum_flag)      \
+          if (computeCheckSum((unsigned char*)msg, len) != 0)   \
+            CmiAbort("Fatal error: checksum doesn't agree!\n");
+#else
+#define CMI_SET_CHECKSUM(msg, len)
+#define CMI_CHECK_CHECKSUM(msg, len)
+#endif
+/* =====End of Definitions of Message-Corruption Related Macros=====*/
+
+static int print_stats = 0;
+static int stats_off = 0;
+void CmiTurnOnStats()
+{
+    stats_off = 0;
+    //CmiPrintf("[%d][%d:%d]+++++++++++ turning on stats \n", CmiMyNode(), CmiMyPe(), CmiMyRank());
+}
+
+void CmiTurnOffStats()
+{
+    stats_off = 1;
+}
+
+#define IS_PUT(type)    (type == GNI_POST_FMA_PUT || type == GNI_POST_RDMA_PUT)
+
+#if CMK_WITH_STATS
+FILE *counterLog = NULL;
+typedef struct comm_thread_stats
+{
+    uint64_t  smsg_data_count;
+    uint64_t  lmsg_init_count;
+    uint64_t  ack_count;
+    uint64_t  big_msg_ack_count;
+    uint64_t  smsg_count;
+    uint64_t  direct_put_done_count;
+    uint64_t  put_done_count;
+    // number of times SmsgSend was called
+    uint64_t  try_smsg_data_count;
+    uint64_t  try_lmsg_init_count;
+    uint64_t  try_ack_count;
+    uint64_t  try_big_msg_ack_count;
+    uint64_t  try_direct_put_done_count;
+    uint64_t  try_put_done_count;
+    uint64_t  try_smsg_count;
+    
+    double    max_time_in_send_buffered_smsg;
+    double    all_time_in_send_buffered_smsg;
+
+    uint64_t  rdma_get_count, rdma_put_count;
+    uint64_t  try_rdma_get_count, try_rdma_put_count;
+    double    max_time_from_control_to_rdma_init;
+    double    all_time_from_control_to_rdma_init;
+
+    double    max_time_from_rdma_init_to_rdma_done;
+    double    all_time_from_rdma_init_to_rdma_done;
+
+    int      count_in_PumpNetwork;
+    double   time_in_PumpNetwork;
+    double   max_time_in_PumpNetwork;
+    int      count_in_SendBufferMsg_smsg;
+    double   time_in_SendBufferMsg_smsg;
+    double   max_time_in_SendBufferMsg_smsg;
+    int      count_in_SendRdmaMsg;
+    double   time_in_SendRdmaMsg;
+    double   max_time_in_SendRdmaMsg;
+    int      count_in_PumpRemoteTransactions;
+    double   time_in_PumpRemoteTransactions;
+    double   max_time_in_PumpRemoteTransactions;
+    int      count_in_PumpLocalTransactions_rdma;
+    double   time_in_PumpLocalTransactions_rdma;
+    double   max_time_in_PumpLocalTransactions_rdma;
+    int      count_in_PumpDatagramConnection;
+    double   time_in_PumpDatagramConnection;
+    double   max_time_in_PumpDatagramConnection;
+} Comm_Thread_Stats;
+
+static Comm_Thread_Stats   comm_stats;
+
+static char *counters_dirname = "counters";
+
+static void init_comm_stats()
+{
+  memset(&comm_stats, 0, sizeof(Comm_Thread_Stats));
+  if (print_stats){
+      char ln[200];
+      int code = mkdir(counters_dirname, 00777); 
+      sprintf(ln,"%s/statistics.%d.%d", counters_dirname, mysize, myrank);
+      counterLog=fopen(ln,"w");
+      if (counterLog == NULL) CmiAbort("Counter files open failed");
+  }
+}
+
+#define SMSG_CREATION( x ) if(print_stats) { x->creation_time = CmiWallTimer(); }
+
+#define SMSG_SENT_DONE(creation_time, tag)  \
+        if (print_stats && !stats_off) {   if( tag == SMALL_DATA_TAG) comm_stats.smsg_data_count++;  \
+            else  if( tag == LMSG_INIT_TAG || tag == LMSG_OOB_INIT_TAG) comm_stats.lmsg_init_count++;  \
+            else  if( tag == ACK_TAG) comm_stats.ack_count++;  \
+            else  if( tag == BIG_MSG_TAG) comm_stats.big_msg_ack_count++;  \
+            else  if( tag == PUT_DONE_TAG ) comm_stats.put_done_count++;  \
+            else  if( tag == DIRECT_PUT_DONE_TAG ) comm_stats.direct_put_done_count++;  \
+            comm_stats.smsg_count++; \
+            double inbuff_time = CmiWallTimer() - creation_time;   \
+            if(inbuff_time > comm_stats.max_time_in_send_buffered_smsg) comm_stats.max_time_in_send_buffered_smsg= inbuff_time; \
+            comm_stats.all_time_in_send_buffered_smsg += inbuff_time;  \
+        }
+
+#define SMSG_TRY_SEND(tag)  \
+        if (print_stats && !stats_off){   if( tag == SMALL_DATA_TAG) comm_stats.try_smsg_data_count++;  \
+            else  if( tag == LMSG_INIT_TAG || tag == LMSG_OOB_INIT_TAG) comm_stats.try_lmsg_init_count++;  \
+            else  if( tag == ACK_TAG) comm_stats.try_ack_count++;  \
+            else  if( tag == BIG_MSG_TAG) comm_stats.try_big_msg_ack_count++;  \
+            else  if( tag == PUT_DONE_TAG ) comm_stats.try_put_done_count++;  \
+            else  if( tag == DIRECT_PUT_DONE_TAG ) comm_stats.try_direct_put_done_count++;  \
+            comm_stats.try_smsg_count++; \
+        }
+
+#define  RDMA_TRY_SEND(type)        if (print_stats && !stats_off) {IS_PUT(type)?comm_stats.try_rdma_put_count++:comm_stats.try_rdma_get_count++;}
+
+#define  RDMA_TRANS_DONE(x)      \
+         if (print_stats && !stats_off) {  double rdma_trans_time = CmiWallTimer() - x ; \
+             if(rdma_trans_time > comm_stats.max_time_from_rdma_init_to_rdma_done) comm_stats.max_time_from_rdma_init_to_rdma_done = rdma_trans_time; \
+             comm_stats.all_time_from_rdma_init_to_rdma_done += rdma_trans_time; \
+         }
+
+#define  RDMA_TRANS_INIT(type, x)      \
+         if (print_stats && !stats_off) {   IS_PUT(type)?comm_stats.rdma_put_count++:comm_stats.rdma_get_count++;  \
+             double rdma_trans_time = CmiWallTimer() - x ; \
+             if(rdma_trans_time > comm_stats.max_time_from_control_to_rdma_init) comm_stats.max_time_from_control_to_rdma_init = rdma_trans_time; \
+             comm_stats.all_time_from_control_to_rdma_init += rdma_trans_time; \
+         }
+
+#define STATS_PUMPNETWORK_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpNetwork++;        \
+          comm_stats.time_in_PumpNetwork += t;   \
+          if (t>comm_stats.max_time_in_PumpNetwork)      \
+              comm_stats.max_time_in_PumpNetwork = t;    \
+        }
+
+#define STATS_PUMPREMOTETRANSACTIONS_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpRemoteTransactions ++;        \
+          comm_stats.time_in_PumpRemoteTransactions += t;   \
+          if (t>comm_stats.max_time_in_PumpRemoteTransactions)      \
+              comm_stats.max_time_in_PumpRemoteTransactions = t;    \
+        }
+
+#define STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpLocalTransactions_rdma ++;        \
+          comm_stats.time_in_PumpLocalTransactions_rdma += t;   \
+          if (t>comm_stats.max_time_in_PumpLocalTransactions_rdma)      \
+              comm_stats.max_time_in_PumpLocalTransactions_rdma = t;    \
+        }
+
+#define STATS_SEND_SMSGS_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_SendBufferMsg_smsg ++;        \
+          comm_stats.time_in_SendBufferMsg_smsg += t;   \
+          if (t>comm_stats.max_time_in_SendBufferMsg_smsg)      \
+              comm_stats.max_time_in_SendBufferMsg_smsg = t;    \
+        }
+
+#define STATS_SENDRDMAMSG_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_SendRdmaMsg ++;        \
+          comm_stats.time_in_SendRdmaMsg += t;   \
+          if (t>comm_stats.max_time_in_SendRdmaMsg)      \
+              comm_stats.max_time_in_SendRdmaMsg = t;    \
+        }
+
+#define STATS_PUMPDATAGRAMCONNECTION_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpDatagramConnection ++;        \
+          comm_stats.time_in_PumpDatagramConnection += t;   \
+          if (t>comm_stats.max_time_in_PumpDatagramConnection)      \
+              comm_stats.max_time_in_PumpDatagramConnection = t;    \
+        }
+
+static void print_comm_stats()
+{
+    fprintf(counterLog, "Node[%d] SMSG time in buffer\t[total:%f\tmax:%f\tAverage:%f](milisecond)\n", myrank, 1000.0*comm_stats.all_time_in_send_buffered_smsg, 1000.0*comm_stats.max_time_in_send_buffered_smsg, 1000.0*comm_stats.all_time_in_send_buffered_smsg/comm_stats.smsg_count);
+    fprintf(counterLog, "Node[%d] Smsg  Msgs  \t[Total:%lld\t Data:%lld\t Lmsg_Init:%lld\t ACK:%lld\t BIG_MSG_ACK:%lld Direct_put_done:%lld\t Persistent_put_done:%lld]\n", myrank, 
+            comm_stats.smsg_count, comm_stats.smsg_data_count, comm_stats.lmsg_init_count, 
+            comm_stats.ack_count, comm_stats.big_msg_ack_count, comm_stats.direct_put_done_count, comm_stats.put_done_count);
+    
+    fprintf(counterLog, "Node[%d] SmsgSendCalls\t[Total:%lld\t Data:%lld\t Lmsg_Init:%lld\t ACK:%lld\t BIG_MSG_ACK:%lld Direct_put_done:%lld\t Persistent_put_done:%lld]\n\n", myrank, 
+            comm_stats.try_smsg_count, comm_stats.try_smsg_data_count, comm_stats.try_lmsg_init_count, 
+            comm_stats.try_ack_count, comm_stats.try_big_msg_ack_count, comm_stats.try_direct_put_done_count, comm_stats.try_put_done_count);
+
+    fprintf(counterLog, "Node[%d] Rdma Transaction [count (GET/PUT):%lld %lld\t calls (GET/PUT):%lld %lld]\n", myrank, comm_stats.rdma_get_count, comm_stats.rdma_put_count, comm_stats.try_rdma_get_count, comm_stats.try_rdma_put_count);
+    fprintf(counterLog, "Node[%d] Rdma time from control arrives to rdma init [Total:%f\tMAX:%f\t Average:%f](milisecond)\n", myrank, 1000.0*comm_stats.all_time_from_control_to_rdma_init, 1000.0*comm_stats.max_time_from_control_to_rdma_init, 1000.0*comm_stats.all_time_from_control_to_rdma_init/(comm_stats.rdma_get_count+comm_stats.rdma_put_count)); 
+    fprintf(counterLog, "Node[%d] Rdma time from init to rdma done [Total:%f\tMAX:%f\t Average:%f](milisecond)\n\n", myrank,1000.0*comm_stats.all_time_from_rdma_init_to_rdma_done, 1000.0*comm_stats.max_time_from_rdma_init_to_rdma_done, 1000.0*comm_stats.all_time_from_rdma_init_to_rdma_done/(comm_stats.rdma_get_count+comm_stats.rdma_put_count));
+
+
+    fprintf(counterLog, "                             count\ttotal(s)\tmax(s)\taverage(us)\n");
+    fprintf(counterLog, "PumpNetworkSmsg:              %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpNetwork, comm_stats.time_in_PumpNetwork, comm_stats.max_time_in_PumpNetwork, comm_stats.time_in_PumpNetwork*1e6/comm_stats.count_in_PumpNetwork);
+    fprintf(counterLog, "PumpRemoteTransactions:       %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpRemoteTransactions, comm_stats.time_in_PumpRemoteTransactions, comm_stats.max_time_in_PumpRemoteTransactions, comm_stats.time_in_PumpRemoteTransactions*1e6/comm_stats.count_in_PumpRemoteTransactions);
+    fprintf(counterLog, "PumpLocalTransactions(RDMA):  %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpLocalTransactions_rdma, comm_stats.time_in_PumpLocalTransactions_rdma, comm_stats.max_time_in_PumpLocalTransactions_rdma, comm_stats.time_in_PumpLocalTransactions_rdma*1e6/comm_stats.count_in_PumpLocalTransactions_rdma);
+    fprintf(counterLog, "SendBufferMsg (SMSG):         %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_SendBufferMsg_smsg, comm_stats.time_in_SendBufferMsg_smsg, comm_stats.max_time_in_SendBufferMsg_smsg, comm_stats.time_in_SendBufferMsg_smsg*1e6/comm_stats.count_in_SendBufferMsg_smsg);
+    fprintf(counterLog, "SendRdmaMsg:                  %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_SendRdmaMsg, comm_stats.time_in_SendRdmaMsg, comm_stats.max_time_in_SendRdmaMsg, comm_stats.time_in_SendRdmaMsg*1e6/comm_stats.count_in_SendRdmaMsg);
+    if (useDynamicSMSG)
+    fprintf(counterLog, "PumpDatagramConnection:                  %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_PumpDatagramConnection, comm_stats.time_in_PumpDatagramConnection, comm_stats.max_time_in_PumpDatagramConnection, comm_stats.time_in_PumpDatagramConnection*1e6/comm_stats.count_in_PumpDatagramConnection);
+
+    fclose(counterLog);
+}
+
+#else
+#define STATS_PUMPNETWORK_TIME(x)                  x
+#define STATS_SEND_SMSGS_TIME(x)                   x
+#define STATS_PUMPREMOTETRANSACTIONS_TIME(x)       x
+#define STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(x)   x
+#define STATS_SENDRDMAMSG_TIME(x)                  x
+#define STATS_PUMPDATAGRAMCONNECTION_TIME(x)       x
+#endif
+
+static void
+allgather(void *in,void *out, int len)
+{
+    static int *ivec_ptr=NULL,already_called=0,job_size=0;
+    int i,rc;
+    int my_rank;
+    char *tmp_buf,*out_ptr;
+
+    if(!already_called) {
+
+        rc = PMI_Get_size(&job_size);
+        CmiAssert(rc == PMI_SUCCESS);
+        rc = PMI_Get_rank(&my_rank);
+        CmiAssert(rc == PMI_SUCCESS);
+
+        ivec_ptr = (int *)malloc(sizeof(int) * job_size);
+        CmiAssert(ivec_ptr != NULL);
+
+        rc = PMI_Allgather(&my_rank,ivec_ptr,sizeof(int));
+        CmiAssert(rc == PMI_SUCCESS);
+
+        already_called = 1;
+
+    }
+
+    tmp_buf = (char *)malloc(job_size * len);
+    CmiAssert(tmp_buf);
+
+    rc = PMI_Allgather(in,tmp_buf,len);
+    CmiAssert(rc == PMI_SUCCESS);
+
+    out_ptr = out;
+
+    for(i=0;i<job_size;i++) {
+
+        memcpy(&out_ptr[len * ivec_ptr[i]],&tmp_buf[i * len],len);
+
+    }
+
+    free(tmp_buf);
+}
+
+static void
+allgather_2(void *in,void *out, int len)
+{
+    //PMI_Allgather is out of order
+    int i,rc, extend_len;
+    int  rank_index;
+    char *out_ptr, *out_ref;
+    char *in2;
+
+    extend_len = sizeof(int) + len;
+    in2 = (char*)malloc(extend_len);
+
+    memcpy(in2, &myrank, sizeof(int));
+    memcpy(in2+sizeof(int), in, len);
+
+    out_ptr = (char*)malloc(mysize*extend_len);
+
+    rc = PMI_Allgather(in2, out_ptr, extend_len);
+    GNI_RC_CHECK("allgather", rc);
+
+    out_ref = out;
+
+    for(i=0;i<mysize;i++) {
+        //rank index 
+        memcpy(&rank_index, &(out_ptr[extend_len*i]), sizeof(int));
+        //copy to the rank index slot
+        memcpy(&out_ref[rank_index*len], &out_ptr[extend_len*i+sizeof(int)], len);
+    }
+
+    free(out_ptr);
+    free(in2);
+
+}
+
+static unsigned int get_gni_nic_address(int device_id)
+{
+    unsigned int address, cpu_id;
+    gni_return_t status;
+    int i, alps_dev_id=-1,alps_address=-1;
+    char *token, *p_ptr;
+
+    p_ptr = getenv("PMI_GNI_DEV_ID");
+    if (!p_ptr) {
+        status = GNI_CdmGetNicAddress(device_id, &address, &cpu_id);
+       
+        GNI_RC_CHECK("GNI_CdmGetNicAddress", status);
+    } else {
+        while ((token = strtok(p_ptr,":")) != NULL) {
+            alps_dev_id = atoi(token);
+            if (alps_dev_id == device_id) {
+                break;
+            }
+            p_ptr = NULL;
+        }
+        CmiAssert(alps_dev_id != -1);
+        p_ptr = getenv("PMI_GNI_LOC_ADDR");
+        CmiAssert(p_ptr != NULL);
+        i = 0;
+        while ((token = strtok(p_ptr,":")) != NULL) {
+            if (i == alps_dev_id) {
+                alps_address = atoi(token);
+                break;
+            }
+            p_ptr = NULL;
+            ++i;
+        }
+        CmiAssert(alps_address != -1);
+        address = alps_address;
+    }
+    return address;
+}
+
+static uint8_t get_ptag(void)
+{
+    char *p_ptr, *token;
+    uint8_t ptag;
+
+    p_ptr = getenv("PMI_GNI_PTAG");
+    CmiAssert(p_ptr != NULL);
+    token = strtok(p_ptr, ":");
+    ptag = (uint8_t)atoi(token);
+    return ptag;
+        
+}
+
+static uint32_t get_cookie(void)
+{
+    uint32_t cookie;
+    char *p_ptr, *token;
+
+    p_ptr = getenv("PMI_GNI_COOKIE");
+    CmiAssert(p_ptr != NULL);
+    token = strtok(p_ptr, ":");
+    cookie = (uint32_t)atoi(token);
+
+    return cookie;
+}
+
+#if LARGEPAGE
+
+/* directly mmap memory from hugetlbfs for large pages */
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <hugetlbfs.h>
+
+// size must be _tlbpagesize aligned
+void *my_get_huge_pages(size_t size)
+{
+    char filename[512];
+    int fd;
+    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+    void *ptr = NULL;
+
+    snprintf(filename, sizeof(filename), "%s/charm_mempool.%d.%d", hugetlbfs_find_path_for_size(_tlbpagesize), getpid(), rand());
+    fd = open(filename, O_RDWR | O_CREAT, mode);
+    if (fd == -1) {
+        CmiAbort("my_get_huge_pages: open filed");
+    }
+    ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (ptr == MAP_FAILED) ptr = NULL;
+//printf("[%d] my_get_huge_pages: %s %d %p\n", myrank, filename, size, ptr);
+    close(fd);
+    unlink(filename);
+    return ptr;
+}
+
+void my_free_huge_pages(void *ptr, int size)
+{
+//printf("[%d] my_free_huge_pages: %p %d\n", myrank, ptr, size);
+    int ret = munmap(ptr, size);
+    if (ret == -1) CmiAbort("munmap failed in my_free_huge_pages");
+}
+
+#endif
+
+/* =====Beginning of Definitions of Message-Corruption Related Macros=====*/
+/* TODO: add any that are related */
+/* =====End of Definitions of Message-Corruption Related Macros=====*/
+
+
+#include "machine-lrts.h"
+#include "machine-common-core.c"
+
+
+static int SendBufferMsg(SMSG_QUEUE *queue, SMSG_QUEUE *urgent_queue);
+static void SendRdmaMsg(PCQueue );
+static void PumpNetworkSmsg();
+static void PumpLocalTransactions(gni_cq_handle_t tx_cqh, CmiNodeLock cq_lock);
+#if CQWRITE
+static void PumpCqWriteTransactions();
+#endif
+#if REMOTE_EVENT
+static void PumpRemoteTransactions(gni_cq_handle_t);
+#endif
+
+#if MACHINE_DEBUG_LOG
+static CmiInt8 buffered_recv_msg = 0;
+int         lrts_smsg_success = 0;
+int         lrts_received_msg = 0;
+#endif
+
+static void sweep_mempool(mempool_type *mptr)
+{
+    int n = 0;
+    block_header *current = &(mptr->block_head);
+
+    printf("[n %d %d] sweep_mempool slot START.\n", myrank, n++);
+    while( current!= NULL) {
+        printf("[n %d %d] sweep_mempool slot %p size: %lld used: %d (%d %d) %lld %lld.\n", myrank, n++, current, current->size, 1<<current->used, current->msgs_in_send, current->msgs_in_recv, current->mem_hndl.qword1, current->mem_hndl.qword2);
+        current = current->block_next?(block_header *)((char*)mptr+current->block_next):NULL;
+    }
+    printf("[n %d] sweep_mempool slot END.\n", myrank);
+}
+
+inline
+static  gni_return_t deregisterMemory(mempool_type *mptr, block_header **from)
+{
+    block_header *current = *from;
+
+    //while(register_memory_size>= MAX_REG_MEM)
+    //{
+        while( current!= NULL && ((current->msgs_in_send+current->msgs_in_recv)>0 || IsMemHndlZero(current->mem_hndl) ))
+            current = current->block_next?(block_header *)((char*)mptr+current->block_next):NULL;
+
+        *from = current;
+        if(current == NULL) return GNI_RC_ERROR_RESOURCE;
+        MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(GetMemHndlFromBlockHeader(current)) , &omdh, GetSizeFromBlockHeader(current));
+        SetMemHndlZero(GetMemHndlFromBlockHeader(current));
+    //}
+    return GNI_RC_SUCCESS;
+}
+
+inline 
+static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, size_t size, gni_mem_handle_t  *memhndl, gni_cq_handle_t cqh )
+{
+    gni_return_t status = GNI_RC_SUCCESS;
+    //int size = GetMempoolsize(msg);
+    //void *blockaddr = GetMempoolBlockPtr(msg);
+    //gni_mem_handle_t  *memhndl =   &(GetMemHndl(msg));
+   
+    block_header *current = &(mptr->block_head);
+    while(register_memory_size>= MAX_REG_MEM)
+    {
+        status = deregisterMemory(mptr, &current);
+        if (status != GNI_RC_SUCCESS) break;
+    }
+    if(register_memory_size>= MAX_REG_MEM) return status;
+
+    MACHSTATE3(8, "mempool (%lld,%lld,%d) \n", buffered_send_msg, buffered_recv_msg, register_memory_size); 
+    while(1)
+    {
+        MEMORY_REGISTER(onesided_hnd, nic_hndl, blockaddr, size, memhndl, &omdh, cqh, status);
+        if(status == GNI_RC_SUCCESS)
+        {
+            break;
+        }
+        else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+        {
+            GNI_RC_CHECK("registerFromMempool", status);
+        }
+        else
+        {
+            status = deregisterMemory(mptr, &current);
+            if (status != GNI_RC_SUCCESS) break;
+        }
+    }; 
+    return status;
+}
+
+inline 
+static gni_return_t registerMemory(void *msg, size_t size, gni_mem_handle_t *t, gni_cq_handle_t cqh )
+{
+    static int rank = -1;
+    int i;
+    gni_return_t status;
+    mempool_type *mptr1 = CpvAccess(mempool);//mempool_type*)GetMempoolPtr(msg);
+    //mempool_type *mptr1 = (mempool_type*)GetMempoolPtr(msg);
+    mempool_type *mptr;
+
+    status = registerFromMempool(mptr1, msg, size, t, cqh);
+    if (status == GNI_RC_SUCCESS) return status;
+#if CMK_SMP 
+    for (i=0; i<CmiMyNodeSize()+1; i++) {
+      rank = (rank+1)%(CmiMyNodeSize()+1);
+      mptr = CpvAccessOther(mempool, rank);
+      if (mptr == mptr1) continue;
+      status = registerFromMempool(mptr, msg, size, t, cqh);
+      if (status == GNI_RC_SUCCESS) return status;
+    }
+#endif
+    return  GNI_RC_ERROR_RESOURCE;
+}
+
+inline
+static void buffer_small_msgs(SMSG_QUEUE *queue, void *msg, int size, int destNode, uint8_t tag)
+{
+    MSG_LIST        *msg_tmp;
+    MallocMsgList(msg_tmp);
+    msg_tmp->destNode = destNode;
+    msg_tmp->size   = size;
+    msg_tmp->msg    = msg;
+    msg_tmp->tag    = tag;
+#if CMK_WITH_STATS
+    SMSG_CREATION(msg_tmp)
+#endif
+
+#if ONE_SEND_QUEUE
+    PCQueuePush(queue->sendMsgBuf, (char*)msg_tmp);
+#else
+#if SMP_LOCKS
+    CmiLock(queue->smsg_msglist_index[destNode].lock);
+    if(queue->smsg_msglist_index[destNode].pushed == 0)
+    {
+        PCQueuePush(queue->nonEmptyQueues, (char*)&(queue->smsg_msglist_index[destNode]));
+    }
+    PCQueuePush(queue->smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
+    CmiUnlock(queue->smsg_msglist_index[destNode].lock);
+#else
+    PCQueuePush(queue->smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
+#endif
+#endif
+
+#if PRINT_SYH
+    buffered_smsg_counter++;
+#endif
+}
+
+inline static void print_smsg_attr(gni_smsg_attr_t     *a)
+{
+    printf("type=%d\n, credit=%d\n, size=%d\n, buf=%p, offset=%d\n", a->msg_type, a->mbox_maxcredit, a->buff_size, a->msg_buffer, a->mbox_offset);
+}
+
+inline
+static void setup_smsg_connection(int destNode)
+{
+    mdh_addr_list_t  *new_entry = 0;
+    gni_post_descriptor_t *pd;
+    gni_smsg_attr_t      *smsg_attr;
+    gni_return_t status = GNI_RC_NOT_DONE;
+    RDMA_REQUEST        *rdma_request_msg;
+    
+    if(smsg_available_slot == smsg_expand_slots)
+    {
+        new_entry = (mdh_addr_list_t*)malloc(sizeof(mdh_addr_list_t));
+        new_entry->addr = memalign(64, smsg_memlen*smsg_expand_slots);
+        bzero(new_entry->addr, smsg_memlen*smsg_expand_slots);
+
+        status = GNI_MemRegister(nic_hndl, (uint64_t)new_entry->addr,
+            smsg_memlen*smsg_expand_slots, smsg_rx_cqh,
+            GNI_MEM_READWRITE,   
+            -1,
+            &(new_entry->mdh));
+        smsg_available_slot = 0; 
+        new_entry->next = smsg_dynamic_list;
+        smsg_dynamic_list = new_entry;
+    }
+    smsg_attr = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
+    smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+    smsg_attr->mbox_maxcredit = SMSG_MAX_CREDIT;
+    smsg_attr->msg_maxsize = SMSG_MAX_MSG;
+    smsg_attr->mbox_offset = smsg_available_slot * smsg_memlen;
+    smsg_attr->buff_size = smsg_memlen;
+    smsg_attr->msg_buffer = smsg_dynamic_list->addr;
+    smsg_attr->mem_hndl = smsg_dynamic_list->mdh;
+    smsg_local_attr_vec[destNode] = smsg_attr;
+    smsg_available_slot++;
+    MallocPostDesc(pd);
+    pd->type            = GNI_POST_FMA_PUT;
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT ;
+    pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+    pd->length          = sizeof(gni_smsg_attr_t);
+    pd->local_addr      = (uint64_t) smsg_attr;
+    pd->remote_addr     = (uint64_t)&((((gni_smsg_attr_t*)(smsg_connection_vec[destNode].addr))[myrank]));
+    pd->remote_mem_hndl = smsg_connection_vec[destNode].mdh;
+    pd->src_cq_hndl     = 0;
+
+    pd->rdma_mode       = 0;
+    status = GNI_PostFma(ep_hndl_array[destNode],  pd);
+    print_smsg_attr(smsg_attr);
+    if(status == GNI_RC_ERROR_RESOURCE )
+    {
+        MallocRdmaRequest(rdma_request_msg);
+        rdma_request_msg->destNode = destNode;
+        rdma_request_msg->pd = pd;
+        /* buffer this request */
+    }
+#if PRINT_SYH
+    if(status != GNI_RC_SUCCESS)
+       printf("[%d=%d] send post FMA %s\n", myrank, destNode, gni_err_str[status]);
+    else
+        printf("[%d=%d]OK send post FMA \n", myrank, destNode);
+#endif
+}
+
+/* useDynamicSMSG */
+inline 
+static void alloc_smsg_attr( gni_smsg_attr_t *local_smsg_attr)
+{
+    gni_return_t status = GNI_RC_NOT_DONE;
+
+    if(mailbox_list->offset == mailbox_list->size)
+    {
+        dynamic_smsg_mailbox_t *new_mailbox_entry;
+        new_mailbox_entry = (dynamic_smsg_mailbox_t*)malloc(sizeof(dynamic_smsg_mailbox_t));
+        new_mailbox_entry->size = smsg_memlen*avg_smsg_connection;
+        new_mailbox_entry->mailbox_base = malloc(new_mailbox_entry->size);
+        bzero(new_mailbox_entry->mailbox_base, new_mailbox_entry->size);
+        new_mailbox_entry->offset = 0;
+        
+        status = GNI_MemRegister(nic_hndl, (uint64_t)new_mailbox_entry->mailbox_base,
+            new_mailbox_entry->size, smsg_rx_cqh,
+            GNI_MEM_READWRITE,   
+            -1,
+            &(new_mailbox_entry->mem_hndl));
+
+        GNI_RC_CHECK("register", status);
+        new_mailbox_entry->next = mailbox_list;
+        mailbox_list = new_mailbox_entry;
+    }
+    local_smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+    local_smsg_attr->mbox_maxcredit = SMSG_MAX_CREDIT;
+    local_smsg_attr->msg_maxsize = SMSG_MAX_MSG;
+    local_smsg_attr->mbox_offset = mailbox_list->offset;
+    mailbox_list->offset += smsg_memlen;
+    local_smsg_attr->buff_size = smsg_memlen;
+    local_smsg_attr->msg_buffer = mailbox_list->mailbox_base;
+    local_smsg_attr->mem_hndl = mailbox_list->mem_hndl;
+}
+
+/* useDynamicSMSG */
+inline 
+static int connect_to(int destNode)
+{
+    gni_return_t status = GNI_RC_NOT_DONE;
+    CmiAssert(smsg_connected_flag[destNode] == 0);
+    CmiAssert (smsg_attr_vector_local[destNode] == NULL);
+    smsg_attr_vector_local[destNode] = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
+    alloc_smsg_attr(smsg_attr_vector_local[destNode]);
+    smsg_attr_vector_remote[destNode] = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
+    
+    CMI_GNI_LOCK(global_gni_lock)
+    status = GNI_EpPostDataWId (ep_hndl_array[destNode], smsg_attr_vector_local[destNode], sizeof(gni_smsg_attr_t),smsg_attr_vector_remote[destNode] ,sizeof(gni_smsg_attr_t), destNode+mysize);
+    CMI_GNI_UNLOCK(global_gni_lock)
+    if (status == GNI_RC_ERROR_RESOURCE) {
+      /* possibly destNode is making connection at the same time */
+      free(smsg_attr_vector_local[destNode]);
+      smsg_attr_vector_local[destNode] = NULL;
+      free(smsg_attr_vector_remote[destNode]);
+      smsg_attr_vector_remote[destNode] = NULL;
+      mailbox_list->offset -= smsg_memlen;
+#if PRINT_SYH
+    printf("[%d] send connect_to request to %d failed\n", myrank, destNode);
+#endif
+      return 0;
+    }
+    GNI_RC_CHECK("GNI_Post", status);
+    smsg_connected_flag[destNode] = 1;
+#if PRINT_SYH
+    printf("[%d] send connect_to request to %d done\n", myrank, destNode);
+#endif
+    return 1;
+}
+
+inline 
+static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg, int size, uint8_t tag, int inbuff, MSG_LIST *ptr )
+{
+    unsigned int          remote_address;
+    uint32_t              remote_id;
+    gni_return_t          status = GNI_RC_ERROR_RESOURCE;
+    gni_smsg_attr_t       *smsg_attr;
+    gni_post_descriptor_t *pd;
+    gni_post_state_t      post_state;
+    char                  *real_data; 
+
+    if (useDynamicSMSG) {
+        switch (smsg_connected_flag[destNode]) {
+        case 0: 
+            connect_to(destNode);         /* continue to case 1 */
+        case 1:                           /* pending connection, do nothing */
+            status = GNI_RC_NOT_DONE;
+            if(inbuff ==0)
+                buffer_small_msgs(queue, msg, size, destNode, tag);
+            return status;
+        }
+    }
+#if ! ONE_SEND_QUEUE
+    if(PCQueueEmpty(queue->smsg_msglist_index[destNode].sendSmsgBuf) || inbuff==1)
+#endif
+    {
+        //CMI_GNI_LOCK(smsg_mailbox_lock)
+        CMI_GNI_LOCK(default_tx_cq_lock)
+#if CMK_SMP_TRACE_COMMTHREAD
+        int oldpe = -1;
+        int oldeventid = -1;
+        if(tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG || tag == LMSG_OOB_INIT_TAG || tag == LMSG_PERSISTENT_INIT_TAG)
+        { 
+            START_EVENT();
+            if ( tag == SMALL_DATA_TAG)
+                real_data = (char*)msg; 
+            else 
+                real_data = (char*)(((CONTROL_MSG*)msg)->source_addr);
+            TRACE_COMM_GET_MSGID(real_data, &oldpe, &oldeventid);
+            TRACE_COMM_SET_COMM_MSGID(real_data);
+        }
+#endif
+#if REMOTE_EVENT
+        if (tag == LMSG_INIT_TAG || tag == LMSG_OOB_INIT_TAG || tag == LMSG_PERSISTENT_INIT_TAG) {
+            CONTROL_MSG *control_msg_tmp = (CONTROL_MSG*)msg;
+            if (control_msg_tmp->seq_id <= 0 && control_msg_tmp->ack_index == -1)
+            {
+                control_msg_tmp->ack_index = IndexPool_getslot(&ackPool, (void*)control_msg_tmp->source_addr, 1);
+                if (control_msg_tmp->ack_index == -1) {    /* table overflow */
+                    status = GNI_RC_NOT_DONE;
+                    if (inbuff ==0)
+                        buffer_small_msgs(queue, msg, size, destNode, tag);
+                    return status;
+                }
+            }
+        }
+#endif
+#if     CMK_WITH_STATS
+        SMSG_TRY_SEND(tag)
+#endif
+#if CMK_WITH_STATS
+    double              creation_time;
+    if (ptr == NULL)
+        creation_time = CmiWallTimer();
+    else
+        creation_time = ptr->creation_time;
+#endif
+
+    status = GNI_SmsgSendWTag(ep_hndl_array[destNode], NULL, 0, msg, size, 0, tag);
+#if CMK_SMP_TRACE_COMMTHREAD
+        if (oldpe != -1)  TRACE_COMM_SET_MSGID(real_data, oldpe, oldeventid);
+#endif
+        CMI_GNI_UNLOCK(default_tx_cq_lock)
+        //CMI_GNI_UNLOCK(smsg_mailbox_lock)
+        if(status == GNI_RC_SUCCESS)
+        {
+#if     CMK_WITH_STATS
+            SMSG_SENT_DONE(creation_time,tag) 
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+            if(tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG || tag == LMSG_OOB_INIT_TAG || tag == LMSG_PERSISTENT_INIT_TAG )
+            { 
+                TRACE_COMM_CREATION(CpvAccess(projTraceStart), real_data);
+            }
+#endif
+        }else
+            status = GNI_RC_ERROR_RESOURCE;
+    }
+    if(status != GNI_RC_SUCCESS && inbuff ==0)
+        buffer_small_msgs(queue, msg, size, destNode, tag);
+    return status;
+}
+
+inline 
+static CONTROL_MSG* construct_control_msg(int size, char *msg, int seqno)
+{
+    /* construct a control message and send */
+    CONTROL_MSG         *control_msg_tmp;
+    MallocControlMsg(control_msg_tmp);
+    control_msg_tmp->source_addr = (uint64_t)msg;
+    control_msg_tmp->seq_id    = seqno;
+    control_msg_tmp->total_length = control_msg_tmp->length = ALIGN64(size); //for GET 4 bytes aligned 
+#if REMOTE_EVENT
+    control_msg_tmp->ack_index    =  -1;
+#endif
+#if     USE_LRTS_MEMPOOL
+    if(size < BIG_MSG)
+    {
+        control_msg_tmp->source_mem_hndl = GetMemHndl(msg);
+    }
+    else
+    {
+        SetMemHndlZero(control_msg_tmp->source_mem_hndl);
+        control_msg_tmp->length = size - (seqno-1)*ONE_SEG;
+        if (control_msg_tmp->length > ONE_SEG) control_msg_tmp->length = ONE_SEG;
+    }
+#else
+    SetMemHndlZero(control_msg_tmp->source_mem_hndl);
+#endif
+    return control_msg_tmp;
+}
+
+#define BLOCKING_SEND_CONTROL    0
+
+// Large message: send a control message to the receiver; the receiver registers memory and does a GET.
+// A non-success return value means the send did not complete (the control message may have been buffered).
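+// Rough sequence (illustrative summary based on the tags defined above; not a
+// complete specification):
+//   1. sender: SMSG control message (LMSG_INIT_TAG) carrying source_addr,
+//      length, and the source memory handle
+//   2. receiver: registers a destination buffer and posts an RDMA GET
+//   3. receiver: replies with ACK_TAG (or BIG_MSG_TAG for the next segment of
+//      a segmented big message) so the sender can release its resources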
+inline static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL_MSG  *control_msg_tmp, int inbuff, MSG_LIST *smsg_ptr, uint8_t lmsg_tag)
+{
+    gni_return_t        status  =  GNI_RC_ERROR_NOMEM;
+    uint32_t            vmdh_index  = -1;
+    int                 size;
+    int                 offset = 0;
+    uint64_t            source_addr;
+    int                 register_size; 
+    void                *msg;
+
+    size    =   control_msg_tmp->total_length;
+    source_addr = control_msg_tmp->source_addr;
+    register_size = control_msg_tmp->length;
+
+#if  USE_LRTS_MEMPOOL
+    if( control_msg_tmp->seq_id <=0 ){
+#if BLOCKING_SEND_CONTROL
+        if (inbuff == 0 && IsMemHndlZero(GetMemHndl(source_addr))) {
+            while (IsMemHndlZero(GetMemHndl(source_addr)) && buffered_send_msg + GetMempoolsize((void*)source_addr) >= MAX_BUFF_SEND)
+                LrtsAdvanceCommunication(0);
+        }
+#endif
+        if(IsMemHndlZero(GetMemHndl(source_addr))) //it is in mempool, it is possible to be de-registered by others
+        {
+            msg = (void*)source_addr;
+            if(buffered_send_msg + GetMempoolsize(msg) >= MAX_BUFF_SEND)
+            {
+                if(!inbuff)
+                    buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, lmsg_tag);
+                return GNI_RC_ERROR_NOMEM;
+            }
+            //register the corresponding mempool
+            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)), rdma_rx_cqh);
+            if(status == GNI_RC_SUCCESS)
+            {
+                control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
+            }
+        }else
+        {
+            control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
+            status = GNI_RC_SUCCESS;
+        }
+        if(NoMsgInSend(source_addr))
+            register_size = GetMempoolsize((void*)(source_addr));
+        else
+            register_size = 0;
+    }else if(control_msg_tmp->seq_id >0)    // BIG_MSG
+    {
+        int offset = ONE_SEG*(control_msg_tmp->seq_id-1);
+        source_addr += offset;
+        size = control_msg_tmp->length;
+#if BLOCKING_SEND_CONTROL
+        if (inbuff == 0 && IsMemHndlZero(control_msg_tmp->source_mem_hndl)) {
+            while (IsMemHndlZero(control_msg_tmp->source_mem_hndl) && buffered_send_msg + size >= MAX_BUFF_SEND)
+                LrtsAdvanceCommunication(0);
+        }
+#endif
+        if (IsMemHndlZero(control_msg_tmp->source_mem_hndl)) {
+            if(buffered_send_msg + size >= MAX_BUFF_SEND)
+            {
+                if(!inbuff)
+                    buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, lmsg_tag);
+                return GNI_RC_ERROR_NOMEM;
+            }
+            status = registerMemory((void*)source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), NULL);
+            if(status == GNI_RC_SUCCESS) buffered_send_msg += ALIGN64(size);
+        }
+        else
+        {
+            status = GNI_RC_SUCCESS;
+        }
+        register_size = 0;  
+    }
+
+#if CMI_EXERT_SEND_LARGE_CAP
+    if(SEND_large_pending >= SEND_large_cap)
+    {
+        status = GNI_RC_ERROR_NOMEM;
+    }
+#endif
+    if(status == GNI_RC_SUCCESS)
+    {
+       status = send_smsg_message( queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, lmsg_tag, inbuff, smsg_ptr); 
+        if(status == GNI_RC_SUCCESS)
+        {
+#if CMI_EXERT_SEND_LARGE_CAP
+            SEND_large_pending++;
+#endif
+            buffered_send_msg += register_size;
+            if(control_msg_tmp->seq_id == 0)
+            {
+                IncreaseMsgInSend(source_addr);
+            }
+            FreeControlMsg(control_msg_tmp);
+            MACHSTATE5(8, "GO SMSG LARGE to %d (%d,%d,%d) tag=%d\n", destNode, buffered_send_msg, buffered_recv_msg, register_memory_size, lmsg_tag); 
+        }else
+            status = GNI_RC_ERROR_RESOURCE;
+
+    } else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+    {
+        CmiAbort("Memory registor for large msg\n");
+    }else 
+    {
+        status = GNI_RC_ERROR_NOMEM; 
+        if(!inbuff)
+            buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, lmsg_tag);
+    }
+    return status;
+#else
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, (void*)source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh, NULL, status)  /* register the message buffer itself in the non-mempool path */
+    if(status == GNI_RC_SUCCESS)
+    {
+        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, lmsg_tag, 0, NULL);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            FreeControlMsg(control_msg_tmp);
+        }
+    } else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+    {
+        CmiAbort("Memory registor for large msg\n");
+    }else 
+    {
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, lmsg_tag);
+    }
+    return status;
+#endif
+}
+inline void LrtsNotifyIdle() {}
+inline void LrtsPrepareEnvelope(char *msg, int size)
+{
+    CmiSetMsgSize(msg, size);
+    CMI_SET_CHECKSUM(msg, size);
+}
+
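+// Send entry point. Messages up to SMSG_MAX_MSG go out directly as SMSG
+// (SMALL_DATA_TAG); medium messages send a control message so the receiver can GET
+// the data; messages of BIG_MSG or more are pipelined in ONE_SEG-sized segments
+// when USE_LRTS_MEMPOOL is enabled. In SMP builds the worker thread only enqueues
+// here and the comm thread performs the actual send.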
+CmiCommHandle LrtsSendFunc(int destNode, int destPE, int size, char *msg, int mode)
+{
+    gni_return_t        status  =   GNI_RC_SUCCESS;
+    uint8_t tag;
+    CONTROL_MSG         *control_msg_tmp;
+    int                 oob = ( mode & OUT_OF_BAND);
+    SMSG_QUEUE          *queue;
+
+    MACHSTATE5(8, "GO LrtsSendFn %d(%d) (%d,%d, %d) \n", destNode, size, buffered_send_msg, buffered_recv_msg, register_memory_size); 
+#if CMK_USE_OOB
+    queue = oob? &smsg_oob_queue : &smsg_queue;
+    tag = oob? LMSG_OOB_INIT_TAG: LMSG_INIT_TAG;
+#else
+    queue = &smsg_queue;
+    tag = LMSG_INIT_TAG;
+#endif
+
+    LrtsPrepareEnvelope(msg, size);
+
+#if PRINT_SYH
+    printf("LrtsSendFn %d==>%d, size=%d\n", myrank, destNode, size);
+#endif 
+
+#if CMK_SMP 
+    if(size <= SMSG_MAX_MSG)
+        buffer_small_msgs(queue, msg, size, destNode, SMALL_DATA_TAG);
+    else if (size < BIG_MSG) {
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, tag);
+    }
+    else {
+        CmiSetMsgSeq(msg, 0);
+        control_msg_tmp =  construct_control_msg(size, msg, 1);
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, tag);
+    }
+#else   //non-smp, smp(worker sending)
+    if(size <= SMSG_MAX_MSG)
+    {
+        if (GNI_RC_SUCCESS == send_smsg_message(queue, destNode,  msg, size, SMALL_DATA_TAG, 0, NULL))
+            CmiFree(msg);
+    }
+    else if (size < BIG_MSG) {
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL, tag);
+    }
+    else {
+#if     USE_LRTS_MEMPOOL
+        CmiSetMsgSeq(msg, 0);
+        control_msg_tmp =  construct_control_msg(size, msg, 1);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL, tag);
+#else
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL, tag);
+#endif
+    }
+#endif
+    return 0;
+}
+
+#if 0
+// this is no different from the common code
+void LrtsSyncListSendFn(int npes, int *pes, int len, char *msg)
+{
+  int i;
+#if CMK_BROADCAST_USE_CMIREFERENCE
+  for(i=0;i<npes;i++) {
+    if (pes[i] == CmiMyPe())
+      CmiSyncSend(pes[i], len, msg);
+    else {
+      CmiReference(msg);
+      CmiSyncSendAndFree(pes[i], len, msg);
+    }
+  }
+#else
+  for(i=0;i<npes;i++) {
+    CmiSyncSend(pes[i], len, msg);
+  }
+#endif
+}
+
+CmiCommHandle LrtsAsyncListSendFn(int npes, int *pes, int len, char *msg)
+{
+  /* A better asynchronous implementation may be wanted, but at least it works */
+  CmiSyncListSendFn(npes, pes, len, msg);
+  return (CmiCommHandle) 0;
+}
+
+void LrtsFreeListSendFn(int npes, int *pes, int len, char *msg)
+{
+  if (npes == 1) {
+      CmiSyncSendAndFree(pes[0], len, msg);
+      return;
+  }
+#if CMK_PERSISTENT_COMM
+  if (CpvAccess(phs) && len > PERSIST_MIN_SIZE 
+#if CMK_SMP
+            && IS_PERSISTENT_MEMORY(msg)
+#endif
+     ){
+      int i;
+      for(i=0;i<npes;i++) {
+        if (pes[i] == CmiMyPe())
+          CmiSyncSend(pes[i], len, msg);
+        else {
+          CmiReference(msg);
+          CmiSyncSendAndFree(pes[i], len, msg);
+        }
+      }
+      CmiFree(msg);
+      return;
+  }
+#endif
+  
+#if CMK_BROADCAST_USE_CMIREFERENCE
+  CmiSyncListSendFn(npes, pes, len, msg);
+  CmiFree(msg);
+#else
+  int i;
+  for(i=0;i<npes-1;i++) {
+    CmiSyncSend(pes[i], len, msg);
+  }
+  if (npes>0)
+    CmiSyncSendAndFree(pes[npes-1], len, msg);
+  else 
+    CmiFree(msg);
+#endif
+}
+#endif
+
+static void    PumpDatagramConnection();
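+// Default user-event IDs for projections tracing of the comm thread; these are
+// replaced by traceRegisterUserEvent() in registerUserTraceEvents() when tracing
+// is enabled.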
+static      int         event_SetupConnect = 111;
+static      int         event_PumpSmsg = 222 ;
+static      int         event_PumpTransaction = 333;
+static      int         event_PumpRdmaTransaction = 444;
+static      int         event_SendBufferSmsg = 484;
+static      int         event_SendFmaRdmaMsg = 555;
+static      int         event_AdvanceCommunication = 666;
+
+static void registerUserTraceEvents() {
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
+    event_SetupConnect = traceRegisterUserEvent("setting up connections", -1 );
+    event_PumpSmsg = traceRegisterUserEvent("Pump network small msgs", -1);
+    event_PumpTransaction = traceRegisterUserEvent("Pump FMA/RDMA local transaction" , -1);
+    event_PumpRdmaTransaction = traceRegisterUserEvent("Pump RDMA remote event" , -1);
+    event_SendBufferSmsg = traceRegisterUserEvent("Sending buffered small msgs", -1);
+    event_SendFmaRdmaMsg = traceRegisterUserEvent("Sending buffered fma/rdma transactions", -1);
+    event_AdvanceCommunication = traceRegisterUserEvent("Worker thread in sending/receiving", -1);
+#endif
+}
+
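+// Hang detection: every rank contributes smsg_send_count + smsg_recv_count via
+// PMI_Allgather; if the global sum stays unchanged across consecutive checks, the
+// run is declared deadlocked and aborted with a hint to raise the memory limits.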
+static void ProcessDeadlock()
+{
+    static CmiUInt8 *ptr = NULL;
+    static CmiUInt8  last = 0, mysum, sum;
+    static int count = 0;
+    gni_return_t status;
+    int i;
+
+//printf("[%d] comm thread detected hang %d %d %d\n", CmiMyPe(), smsg_send_count, smsg_recv_count, count);
+//sweep_mempool(CpvAccess(mempool));
+    if (ptr == NULL) ptr = (CmiUInt8*)malloc(mysize * sizeof(CmiUInt8));
+    mysum = smsg_send_count + smsg_recv_count;
+    MACHSTATE5(9,"Before allgather Progress Deadlock (%d,%d)  (%d,%d)(%d)\n", buffered_send_msg, register_memory_size, last, sum, count); 
+    status = PMI_Allgather(&mysum,ptr,sizeof(CmiUInt8));
+    GNI_RC_CHECK("PMI_Allgather", status);
+    sum = 0;
+    for (i=0; i<mysize; i++)  sum+= ptr[i];
+    if (last == 0 || sum == last) 
+        count++;
+    else
+        count = 0;
+    last = sum;
+    MACHSTATE5(9,"Progress Deadlock (%d,%d)  (%d,%d)(%d)\n", buffered_send_msg, register_memory_size, last, sum, count); 
+    if (count == 2) { 
+        /* detected twice, it is a real deadlock */
+        if (myrank == 0)  {
+            CmiPrintf("Charm++> Network progress engine appears to have stalled, possibly because registered memory limits have been exceeded or are too low.  Try adjusting environment variables CHARM_UGNI_MEMPOOL_MAX and CHARM_UGNI_SEND_MAX (current limits are %lld and %lld).\n", MAX_REG_MEM, MAX_BUFF_SEND);
+            CmiAbort("Fatal> Deadlock detected.");
+        }
+
+    }
+    _detected_hang = 0;
+}
+
+static void CheckProgress()
+{
+    if (smsg_send_count == last_smsg_send_count &&
+        smsg_recv_count == last_smsg_recv_count ) 
+    {
+        _detected_hang = 1;
+#if !CMK_SMP
+        if (_detected_hang) ProcessDeadlock();
+#endif
+
+    }
+    else {
+        //MACHSTATE5(9,"--Check Progress %d(%d, %d) (%d,%d)\n", mycount, buffered_send_msg, register_memory_size, smsg_send_count, smsg_recv_count); 
+        last_smsg_send_count = smsg_send_count;
+        last_smsg_recv_count = smsg_recv_count;
+        _detected_hang = 0;
+    }
+}
+
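+// Derive per-process limits from the node's physical memory: MAX_REG_MEM is the
+// node memory divided by the number of processes on the node, and MAX_BUFF_SEND is
+// half of that.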
+static void set_limit()
+{
+    //if (!user_set_flag && CmiMyRank() == 0) {
+    if (CmiMyRank() == 0) {
+        int mynode = CmiPhysicalNodeID(CmiMyPe());
+        int numpes = CmiNumPesOnPhysicalNode(mynode);
+        int numprocesses = numpes / CmiMyNodeSize();
+        MAX_REG_MEM  = _totalmem / numprocesses;
+        MAX_BUFF_SEND = MAX_REG_MEM / 2;
+        if (CmiMyPe() == 0)
+           printf("mem_max = %.2fM, send_max =%.2fM\n", MAX_REG_MEM/1024.0/1024, MAX_BUFF_SEND/1024./1024);
+        if(CmiMyPe() == 0 && (smsg_memlen*mysize + _expand_mem > MAX_BUFF_SEND ||  smsg_memlen*mysize + _mempool_size > MAX_BUFF_SEND))
+        {
+             printf("Charm++> FATAL ERROR your program has risk of hanging \n please try large page or use Dynamic smsg +useDynamicSmsg or contact Charm++ developers\n");
+             CmiAbort("memory registration\n");
+        }
+    }
+}
+
+void LrtsPostCommonInit(int everReturn)
+{
+#if CMK_DIRECT
+    CmiDirectInit();
+#endif
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
+    CpvInitialize(double, projTraceStart);
+    /* only PE 0 needs to care about registration (to generate sts file). */
+    //if (CmiMyPe() == 0) 
+    {
+        registerMachineUserEventsFunction(&registerUserTraceEvents);
+    }
+#endif
+
+#if CMK_SMP
+    CmiIdleState *s=CmiNotifyGetState();
+    CcdCallOnConditionKeep(CcdPROCESSOR_BEGIN_IDLE,(CcdVoidFn)CmiNotifyBeginIdle,(void *)s);
+    CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyStillIdle,(void *)s);
+#else
+    CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyStillIdle,NULL);
+    if (useDynamicSMSG)
+    CcdCallOnConditionKeep(CcdPERIODIC_10ms, (CcdVoidFn) PumpDatagramConnection, NULL);
+#endif
+
+#if ! LARGEPAGE
+    if (_checkProgress)
+#if CMK_SMP
+    if (CmiMyRank() == 0)
+#endif
+    CcdCallOnConditionKeep(CcdPERIODIC_2minute, (CcdVoidFn) CheckProgress, NULL);
+#endif
+#if !LARGEPAGE
+    CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdVoidFn)set_limit, NULL);
+#endif
+}
+
+/* this is called by worker thread */
+void LrtsPostNonLocal()
+{
+#if 1
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    double startT, endT;
+#endif
+
+#if MULTI_THREAD_SEND
+    if(mysize == 1) return;
+
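+    // With MULTI_THREAD_SEND, only the worker ranks with CmiMyRank() % 6 == 3
+    // poll the network from here; the rest return immediately.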
+    if (CmiMyRank() % 6 != 3) return;
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    traceEndIdle();
+    startT = CmiWallTimer();
+#endif
+
+    CmiMachineProgressImpl();
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    traceUserBracketEvent(event_AdvanceCommunication, startT, endT);
+    traceBeginIdle();
+#endif
+
+#endif
+#endif
+}
+
+/* The network progress function polls the network for messages.
+   It also flushes receive buffers on some implementations. */
+#if CMK_MACHINE_PROGRESS_DEFINED
+void CmiMachineProgressImpl() {
+#if ! CMK_SMP || MULTI_THREAD_SEND
+
+    STATS_PUMPNETWORK_TIME(PumpNetworkSmsg());
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+
+#if 0
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 0)
+#endif
+    PumpNetworkSmsg();
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 1)
+#endif
+    PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 2)
+#endif
+    PumpLocalTransactions(rdma_tx_cqh, rdma_tx_cq_lock);
+
+#if REMOTE_EVENT
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 3)
+#endif
+    PumpRemoteTransactions(rdma_rx_cqh);         // rdma_rx_cqh
+#endif
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 4)
+#endif
+    {
+#if CMK_USE_OOB
+    SendBufferMsg(&smsg_oob_queue, NULL);
+    SendBufferMsg(&smsg_queue, &smsg_oob_queue);
+#else
+    SendBufferMsg(&smsg_queue, NULL);
+#endif
+    }
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 5)
+#endif
+#if CMK_SMP
+    STATS_SENDRDMAMSG_TIME(SendRdmaMsg(sendRdmaBuf));
+#else
+    STATS_SENDRDMAMSG_TIME(SendRdmaMsg());
+#endif
+
+#endif
+#endif
+}
+#endif
+
+
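+// Dynamic SMSG connection setup: a datagram ID >= mysize belongs to a bound
+// endpoint (pe = id - mysize); smaller IDs arrive on the unbound endpoint, and a
+// fresh unbound datagram is re-posted so further peers can connect.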
+/* useDynamicSMSG */
+static void    PumpDatagramConnection()
+{
+    uint32_t          remote_address;
+    uint32_t          remote_id;
+    gni_return_t status;
+    gni_post_state_t  post_state;
+    uint64_t          datagram_id;
+    int i;
+
+   while ((status = GNI_PostDataProbeById(nic_hndl, &datagram_id)) == GNI_RC_SUCCESS)
+   {
+       if (datagram_id >= mysize) {           /* bound endpoint */
+           int pe = datagram_id - mysize;
+           CMI_GNI_LOCK(global_gni_lock)
+           status = GNI_EpPostDataTestById( ep_hndl_array[pe], datagram_id, &post_state, &remote_address, &remote_id);
+           CMI_GNI_UNLOCK(global_gni_lock)
+           if(status == GNI_RC_SUCCESS && post_state == GNI_POST_COMPLETED)
+           {
+               CmiAssert(remote_id == pe);
+               status = GNI_SmsgInit(ep_hndl_array[pe], smsg_attr_vector_local[pe], smsg_attr_vector_remote[pe]);
+               GNI_RC_CHECK("Dynamic SMSG Init", status);
+#if PRINT_SYH
+               printf("[%d] ++ Dynamic SMSG setup [%d===>%d] done\n", myrank, myrank, pe);
+#endif
+              CmiAssert(smsg_connected_flag[pe] == 1);
+               smsg_connected_flag[pe] = 2;
+           }
+       }
+       else {         /* unbound ep */
+           status = GNI_EpPostDataTestById( ep_hndl_unbound, datagram_id, &post_state, &remote_address, &remote_id);
+           if(status == GNI_RC_SUCCESS && post_state == GNI_POST_COMPLETED)
+           {
+               CmiAssert(remote_id<mysize);
+              CmiAssert(smsg_connected_flag[remote_id] <= 0);
+               status = GNI_SmsgInit(ep_hndl_array[remote_id], &send_smsg_attr, &recv_smsg_attr);
+               GNI_RC_CHECK("Dynamic SMSG Init", status);
+#if PRINT_SYH
+               printf("[%d] ++ Dynamic SMSG setup2 [%d===>%d] done\n", myrank, myrank, remote_id);
+#endif
+               smsg_connected_flag[remote_id] = 2;
+
+               alloc_smsg_attr(&send_smsg_attr);
+               status = GNI_EpPostDataWId (ep_hndl_unbound, &send_smsg_attr,  SMSG_ATTR_SIZE, &recv_smsg_attr, SMSG_ATTR_SIZE, myrank);
+               GNI_RC_CHECK("post unbound datagram", status);
+           }
+       }
+   }
+}
+
+/* poll the CQ to receive network messages */
+static void PumpNetworkRdmaMsgs()
+{
+    gni_cq_entry_t      event_data;
+    gni_return_t        status;
+
+}
+
+inline 
+static void bufferRdmaMsg(PCQueue bufferqueue, int inst_id, gni_post_descriptor_t *pd, int ack_index)
+{
+    RDMA_REQUEST        *rdma_request_msg;
+    MallocRdmaRequest(rdma_request_msg);
+    rdma_request_msg->destNode = inst_id;
+    rdma_request_msg->pd = pd;
+#if REMOTE_EVENT
+    rdma_request_msg->ack_index = ack_index;
+#endif
+    PCQueuePush(bufferqueue, (char*)rdma_request_msg);
+}
+
+static void getLargeMsgRequest(void* header, uint64_t inst_id,  uint8_t tag, PCQueue);
+static void getPersistentMsgRequest(void* header, uint64_t inst_id,  uint8_t tag, PCQueue);
+static void PRINT_CONTROL(void *header)
+{
+    CONTROL_MSG *control_msg = (CONTROL_MSG *) header;
+
+    printf(" length=%d , seq_id = %d, addr = %lld:%lld:%lld  \n", control_msg->length, control_msg->seq_id, control_msg->source_addr, (control_msg->source_mem_hndl).qword1, (control_msg->source_mem_hndl).qword2 );
+#if PERSISTENT_GET_BASE
+    printf(" memhdl = %lld:%lld:%lld \n", control_msg->dest_addr, (control_msg->dest_mem_hndl).qword1, (control_msg->dest_mem_hndl).qword2);
+#endif
+}
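+// Drain the SMSG receive completion queue and dispatch each mailbox message by tag:
+// small messages are copied out and delivered, large-message control messages
+// trigger a GET on this (receiving) side, and ACK/BIG_MSG tags complete or continue
+// transfers on the sending side.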
+static void PumpNetworkSmsg()
+{
+    uint64_t            inst_id;
+    gni_cq_entry_t      event_data;
+    gni_return_t        status;
+    void                *header;
+    uint8_t             msg_tag;
+    int                 msg_nbytes;
+    void                *msg_data;
+    gni_mem_handle_t    msg_mem_hndl;
+    gni_smsg_attr_t     *smsg_attr;
+    gni_smsg_attr_t     *remote_smsg_attr;
+    int                 init_flag;
+    CONTROL_MSG         *control_msg_tmp, *header_tmp;
+    uint64_t            source_addr;
+    SMSG_QUEUE         *queue = &smsg_queue;
+    PCQueue             tmp_queue;
+#if  CMK_DIRECT
+    cmidirectMsg        *direct_msg;
+#endif
+#if CMI_PUMPNETWORKSMSG_CAP 
+    int                  recv_cnt = 0;
+    while(recv_cnt< PumpNetworkSmsg_cap) {
+#else
+    while(1) {
+#endif
+        CMI_GNI_LOCK(smsg_rx_cq_lock)
+        status =GNI_CqGetEvent(smsg_rx_cqh, &event_data);
+        CMI_GNI_UNLOCK(smsg_rx_cq_lock)
+        if(status != GNI_RC_SUCCESS) break;
+
+        inst_id = GNI_CQ_GET_INST_ID(event_data);
+#if REMOTE_EVENT
+        inst_id = GET_RANK(inst_id);      /* important */
+#endif
+        // GNI_CqGetEvent can return success while GNI_SmsgGetNextWTag returns NOT_DONE, caused by SMSG out-of-order transfer
+#if PRINT_SYH
+        printf("[%d] %d PumpNetworkMsgs is received from PE: %d,  status=%s\n", myrank, CmiMyRank(), inst_id,  gni_err_str[status]);
+#endif
+        if (useDynamicSMSG) {
+            /* subtle: an smsg may arrive before the connection is fully set up */
+            while (smsg_connected_flag[inst_id] != 2) 
+               PumpDatagramConnection();
+        }
+        msg_tag = GNI_SMSG_ANY_TAG;
+        while(1) {
+            CMI_GNI_LOCK(smsg_mailbox_lock)
+            status = GNI_SmsgGetNextWTag(ep_hndl_array[inst_id], &header, &msg_tag);
+            if (status != GNI_RC_SUCCESS)
+            {
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                break;
+            }
+#if         CMI_PUMPNETWORKSMSG_CAP
+            recv_cnt++; 
+#endif
+#if PRINT_SYH
+            printf("[%d] from %d smsg msg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
+#endif
+            /* copy msg out and then put into queue (small message) */
+            switch (msg_tag) {
+            case SMALL_DATA_TAG:
+            {
+                START_EVENT();
+                msg_nbytes = CmiGetMsgSize(header);
+                msg_data    = CmiAlloc(msg_nbytes);
+                memcpy(msg_data, (char*)header, msg_nbytes);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                TRACE_COMM_CREATION(EVENT_TIME(), msg_data);
+                CMI_CHECK_CHECKSUM(msg_data, msg_nbytes);
+                handleOneRecvedMsg(msg_nbytes, msg_data);
+                break;
+            }
+            case LMSG_PERSISTENT_INIT_TAG:
+            {   CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                getPersistentMsgRequest(header, inst_id, msg_tag, sendRdmaBuf);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                break;
+            }
+            case LMSG_INIT_TAG:
+            case LMSG_OOB_INIT_TAG:
+            {
+                tmp_queue = (msg_tag == LMSG_INIT_TAG)? sendRdmaBuf : sendHighPriorBuf; 
+#if MULTI_THREAD_SEND
+                MallocControlMsg(control_msg_tmp);
+                memcpy(control_msg_tmp, header, CONTROL_MSG_SIZE);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                getLargeMsgRequest(control_msg_tmp, inst_id, msg_tag, tmp_queue);
+                FreeControlMsg(control_msg_tmp);
+#else
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                getLargeMsgRequest(header, inst_id, msg_tag, tmp_queue);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+#endif
+                break;
+            }
+#if !REMOTE_EVENT && !CQWRITE
+            case ACK_TAG:   // msg fits into the mempool
+            {
+                /* GET is done; release the message. (PUT is not used here yet.) */
+                void *msg = (void*)(((ACK_MSG *)header)->source_addr);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+#if ! USE_LRTS_MEMPOOL
+                MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
+#else
+                DecreaseMsgInSend(msg);
+#endif
+                if(NoMsgInSend(msg))
+                    buffered_send_msg -= GetMempoolsize(msg);
+                MACHSTATE5(8, "GO send done to %d (%d,%d, %d) tag=%d\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size, msg_tag); 
+                CmiFree(msg);
+#if CMI_EXERT_SEND_LARGE_CAP
+                SEND_large_pending--;
+#endif
+                break;
+            }
+#endif
+            case BIG_MSG_TAG:  //big msg, de-register, transfer next seg
+            {
+#if MULTI_THREAD_SEND
+                MallocControlMsg(header_tmp);
+                memcpy(header_tmp, header, CONTROL_MSG_SIZE);
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+#else
+                header_tmp = (CONTROL_MSG *) header;
+#endif
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+#if CMI_EXERT_SEND_LARGE_CAP
+                SEND_large_pending--;
+#endif
+                void *msg = (void*)(header_tmp->source_addr);
+                int cur_seq = CmiGetMsgSeq(msg);
+                int offset = ONE_SEG*(cur_seq+1);
+                MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(header_tmp->source_mem_hndl), &omdh, header_tmp->length);
+                buffered_send_msg -= header_tmp->length;
+                int remain_size = CmiGetMsgSize(msg) - header_tmp->length;
+                if (remain_size < 0) remain_size = 0;
+                CmiSetMsgSize(msg, remain_size);
+                if(remain_size <= 0) //transaction done
+                {
+                    CmiFree(msg);
+                }else if (header_tmp->total_length > offset)
+                {
+                    CmiSetMsgSeq(msg, cur_seq+1);
+                    control_msg_tmp = construct_control_msg(header_tmp->total_length, msg, cur_seq+1+1);
+                    control_msg_tmp->dest_addr = header_tmp->dest_addr;
+                    //send next seg
+                    send_large_messages( queue, inst_id, control_msg_tmp, 0, NULL, LMSG_INIT_TAG);
+                         // pipelining
+                    if (header_tmp->seq_id == 1) {
+                      int i;
+                      for (i=1; i<BIG_MSG_PIPELINE; i++) {
+                        int seq = cur_seq+i+2;
+                        CmiSetMsgSeq(msg, seq-1);
+                        control_msg_tmp =  construct_control_msg(header_tmp->total_length, (char *)msg, seq);
+                        control_msg_tmp->dest_addr = header_tmp->dest_addr;
+                        send_large_messages( queue, inst_id, control_msg_tmp, 0, NULL, LMSG_INIT_TAG);
+                        if (header_tmp->total_length <= ONE_SEG*seq) break;
+                      }
+                    }
+                }
+#if MULTI_THREAD_SEND
+                FreeControlMsg(header_tmp);
+#else
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+#endif
+                break;
+            }
+#if CMK_PERSISTENT_COMM_PUT && !REMOTE_EVENT && !CQWRITE
+            case PUT_DONE_TAG:  {   //persistent message
+                void *msg = (void *)(((CONTROL_MSG *) header)->source_addr);
+                int size = ((CONTROL_MSG *) header)->length;
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                CmiReference(msg);
+                CMI_CHECK_CHECKSUM(msg, size);
+                handleOneRecvedMsg(size, msg); 
+#if PRINT_SYH
+                printf("[%d] PUT_DONE_TAG hand over one message, size: %d. \n", myrank, size);
+#endif
+                break;
+            }
+#endif
+#if CMK_DIRECT
+            case DIRECT_PUT_DONE_TAG:  //cmi direct 
+                //create a trigger message
+                direct_msg = (cmidirectMsg*)CmiAlloc(sizeof(cmidirectMsg));
+                direct_msg->handler = ((CMK_DIRECT_HEADER*)header)->handler_addr;
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                CmiSetHandler(direct_msg, CpvAccess(CmiHandleDirectIdx));
+                CmiPushPE(((CmiDirectUserHandle*)direct_msg->handler)->remoteRank, direct_msg);
+                //(*(((CMK_DIRECT_HEADER*) header)->callbackFnPtr))(((CMK_DIRECT_HEADER*) header)->callbackData);
+                break;
+#endif
+            default:
+                GNI_SmsgRelease(ep_hndl_array[inst_id]);
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+                printf("weird tag problem %d \n", msg_tag);
+                CmiAbort("Unknown tag\n");
+            }               // end switch
+#if PRINT_SYH
+            printf("[%d] from %d after switch request for smsg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
+#endif
+            smsg_recv_count ++;
+            msg_tag = GNI_SMSG_ANY_TAG;
+        } //endwhile GNI_SmsgGetNextWTag
+    }   //end while GetEvent
+    if(status == GNI_RC_ERROR_RESOURCE)
+    {
+        printf("charm> Please use +useRecvQueue %d in your command line, if the error comes again, increase this number\n", REMOTE_QUEUE_ENTRIES*2);
+        GNI_RC_CHECK("Smsg_rx_cq full", status);
+    }
+}
+
+static void printDesc(gni_post_descriptor_t *pd)
+{
+    printf(" Descriptor (%p===>%p)(%d)\n", pd->local_addr, pd->remote_addr, pd->length); 
+}
+
+#if CQWRITE
+static void sendCqWrite(int destNode, uint64_t data, gni_mem_handle_t mem_hndl)
+{
+    gni_post_descriptor_t *pd;
+    gni_return_t        status = GNI_RC_SUCCESS;
+    
+    MallocPostDesc(pd);
+    pd->type = GNI_POST_CQWRITE;
+    pd->cq_mode = GNI_CQMODE_SILENT;
+    //pd->cq_mode = GNI_CQMODE_GLOBAL_EVENT | GNI_CQMODE_REMOTE_EVENT ;
+    pd->dlvr_mode = GNI_DLVMODE_PERFORMANCE;
+    pd->cqwrite_value = data;
+    pd->remote_mem_hndl = mem_hndl;
+    status = GNI_PostCqWrite(ep_hndl_array[destNode], pd);
+    GNI_RC_CHECK("GNI_PostCqWrite", status);
+}
+#endif
+
+// register memory for a message
+// return mem handle
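+// seqno == 0 (or PERSIST_SEQ): the message lives in the mempool, so the whole
+// mempool block is registered; seqno > 0: a BIG_MSG segment or CmiDirect buffer is
+// registered directly.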
+static gni_return_t  registerMessage(void *msg, int size, int seqno, gni_mem_handle_t *memh)
+{
+    gni_return_t status = GNI_RC_SUCCESS;
+
+    if (!IsMemHndlZero(*memh)) return GNI_RC_SUCCESS;
+
+#if CMK_PERSISTENT_COMM_PUT
+      // persistent message is always registered
+      // BIG_MSG small pieces do not have malloc chunk header
+    if (IS_PERSISTENT_MEMORY(msg)) {
+        *memh = GetMemHndl(msg);
+        return GNI_RC_SUCCESS;
+    }
+#endif
+    if(seqno == 0 
+#if CMK_PERSISTENT_COMM_PUT
+         || seqno == PERSIST_SEQ
+#endif
+      )
+    {
+        if(IsMemHndlZero((GetMemHndl(msg))))
+        {
+            msg = (void*)(msg);
+            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)), rdma_rx_cqh);
+            if(status == GNI_RC_SUCCESS)
+                *memh = GetMemHndl(msg);
+        }
+        else {
+            *memh = GetMemHndl(msg);
+        }
+    }
+    else {
+        // big msg that cannot fit into the memory pool, or a CmiDirect msg (which is not from the mempool)
+        status = registerMemory(msg, size, memh, NULL); 
+    }
+    return status;
+}
+
+static void getPersistentMsgRequest(void* header, uint64_t inst_id, uint8_t tag, PCQueue bufferRdmaQueue )
+{
+#if   PERSISTENT_GET_BASE
+    CONTROL_MSG         *request_msg;
+    gni_return_t        status;
+    gni_post_descriptor_t *pd;
+    request_msg = (CONTROL_MSG *) header;
+
+    MallocPostDesc(pd);
+    pd->cqwrite_value = request_msg->seq_id;
+    pd->first_operand = ALIGN64(request_msg->length); //  total length
+    if(request_msg->length <= LRTS_GNI_RDMA_THRESHOLD) 
+        pd->type            = GNI_POST_FMA_GET;
+    else
+        pd->type            = GNI_POST_RDMA_GET;
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT ;
+    pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+    pd->length          = ALIGN64(request_msg->length);
+    pd->local_addr      = (uint64_t) request_msg->dest_addr;
+    pd->local_mem_hndl  = request_msg->dest_mem_hndl;
+    pd->remote_addr     = (uint64_t) request_msg->source_addr;
+    pd->remote_mem_hndl = request_msg->source_mem_hndl;
+    pd->src_cq_hndl     = 0;
+    pd->rdma_mode       = 0;
+    pd->amo_cmd         = 0;
+#if REMOTE_EVENT
+    bufferRdmaMsg(bufferRdmaQueue, inst_id, pd, request_msg->ack_index); 
+#else
+    bufferRdmaMsg(bufferRdmaQueue, inst_id, pd, -1); 
+#endif
+
+#endif
+}
+
+// Called on the receiver side when a large-message control message arrives
+// (LMSG_INIT_TAG), including BIG_MSG segment requests.
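+// In the mempool path, the receiver allocates the destination buffer for the first
+// segment (or reuses dest_addr for later segments), registers it, and prepares an
+// FMA/RDMA GET from the sender's memory. Only OOB requests are posted here; all
+// others, and any failed post, are queued for SendRdmaMsg().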
+static void getLargeMsgRequest(void* header, uint64_t inst_id, uint8_t tag, PCQueue bufferRdmaQueue )
+{
+#if     USE_LRTS_MEMPOOL
+    CONTROL_MSG         *request_msg;
+    gni_return_t        status = GNI_RC_SUCCESS;
+    void                *msg_data;
+    gni_post_descriptor_t *pd;
+    gni_mem_handle_t    msg_mem_hndl;
+    int                 size, transaction_size, offset = 0;
+    size_t              register_size = 0;
+
+    // initiate a GET to transfer data from the sender side
+    request_msg = (CONTROL_MSG *) header;
+    size = request_msg->total_length;
+    MACHSTATE4(8, "GO Get request from %d (%d,%d, %d) \n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size); 
+    MallocPostDesc(pd);
+#if CMK_WITH_STATS 
+    pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
+    if(request_msg->seq_id < 2)   {
+        MACHSTATE2(8, "%d seq id in get large msg requrest %d\n", CmiMyRank(), request_msg->seq_id);
+#if CMK_SMP_TRACE_COMMTHREAD 
+        pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
+        msg_data = CmiAlloc(size);
+        CmiSetMsgSeq(msg_data, 0);
+        _MEMCHECK(msg_data);
+    }
+    else {
+        offset = ONE_SEG*(request_msg->seq_id-1);
+        msg_data = (char*)request_msg->dest_addr + offset;
+    }
+   
+    pd->cqwrite_value = request_msg->seq_id;
+
+    transaction_size = request_msg->seq_id == 0? ALIGN64(size) : ALIGN64(request_msg->length);
+    SetMemHndlZero(pd->local_mem_hndl);
+    status = registerMessage(msg_data, transaction_size, request_msg->seq_id, &pd->local_mem_hndl);
+    if (status == GNI_RC_SUCCESS && request_msg->seq_id == 0) {
+        if(NoMsgInRecv( (void*)(msg_data)))
+            register_size = GetMempoolsize((void*)(msg_data));
+    }
+
+    pd->first_operand = ALIGN64(size);                   //  total length
+
+    if(request_msg->total_length <= LRTS_GNI_RDMA_THRESHOLD)
+        pd->type            = GNI_POST_FMA_GET;
+    else
+        pd->type            = GNI_POST_RDMA_GET;
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+    pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+    pd->length          = transaction_size;
+    pd->local_addr      = (uint64_t) msg_data;
+    pd->remote_addr     = request_msg->source_addr + offset;
+    pd->remote_mem_hndl = request_msg->source_mem_hndl;
+
+    if (tag == LMSG_OOB_INIT_TAG) 
+        pd->src_cq_hndl     = highprior_rdma_tx_cqh;
+    else
+    {
+#if MULTI_THREAD_SEND
+        pd->src_cq_hndl     = rdma_tx_cqh;
+#else
+        pd->src_cq_hndl     = 0;
+#endif
+    }
+
+    pd->rdma_mode       = 0;
+    pd->amo_cmd         = 0;
+#if CMI_EXERT_RECV_RDMA_CAP
+    if(status == GNI_RC_SUCCESS && RDMA_pending >= RDMA_cap ) status = GNI_RC_ERROR_RESOURCE; 
+#endif
+    // memory registration succeeded; only OOB requests are posted immediately here
+    if(status == GNI_RC_SUCCESS && tag == LMSG_OOB_INIT_TAG )
+    {
+        CmiNodeLock lock = pd->type == GNI_POST_RDMA_GET?rdma_tx_cq_lock:default_tx_cq_lock;
+        CMI_GNI_LOCK(lock)
+#if REMOTE_EVENT
+        if( request_msg->seq_id == 0)
+        {
+            pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+            int sts = GNI_EpSetEventData(ep_hndl_array[inst_id], inst_id, ACK_EVENT(request_msg->ack_index));
+            GNI_RC_CHECK("GNI_EpSetEventData", sts);
+        }
+#endif
+
+#if CMK_WITH_STATS
+        RDMA_TRY_SEND(pd->type)
+#endif
+        if(pd->type == GNI_POST_RDMA_GET) 
+        {
+            status = GNI_PostRdma(ep_hndl_array[inst_id], pd);
+        }
+        else
+        {
+            status = GNI_PostFma(ep_hndl_array[inst_id],  pd);
+        }
+        CMI_GNI_UNLOCK(lock)
+
+        if(status == GNI_RC_SUCCESS )
+        {
+#if CMI_EXERT_RECV_RDMA_CAP
+            RDMA_pending++;
+#endif
+            if(pd->cqwrite_value == 0)
+            {
+#if MACHINE_DEBUG_LOG
+                buffered_recv_msg += register_size;
+                MACHSTATE4(8, "GO request from %d (%d,%d, %d)\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size); 
+#endif
+                IncreaseMsgInRecv(msg_data);
+#if CMK_SMP_TRACE_COMMTHREAD 
+                pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+#endif
+            }
+#if  CMK_WITH_STATS
+            pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+            RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
+        }
+    }else if (status != GNI_RC_SUCCESS)
+    {
+        SetMemHndlZero((pd->local_mem_hndl));
+    }
+        if(status == GNI_RC_ERROR_RESOURCE|| status == GNI_RC_ERROR_NOMEM || tag != LMSG_OOB_INIT_TAG)
+    {
+#if REMOTE_EVENT
+        bufferRdmaMsg(bufferRdmaQueue, inst_id, pd, request_msg->ack_index); 
+#else
+        bufferRdmaMsg(bufferRdmaQueue, inst_id, pd, -1); 
+#endif
+    }else if (status != GNI_RC_SUCCESS) {
+        // printf("source: %d pd:(%p,%p)(%p,%p) len:%d local:%x remote:%x\n", (int)inst_id, (pd->local_mem_hndl).qword1, (pd->local_mem_hndl).qword2, (pd->remote_mem_hndl).qword1, (pd->remote_mem_hndl).qword2, pd->length, pd->local_addr, pd->remote_addr);
+        GNI_RC_CHECK("GetLargeAFter posting", status);
+    }
+#else
+    CONTROL_MSG         *request_msg;
+    gni_return_t        status;
+    void                *msg_data;
+    gni_post_descriptor_t *pd;
+    RDMA_REQUEST        *rdma_request_msg;
+    gni_mem_handle_t    msg_mem_hndl;
+    //int source;
+    // initiate a GET to transfer data from the sender side
+    request_msg = (CONTROL_MSG *) header;
+    msg_data = CmiAlloc(request_msg->length);
+    _MEMCHECK(msg_data);
+
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, msg_data, request_msg->length, &msg_mem_hndl, &omdh, NULL,  status)
+
+    if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR) 
+    {
+        GNI_RC_CHECK("Invalid/permission Mem Register in post", status);
+    }
+
+    MallocPostDesc(pd);
+    if(request_msg->length <= LRTS_GNI_RDMA_THRESHOLD) 
+        pd->type            = GNI_POST_FMA_GET;
+    else
+        pd->type            = GNI_POST_RDMA_GET;
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;// |  GNI_CQMODE_REMOTE_EVENT;
+    pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+    pd->length          = ALIGN64(request_msg->length);
+    pd->local_addr      = (uint64_t) msg_data;
+    pd->remote_addr     = request_msg->source_addr;
+    pd->remote_mem_hndl = request_msg->source_mem_hndl;
+    if (tag == LMSG_OOB_INIT_TAG) 
+        pd->src_cq_hndl     = highprior_rdma_tx_cqh;
+    else
+    {
+#if MULTI_THREAD_SEND
+        pd->src_cq_hndl     = rdma_tx_cqh;
+#else
+        pd->src_cq_hndl     = 0;
+#endif
+    }
+    pd->rdma_mode       = 0;
+    pd->amo_cmd         = 0;
+
+    //memory registration successful
+    if(status == GNI_RC_SUCCESS)
+    {
+        pd->local_mem_hndl  = msg_mem_hndl;
+       
+        if(pd->type == GNI_POST_RDMA_GET) 
+        {
+            CMI_GNI_LOCK(rdma_tx_cq_lock)
+            status = GNI_PostRdma(ep_hndl_array[inst_id], pd);
+            CMI_GNI_UNLOCK(rdma_tx_cq_lock)
+        }
+        else
+        {
+            CMI_GNI_LOCK(default_tx_cq_lock)
+            status = GNI_PostFma(ep_hndl_array[inst_id],  pd);
+            CMI_GNI_UNLOCK(default_tx_cq_lock)
+        }
+
+    }else
+    {
+        SetMemHndlZero(pd->local_mem_hndl);
+    }
+    if(status == GNI_RC_ERROR_RESOURCE|| status == GNI_RC_ERROR_NOMEM )
+    {
+        MallocRdmaRequest(rdma_request_msg);
+        rdma_request_msg->next = 0;
+        rdma_request_msg->destNode = inst_id;
+        rdma_request_msg->pd = pd;
+        PCQueuePush(sendRdmaBuf, (char*)rdma_request_msg);
+    }else {
+        GNI_RC_CHECK("AFter posting", status);
+    }
+#endif
+}
+
+#if CQWRITE
+static void PumpCqWriteTransactions()
+{
+
+    gni_cq_entry_t          ev;
+    gni_return_t            status;
+    void                    *msg;  
+    int                     msg_size;
+    while(1) {
+        //CMI_GNI_LOCK(my_cq_lock) 
+        status = GNI_CqGetEvent(rdma_rx_cqh, &ev);
+        //CMI_GNI_UNLOCK(my_cq_lock)
+        if(status != GNI_RC_SUCCESS) break;
+        msg = (void*) ( GNI_CQ_GET_DATA(ev) & 0xFFFFFFFFFFFFL);
+#if CMK_PERSISTENT_COMM_PUT
+#if PRINT_SYH
+        printf(" %d CQ write event %p\n", myrank, msg);
+#endif
+        if (!IsMemHndlZero(MEMHFIELD(msg))) {
+#if PRINT_SYH
+            printf(" %d Persistent CQ write event %p\n", myrank, msg);
+#endif
+            CmiReference(msg);
+            msg_size = CmiGetMsgSize(msg);
+            CMI_CHECK_CHECKSUM(msg, msg_size);
+            handleOneRecvedMsg(msg_size, msg); 
+            continue;
+        }
+#endif
+#if ! USE_LRTS_MEMPOOL
+       // MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
+#else
+        DecreaseMsgInSend(msg);
+#endif
+        if(NoMsgInSend(msg))
+            buffered_send_msg -= GetMempoolsize(msg);
+        CmiFree(msg);
+    };
+    if(status == GNI_RC_ERROR_RESOURCE)
+    {
+        GNI_RC_CHECK("rdma_rx_cq full too many ack", status);
+    }
+}
+#endif
+
+#if REMOTE_EVENT
+static void PumpRemoteTransactions(gni_cq_handle_t rx_cqh)
+{
+    gni_cq_entry_t          ev;
+    gni_return_t            status;
+    void                    *msg;   
+    int                     inst_id, index, type, size;
+
+#if CMI_PUMPREMOTETRANSACTIONS_CAP
+    int                     pump_count = 0;
+#endif
+    while(1) {
+#if CMI_PUMPREMOTETRANSACTIONS_CAP
+        if (pump_count > PumpRemoteTransactions_cap) break;
+#endif
+        CMI_GNI_LOCK(global_gni_lock)
+//        CMI_GNI_LOCK(rdma_tx_cq_lock)
+        status = GNI_CqGetEvent(rx_cqh, &ev);
+//        CMI_GNI_UNLOCK(rdma_tx_cq_lock)
+        CMI_GNI_UNLOCK(global_gni_lock)
+
+        if(status != GNI_RC_SUCCESS) break;
+
+#if CMI_PUMPREMOTETRANSACTIONS_CAP
+        pump_count ++;
+#endif
+
+        inst_id = GNI_CQ_GET_INST_ID(ev);
+        index = GET_INDEX(inst_id);
+        type = GET_TYPE(inst_id);
+        switch (type) {
+        case 0:    // ACK
+            CmiAssert(index>=0 && index<ackPool.size);
+            CMI_GNI_LOCK(ackPool.lock);
+            //CmiAssert(GetIndexType(ackPool, index) == 1);
+            msg = GetIndexAddress(ackPool, index);
+            CMI_GNI_UNLOCK(ackPool.lock);
+#if PRINT_SYH
+            MACHSTATE4(8,"[%d] PumpRemoteTransactions: ack: %lld index: %d type: %d.\n", myrank, msg, index, type);
+#endif
+#if ! USE_LRTS_MEMPOOL
+           // MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
+#else
+            DecreaseMsgInSend(msg);
+#endif
+            if(NoMsgInSend(msg))
+                buffered_send_msg -= GetMempoolsize(msg);
+            CmiFree(msg);
+            IndexPool_freeslot(&ackPool, index);
+#if CMI_EXERT_SEND_LARGE_CAP
+            SEND_large_pending--;
+#endif
+            break;
+#if CMK_PERSISTENT_COMM_PUT
+        case 1:  {    // PERSISTENT
+            CmiLock(persistPool.lock);
+            CmiAssert(GetIndexType(persistPool, index) == 2);
+            PersistentReceivesTable *slot = GetIndexAddress(persistPool, index);
+            CmiUnlock(persistPool.lock);
+            START_EVENT();
+            msg = slot->destBuf[slot->addrIndex].destAddress;
+            size = CmiGetMsgSize(msg);
+            CmiReference(msg);
+            CMI_CHECK_CHECKSUM(msg, size);
+            TRACE_COMM_CREATION(EVENT_TIME(), msg);
+            handleOneRecvedMsg(size, msg); 
+            break;
+            }
+#endif
+        default:
+            fprintf(stderr, "[%d] PumpRemoteTransactions: unknown type: %d\n", myrank, type);
+            CmiAbort("PumpRemoteTransactions: unknown type");
+        }
+    }
+    if(status == GNI_RC_ERROR_RESOURCE)
+    {
+        GNI_RC_CHECK("rdma_rx_cq full too many ack", status);
+    }
+}
+#endif
+
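+// Process local completions on a TX completion queue. For finished GETs the
+// receiver delivers the assembled message and sends back an ACK (or a BIG_MSG
+// control message requesting the next segment); for finished PUTs the sender frees
+// the source buffer or notifies CmiDirect completion.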
+static void PumpLocalTransactions(gni_cq_handle_t my_tx_cqh, CmiNodeLock my_cq_lock)
+{
+    gni_cq_entry_t          ev;
+    gni_return_t            status;
+    uint64_t                type, inst_id;
+    gni_post_descriptor_t   *tmp_pd;
+    MSG_LIST                *ptr;
+    CONTROL_MSG             *ack_msg_tmp;
+    ACK_MSG                 *ack_msg;
+    uint8_t                 msg_tag;
+#if CMK_DIRECT
+    CMK_DIRECT_HEADER       *cmk_direct_done_msg;
+#endif
+    SMSG_QUEUE         *queue = &smsg_queue;
+#if CMI_PUMPLOCALTRANSACTIONS_CAP
+    int         pump_count = 0;
+    while(pump_count < PumpLocalTransactions_cap) {
+        pump_count++;
+#else
+    while(1) {
+#endif
+        CMI_GNI_LOCK(my_cq_lock) 
+        status = GNI_CqGetEvent(my_tx_cqh, &ev);
+        CMI_GNI_UNLOCK(my_cq_lock)
+        if(status != GNI_RC_SUCCESS) break;
+        
+        type = GNI_CQ_GET_TYPE(ev);
+        if (type == GNI_CQ_EVENT_TYPE_POST)
+        {
+
+#if CMI_EXERT_RECV_RDMA_CAP
+            if(RDMA_pending <=0) CmiAbort(" pending error\n");
+            RDMA_pending--;
+#endif
+            inst_id     = GNI_CQ_GET_INST_ID(ev);
+#if PRINT_SYH
+            printf("[%d] LocalTransactions localdone=%d\n", myrank,  lrts_local_done_msg);
+#endif
+            CMI_GNI_LOCK(my_cq_lock)
+            status = GNI_GetCompleted(my_tx_cqh, ev, &tmp_pd);
+            CMI_GNI_UNLOCK(my_cq_lock)
+
+            switch (tmp_pd->type) {
+#if CMK_PERSISTENT_COMM_PUT  || CMK_DIRECT
+            case GNI_POST_RDMA_PUT:
+#if CMK_PERSISTENT_COMM_PUT && ! USE_LRTS_MEMPOOL
+                MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &tmp_pd->local_mem_hndl, &omdh, tmp_pd->length);
+#endif
+            case GNI_POST_FMA_PUT:
+                if(tmp_pd->amo_cmd == 1) {
+#if CMK_DIRECT
+                    // sender ACKs the receiver to signal that the direct put is done
+                    cmk_direct_done_msg = (CMK_DIRECT_HEADER*) malloc(sizeof(CMK_DIRECT_HEADER));
+                    cmk_direct_done_msg->handler_addr = tmp_pd->first_operand;
+                    msg_tag = DIRECT_PUT_DONE_TAG;
+#endif
+                }
+                else {
+                    CmiFree((void *)tmp_pd->local_addr);
+#if REMOTE_EVENT
+                    FreePostDesc(tmp_pd);
+                    continue;
+#elif CQWRITE
+                    sendCqWrite(inst_id, tmp_pd->remote_addr, tmp_pd->remote_mem_hndl);
+                    FreePostDesc(tmp_pd);
+                    continue;
+#else
+                    MallocControlMsg(ack_msg_tmp);
+                    ack_msg_tmp->source_addr = tmp_pd->remote_addr;
+                    ack_msg_tmp->source_mem_hndl    = tmp_pd->remote_mem_hndl;
+                    ack_msg_tmp->length  = tmp_pd->length;
+                    msg_tag = PUT_DONE_TAG;
+#endif
+                }
+                break;
+#endif
+            case GNI_POST_RDMA_GET:
+            case GNI_POST_FMA_GET:  {
+                MACHSTATE2(8, "PumpLocal Get done %lld=>%lld\n", tmp_pd->local_addr, tmp_pd->remote_addr);
+#if  ! USE_LRTS_MEMPOOL
+                MallocControlMsg(ack_msg_tmp);
+                ack_msg_tmp->source_addr = tmp_pd->remote_addr;
+                ack_msg_tmp->source_mem_hndl    = tmp_pd->remote_mem_hndl;
+                MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &tmp_pd->local_mem_hndl, &omdh, tmp_pd->length)
+                msg_tag = ACK_TAG;  
+#else
+#if CMK_WITH_STATS
+                RDMA_TRANS_DONE(tmp_pd->sync_flag_value/1000000.0)
+#endif
+                int seq_id = tmp_pd->cqwrite_value;
+                if(seq_id > 0)      // BIG_MSG
+                {
+                    MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &tmp_pd->local_mem_hndl, &omdh, tmp_pd->length);
+                    MallocControlMsg(ack_msg_tmp);
+                    ack_msg_tmp->source_addr = tmp_pd->remote_addr;
+                    ack_msg_tmp->source_mem_hndl    = tmp_pd->remote_mem_hndl;
+                    ack_msg_tmp->seq_id = seq_id;
+                    ack_msg_tmp->dest_addr = tmp_pd->local_addr - ONE_SEG*(ack_msg_tmp->seq_id-1);
+                    ack_msg_tmp->source_addr -= ONE_SEG*(ack_msg_tmp->seq_id-1);
+                    ack_msg_tmp->length = tmp_pd->length;
+                    ack_msg_tmp->total_length = tmp_pd->first_operand;     // total size
+                    msg_tag = BIG_MSG_TAG; 
+                } 
+                else
+                {
+                    if(seq_id < 0)
+                        CmiReference((void*)tmp_pd->local_addr);
+                    msg_tag = ACK_TAG; 
+#if  !REMOTE_EVENT && !CQWRITE
+                    MallocAckMsg(ack_msg);
+                    ack_msg->source_addr = tmp_pd->remote_addr;
+#endif
+                }
+#endif
+                break;
+            }
+            case  GNI_POST_CQWRITE:
+                   FreePostDesc(tmp_pd);
+                   continue;
+            default:
+                CmiPrintf("type=%d\n", tmp_pd->type);
+                CmiAbort("PumpLocalTransactions: unknown type!");
+            }      /* end of switch */
+
+#if CMK_DIRECT
+            if (tmp_pd->amo_cmd == 1) {
+                status = send_smsg_message(queue, inst_id, cmk_direct_done_msg, sizeof(CMK_DIRECT_HEADER), msg_tag, 0, NULL); 
+                if (status == GNI_RC_SUCCESS) free(cmk_direct_done_msg); 
+            }
+            else
+#endif
+            if (msg_tag == ACK_TAG) {
+#if !REMOTE_EVENT
+#if   !CQWRITE
+                status = send_smsg_message(queue, inst_id, ack_msg, ACK_MSG_SIZE, msg_tag, 0, NULL); 
+                if (status == GNI_RC_SUCCESS) FreeAckMsg(ack_msg);
+#else
+                sendCqWrite(inst_id, tmp_pd->remote_addr, tmp_pd->remote_mem_hndl); 
+#endif
+#endif
+            }
+            else {
+                status = send_smsg_message(queue, inst_id, ack_msg_tmp, CONTROL_MSG_SIZE, msg_tag, 0, NULL); 
+                if (status == GNI_RC_SUCCESS) FreeControlMsg(ack_msg_tmp);
+            }
+#if CMK_PERSISTENT_COMM_PUT
+            if (tmp_pd->type == GNI_POST_RDMA_GET || tmp_pd->type == GNI_POST_FMA_GET)
+#endif
+            {
+                if( msg_tag == ACK_TAG){    // msg fits in the mempool
+#if PRINT_SYH
+                    printf("PumpLocalTransactions: Normal msg transaction PE:%d==>%d\n", myrank, inst_id);
+#endif
+                    TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_addr/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (void*)tmp_pd->local_addr); 
+                    TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_value/1000000.0), (double)((tmp_pd->sync_flag_value+1)/1000000.0), (double)((tmp_pd->sync_flag_value+1)/1000000.0), (void*)tmp_pd->local_addr); 
+
+                    START_EVENT();
+                    //CmiAssert(SIZEFIELD((void*)(tmp_pd->local_addr)) <= tmp_pd->length);
+                    DecreaseMsgInRecv((void*)tmp_pd->local_addr);
+#if MACHINE_DEBUG_LOG
+                    if(NoMsgInRecv((void*)(tmp_pd->local_addr)))
+                        buffered_recv_msg -= GetMempoolsize((void*)(tmp_pd->local_addr));
+                    MACHSTATE5(8, "GO Recv done ack send from %d (%d,%d, %d) tag=%d\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size, msg_tag); 
+#endif
+                    TRACE_COMM_CREATION(EVENT_TIME(), (void*)tmp_pd->local_addr);
+                    CMI_CHECK_CHECKSUM((void*)tmp_pd->local_addr, tmp_pd->length);
+                    handleOneRecvedMsg(tmp_pd->length, (void*)tmp_pd->local_addr); 
+                }else if(msg_tag == BIG_MSG_TAG){
+                    void *msg = (char*)tmp_pd->local_addr-(tmp_pd->cqwrite_value-1)*ONE_SEG;
+                    CmiSetMsgSeq(msg, CmiGetMsgSeq(msg)+1);
+                    if (tmp_pd->first_operand <= ONE_SEG*CmiGetMsgSeq(msg)) {
+                        START_EVENT();
+#if PRINT_SYH
+                        printf("Pipeline msg done [%d]\n", myrank);
+#endif
+#if     CMK_SMP_TRACE_COMMTHREAD
+                        if( tmp_pd->cqwrite_value == 1)
+                            TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_addr/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (double)((tmp_pd->sync_flag_addr+2)/1000000.0), (void*)tmp_pd->local_addr); 
+#endif
+                        TRACE_COMM_CREATION(EVENT_TIME(), msg);
+                        CMI_CHECK_CHECKSUM(msg, tmp_pd->first_operand);
+                        handleOneRecvedMsg(tmp_pd->first_operand, msg); 
+                    }
+                }
+            }
+            FreePostDesc(tmp_pd);
+        }
+    } //end while
+    if(status == GNI_RC_ERROR_RESOURCE)
+    {
+        printf("charm> Please use +useSendQueue 204800 in your command line, if the error comes again, increase this number\n");  
+        GNI_RC_CHECK("Smsg_tx_cq full", status);
+    }
+}
+
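+// Drain the buffered RDMA request queue: register local memory if still needed and
+// post the FMA/RDMA descriptor; a request that cannot be posted is pushed back and
+// retried on a later pass.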
+static void  SendRdmaMsg( BufferList sendqueue)
+{
+    gni_return_t            status = GNI_RC_SUCCESS;
+    gni_mem_handle_t        msg_mem_hndl;
+    RDMA_REQUEST            *ptr = 0, *tmp_ptr;
+    RDMA_REQUEST            *pre = 0;
+    uint64_t                register_size = 0;
+    void                    *msg;
+    int                     i;
+
+    int len = PCQueueLength(sendqueue);
+    for (i=0; i<len; i++)
+    {
+#if CMI_EXERT_RECV_RDMA_CAP
+        if( RDMA_pending >= RDMA_cap) break;
+#endif
+        CMI_PCQUEUEPOP_LOCK( sendqueue)
+        ptr = (RDMA_REQUEST*)PCQueuePop(sendqueue);
+        CMI_PCQUEUEPOP_UNLOCK( sendqueue)
+        if (ptr == NULL) break;
+        
+        gni_post_descriptor_t *pd = ptr->pd;
+        
+        msg = (void*)(pd->local_addr);
+        status = registerMessage(msg, pd->length, pd->cqwrite_value, &pd->local_mem_hndl);
+        register_size = 0;
+        if(pd->cqwrite_value == 0) {
+            if(NoMsgInRecv(msg))
+                register_size = GetMempoolsize(msg);
+        }
+
+        if(status == GNI_RC_SUCCESS)        //mem register good
+        {
+            int destNode = ptr->destNode;
+            CmiNodeLock lock = (pd->type == GNI_POST_RDMA_GET || pd->type == GNI_POST_RDMA_PUT) ? rdma_tx_cq_lock:default_tx_cq_lock;
+            CMI_GNI_LOCK(lock);
+#if REMOTE_EVENT
+            if( pd->cqwrite_value == 0 || pd->cqwrite_value == -1) {
+                pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+                int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, ACK_EVENT(ptr->ack_index));
+                GNI_RC_CHECK("GNI_EpSetEventData", sts);
+            }
+#if CMK_PERSISTENT_COMM_PUT
+            else if (pd->cqwrite_value == PERSIST_SEQ) {
+                pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+                int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, PERSIST_EVENT(ptr->ack_index));
+                GNI_RC_CHECK("GNI_EpSetEventData", sts);
+            }
+#endif
+#if CMK_DIRECT
+            else if (pd->cqwrite_value == DIRECT_SEQ) {
+                pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+                int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, DIRECT_EVENT(ptr->ack_index));
+                GNI_RC_CHECK("GNI_EpSetEventData", sts);
+            }
+#endif
+
+#endif
+#if CMK_WITH_STATS
+            RDMA_TRY_SEND(pd->type)
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+            if(IS_PUT(pd->type))
+            {
+                 START_EVENT();
+                 TRACE_COMM_CREATION(EVENT_TIME(), (void*)pd->local_addr);//based on assumption, post always succeeds on first try
+            }
+#endif
+            if(pd->type == GNI_POST_RDMA_GET || pd->type == GNI_POST_RDMA_PUT) 
+            {
+                status = GNI_PostRdma(ep_hndl_array[destNode], pd);
+            }
+            else
+            {
+                status = GNI_PostFma(ep_hndl_array[destNode],  pd);
+            }
+            CMI_GNI_UNLOCK(lock);
+            
+            if(status == GNI_RC_SUCCESS)    //post good
+            {
+                MACHSTATE4(8, "post noempty-rdma  %d (%lld==%lld,%d) \n", ptr->destNode, pd->local_addr, pd->remote_addr,  register_memory_size); 
+#if CMI_EXERT_RECV_RDMA_CAP
+                RDMA_pending ++;
+#endif
+                if(pd->cqwrite_value <= 0)
+                {
+#if CMK_SMP_TRACE_COMMTHREAD 
+                    pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+#endif
+                    IncreaseMsgInRecv(((void*)(pd->local_addr)));
+                }
+#if  CMK_WITH_STATS
+                pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+                RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
+#if MACHINE_DEBUG_LOG
+                buffered_recv_msg += register_size;
+                MACHSTATE(8, "GO request from buffered\n"); 
+#endif
+#if PRINT_SYH
+                printf("[%d] SendRdmaMsg: post succeed. seqno: %d\n", myrank, pd->cqwrite_value);
+#endif
+                FreeRdmaRequest(ptr);
+            }else           // cannot post
+            {
+                PCQueuePush(sendRdmaBuf, (char*)ptr);
+#if PRINT_SYH
+                printf("[%d] SendRdmaMsg: post failed. seqno: %x dest: %d local mhdl: %lld %lld remote mhdl: %lld %lld connect: %d\n", myrank, pd->cqwrite_value, destNode, pd->local_mem_hndl.qword1, pd->local_mem_hndl.qword2, pd->remote_mem_hndl.qword1, pd->remote_mem_hndl.qword2, smsg_connected_flag[destNode]);
+#endif
+                break;
+            }
+        } else          //memory registration fails
+        {
+            PCQueuePush(sendqueue, (char*)ptr);
+        }
+    } //end while
+}
+
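+// Retry a single buffered SMSG: dispatch on the original tag and free the payload
+// or control message once the send succeeds.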
+static 
+inline gni_return_t _sendOneBufferedSmsg(SMSG_QUEUE *queue, MSG_LIST *ptr)
+{
+    CONTROL_MSG         *control_msg_tmp;
+    gni_return_t        status = GNI_RC_ERROR_RESOURCE;
+
+    MACHSTATE5(8, "noempty-smsg  %d (%d,%d,%d) tag=%d \n", ptr->destNode, buffered_send_msg, buffered_recv_msg, register_memory_size, ptr->tag); 
+    if (useDynamicSMSG && smsg_connected_flag[ptr->destNode] != 2) {   
+            /* connection does not exist yet */
+#if CMK_SMP
+            /* in the non-SMP case, connect_to is issued from send_smsg_message; only SMP needs it here */
+        if (smsg_connected_flag[ptr->destNode] == 0)
+            connect_to(ptr->destNode); 
+#endif
+    }
+    else
+    switch(ptr->tag)
+    {
+    case SMALL_DATA_TAG:
+        status = send_smsg_message(queue, ptr->destNode,  ptr->msg, ptr->size, ptr->tag, 1, ptr);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            CmiFree(ptr->msg);
+        }
+        break;
+    case LMSG_PERSISTENT_INIT_TAG:
+    case LMSG_INIT_TAG:
+    case LMSG_OOB_INIT_TAG:
+        control_msg_tmp = (CONTROL_MSG*)ptr->msg;
+        status = send_large_messages(queue, ptr->destNode, control_msg_tmp, 1, ptr, ptr->tag);
+        break;
+#if !REMOTE_EVENT && !CQWRITE
+    case ACK_TAG:
+        status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
+        if(status == GNI_RC_SUCCESS) FreeAckMsg((ACK_MSG*)ptr->msg);
+        break;
+#endif
+    case BIG_MSG_TAG:
+        status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            FreeControlMsg((CONTROL_MSG*)ptr->msg);
+        }
+        break;
+#if CMK_PERSISTENT_COMM_PUT && !REMOTE_EVENT && !CQWRITE 
+    case PUT_DONE_TAG:
+        status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            FreeControlMsg((CONTROL_MSG*)ptr->msg);
+        }
+        break;
+#endif
+#if CMK_DIRECT
+    case DIRECT_PUT_DONE_TAG:
+        status = send_smsg_message(queue, ptr->destNode, ptr->msg, sizeof(CMK_DIRECT_HEADER), ptr->tag, 1, ptr);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            free((CMK_DIRECT_HEADER*)ptr->msg);
+        }
+        break;
+#endif
+    default:
+        printf("Weird tag\n");
+        CmiAbort("should not happen\n");
+    }       // end switch
+    return status;
+}
+
+// return 1 if all messages are sent
+
+#if ONE_SEND_QUEUE
+
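+/* ONE_SEND_QUEUE variant: all buffered small messages share a single queue;
+ * a destination that returned GNI_RC_ERROR_RESOURCE is marked in destpe_avail
+ * and skipped for the rest of this pass. */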
+static int SendBufferMsg(SMSG_QUEUE *queue, SMSG_QUEUE *prio_queue)
+{
+    MSG_LIST            *ptr, *tmp_ptr, *pre=0, *current_head;
+    CONTROL_MSG         *control_msg_tmp;
+    gni_return_t        status;
+    int                 done = 1;
+    uint64_t            register_size;
+    void                *register_addr;
+    int                 index_previous = -1;
+#if     CMI_SENDBUFFERSMSG_CAP
+    int                 sent_length = 0;
+#endif
+    int          index = 0;
+    memset(destpe_avail, 0, mysize * sizeof(char));
+    for (index=0; index<1; index++)
+    {
+        int i, len = PCQueueLength(queue->sendMsgBuf);
+        for (i=0; i<len; i++) 
+        {
+            CMI_PCQUEUEPOP_LOCK(queue->sendMsgBuf)
+            ptr = (MSG_LIST*)PCQueuePop(queue->sendMsgBuf);
+            CMI_PCQUEUEPOP_UNLOCK(queue->sendMsgBuf)
+            if(ptr == NULL) break;
+            if (destpe_avail[ptr->destNode] == 1) {       /* can't send to this pe */
+                PCQueuePush(queue->sendMsgBuf, (char*)ptr);
+                continue;
+            }
+            status = _sendOneBufferedSmsg(queue, ptr);
+#if CMI_SENDBUFFERSMSG_CAP
+            sent_length++;
+#endif
+            if(status == GNI_RC_SUCCESS)
+            {
+#if PRINT_SYH
+                buffered_smsg_counter--;
+                printf("[%d==>%d] buffered smsg sending done\n", myrank, ptr->destNode);
+#endif
+                FreeMsgList(ptr);
+            }else {
+                PCQueuePush(queue->sendMsgBuf, (char*)ptr);
+                done = 0;
+                if(status == GNI_RC_ERROR_RESOURCE)
+                {
+                    destpe_avail[ptr->destNode] = 1;
+                }
+            } 
+        } //end for i
+    }   // end polling over all cores
+    return done;
+}
+
+#else   /*  ! ONE_SEND_QUEUE  */
+
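+/* Per-destination variant: each destination has its own buffered-send queue.
+ * With SMP_LOCKS only queues registered in nonEmptyQueues are visited;
+ * otherwise all destinations are scanned round-robin from a static index. */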
+static int SendBufferMsg(SMSG_QUEUE *queue, SMSG_QUEUE *prio_queue)
+{
+    MSG_LIST            *ptr;
+    gni_return_t        status;
+    int                 done = 1;
+#if     CMI_SENDBUFFERSMSG_CAP
+    int                 sent_length = 0;
+#endif
+    int idx;
+#if SMP_LOCKS
+    int          index = -1;
+    int nonempty = PCQueueLength(queue->nonEmptyQueues);
+    for(idx =0; idx<nonempty; idx++) 
+    {
+        index++;  if (index >= nonempty) index = 0;
+#if CMI_SENDBUFFERSMSG_CAP
+        if ( sent_length >= SendBufferMsg_cap) { done = 0; return done;}
+#endif
+        CMI_PCQUEUEPOP_LOCK(queue->nonEmptyQueues)
+        MSG_LIST_INDEX *current_list = (MSG_LIST_INDEX *)PCQueuePop(queue->nonEmptyQueues);
+        CMI_PCQUEUEPOP_UNLOCK(queue->nonEmptyQueues)
+        if(current_list == NULL) break; 
+        if (prio_queue && PCQueueLength(prio_queue->smsg_msglist_index[current_list->destpe].sendSmsgBuf) != 0) {
+            PCQueuePush(queue->nonEmptyQueues, (char*)current_list);
+            continue;
+        }
+        PCQueue current_queue= current_list->sendSmsgBuf;
+        CmiLock(current_list->lock);
+        int i, len = PCQueueLength(current_queue);
+        current_list->pushed = 0;
+        CmiUnlock(current_list->lock);
+#else      /* ! SMP_LOCKS */
+    static int          index = -1;
+    for(idx =0; idx<mysize; idx++) 
+    {
+        index++;  if (index == mysize) index = 0;
+#if CMI_SENDBUFFERSMSG_CAP
+        if ( sent_length >= SendBufferMsg_cap) { done = 0; return done;}
+#endif
+        if (prio_queue && PCQueueLength(prio_queue->smsg_msglist_index[index].sendSmsgBuf) != 0) continue;             // check urgent queue
+        //if (index == myrank) continue;
+        PCQueue current_queue = queue->smsg_msglist_index[index].sendSmsgBuf;
+        int i, len = PCQueueLength(current_queue);
+#endif
+        for (i=0; i<len; i++)  {
+            CMI_PCQUEUEPOP_LOCK(current_queue)
+            ptr = (MSG_LIST*)PCQueuePop(current_queue);
+            CMI_PCQUEUEPOP_UNLOCK(current_queue)
+            if (ptr == 0) break;
+
+            status = _sendOneBufferedSmsg(queue, ptr);
+#if CMI_SENDBUFFERSMSG_CAP
+            sent_length++;
+#endif
+            if(status == GNI_RC_SUCCESS)
+            {
+#if PRINT_SYH
+                buffered_smsg_counter--;
+                printf("[%d==>%d] buffered smsg sending done\n", myrank, ptr->destNode);
+#endif
+                FreeMsgList(ptr);
+            }else {
+                PCQueuePush(current_queue, (char*)ptr);
+                done = 0;
+                if(status == GNI_RC_ERROR_RESOURCE)
+                {
+                    break;
+                }
+            } 
+        } //end for i
+#if SMP_LOCKS
+        CmiLock(current_list->lock);
+        if(!PCQueueEmpty(current_queue) && current_list->pushed == 0)
+        {
+            current_list->pushed = 1;
+            PCQueuePush(queue->nonEmptyQueues, (char*)current_list);
+        }
+        CmiUnlock(current_list->lock); 
+#endif
+    }   // end polling over all cores
+    return done;
+}
+
+#endif
+
+static void ProcessDeadlock();
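+/* Progress engine: interleaves pumping dynamic-SMSG connections, incoming
+ * SMSG and RDMA events, and local/remote transactions with flushing the
+ * buffered send queues; each phase is bracketed by trace events when
+ * CMK_SMP_TRACE_COMMTHREAD is enabled. */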
+void LrtsAdvanceCommunication(int whileidle)
+{
+    static int count = 0;
+    /*  Receive Msg first */
+#if CMK_SMP_TRACE_COMMTHREAD
+    double startT, endT;
+#endif
+    if (useDynamicSMSG && whileidle)
+    {
+#if CMK_SMP_TRACE_COMMTHREAD
+        startT = CmiWallTimer();
+#endif
+        STATS_PUMPDATAGRAMCONNECTION_TIME(PumpDatagramConnection());
+#if CMK_SMP_TRACE_COMMTHREAD
+        endT = CmiWallTimer();
+        if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SetupConnect, startT, endT);
+#endif
+    }
+
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+    // Receiving small messages and persistent
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+    STATS_PUMPNETWORK_TIME(PumpNetworkSmsg());
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpSmsg, startT, endT);
+#endif
+
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+    
+    ///* Send buffered Message */
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+#if CMK_USE_OOB
+    STATS_SEND_SMSGS_TIME(SendBufferMsg(&smsg_queue, &smsg_oob_queue));
+#else
+    STATS_SEND_SMSGS_TIME(SendBufferMsg(&smsg_queue, NULL));
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SendBufferSmsg, startT, endT);
+#endif
+
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+
+    //Pump Get messages or PUT messages
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+    PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);
+#if MULTI_THREAD_SEND
+    STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(PumpLocalTransactions(rdma_tx_cqh,  rdma_tx_cq_lock));
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpTransaction, startT, endT);
+#endif
+    
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+    //Pump Remote event
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+#if CQWRITE
+    PumpCqWriteTransactions();
+#endif
+#if REMOTE_EVENT
+    STATS_PUMPREMOTETRANSACTIONS_TIME(PumpRemoteTransactions(rdma_rx_cqh));
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpRdmaTransaction, startT, endT);
+#endif
+
+    SEND_OOB_SMSG(smsg_oob_queue)
+    PUMP_REMOTE_HIGHPRIORITY
+    PUMP_LOCAL_HIGHPRIORITY
+    POST_HIGHPRIORITY_RDMA
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+    STATS_SENDRDMAMSG_TIME(SendRdmaMsg(sendRdmaBuf));
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SendFmaRdmaMsg, startT, endT);
+#endif
+
+#if CMK_SMP && ! LARGEPAGE
+    if (_detected_hang)  ProcessDeadlock();
+#endif
+}
+
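+/* Pick the per-mailbox SMSG maximum message size based on the job size; it
+ * can be overridden via the CHARM_UGNI_SMSG_MAX_SIZE environment variable. */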
+static void set_smsg_max()
+{
+    char *env;
+
+    if(mysize <=512)
+    {
+        SMSG_MAX_MSG = 1024;
+    }else if (mysize <= 4096)
+    {
+        SMSG_MAX_MSG = 1024;
+    }else if (mysize <= 16384)
+    {
+        SMSG_MAX_MSG = 512;
+    }else {
+        SMSG_MAX_MSG = 256;
+    }
+
+    env = getenv("CHARM_UGNI_SMSG_MAX_SIZE");
+    if (env) SMSG_MAX_MSG = atoi(env);
+    CmiAssert(SMSG_MAX_MSG > 0);
+}    
+
+/* useDynamicSMSG */
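+/* Lazy SMSG setup: allocate per-peer attribute tables, register a shared
+ * mailbox pool, and post an unbound datagram so connections are established
+ * only when a peer is first contacted. */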
+static void _init_dynamic_smsg()
+{
+    gni_return_t status;
+    uint32_t     vmdh_index = -1;
+    int i;
+
+    smsg_attr_vector_local = (gni_smsg_attr_t**)malloc(mysize * sizeof(gni_smsg_attr_t*));
+    smsg_attr_vector_remote = (gni_smsg_attr_t**)malloc(mysize * sizeof(gni_smsg_attr_t*));
+    smsg_connected_flag = (int*)malloc(sizeof(int)*mysize);
+    for(i=0; i<mysize; i++) {
+        smsg_connected_flag[i] = 0;
+        smsg_attr_vector_local[i] = NULL;
+        smsg_attr_vector_remote[i] = NULL;
+    }
+
+    set_smsg_max();
+
+    send_smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+    send_smsg_attr.mbox_maxcredit = SMSG_MAX_CREDIT;
+    send_smsg_attr.msg_maxsize = SMSG_MAX_MSG;
+    status = GNI_SmsgBufferSizeNeeded(&send_smsg_attr, &smsg_memlen);
+    GNI_RC_CHECK("GNI_GNI_MemRegister mem buffer", status);
+
+    mailbox_list = (dynamic_smsg_mailbox_t*)malloc(sizeof(dynamic_smsg_mailbox_t));
+    mailbox_list->size = smsg_memlen*avg_smsg_connection;
+    posix_memalign(&mailbox_list->mailbox_base, 64, mailbox_list->size);
+    bzero(mailbox_list->mailbox_base, mailbox_list->size);
+    mailbox_list->offset = 0;
+    mailbox_list->next = 0;
+    
+    status = GNI_MemRegister(nic_hndl, (uint64_t)(mailbox_list->mailbox_base),
+        mailbox_list->size, smsg_rx_cqh,
+        GNI_MEM_READWRITE,   
+        vmdh_index,
+        &(mailbox_list->mem_hndl));
+    GNI_RC_CHECK("MEMORY registration for smsg", status);
+
+    status = GNI_EpCreate(nic_hndl, default_tx_cqh, &ep_hndl_unbound);
+    GNI_RC_CHECK("Unbound EP", status);
+    
+    alloc_smsg_attr(&send_smsg_attr);
+
+    status = GNI_EpPostDataWId (ep_hndl_unbound, &send_smsg_attr,  SMSG_ATTR_SIZE, &recv_smsg_attr, SMSG_ATTR_SIZE, myrank);
+    GNI_RC_CHECK("post unbound datagram", status);
+
+      /* always pre-connect to proc 0 */
+    //if (myrank != 0) connect_to(0);
+
+    status = GNI_SmsgSetMaxRetrans(nic_hndl, 4096);
+    GNI_RC_CHECK("SmsgSetMaxRetrans Init", status);
+}
+
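+/* Static SMSG setup: register one mailbox per peer up front, exchange the
+ * mailbox addresses and memory handles with an allgather, then initialize the
+ * SMSG channel on every bound endpoint. */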
+static void _init_static_smsg()
+{
+    gni_smsg_attr_t      *smsg_attr;
+    gni_smsg_attr_t      remote_smsg_attr;
+    gni_smsg_attr_t      *smsg_attr_vec;
+    gni_mem_handle_t     my_smsg_mdh_mailbox;
+    int      ret, i;
+    gni_return_t status;
+    uint32_t              vmdh_index = -1;
+    mdh_addr_t            base_infor;
+    mdh_addr_t            *base_addr_vec;
+
+    set_smsg_max();
+    
+    smsg_attr = malloc(mysize * sizeof(gni_smsg_attr_t));
+    
+    smsg_attr[0].msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+    smsg_attr[0].mbox_maxcredit = SMSG_MAX_CREDIT;
+    smsg_attr[0].msg_maxsize = SMSG_MAX_MSG;
+    status = GNI_SmsgBufferSizeNeeded(&smsg_attr[0], &smsg_memlen);
+    GNI_RC_CHECK("GNI_GNI_MemRegister mem buffer", status);
+    ret = posix_memalign(&smsg_mailbox_base, 64, smsg_memlen*(mysize));
+    CmiAssert(ret == 0);
+    bzero(smsg_mailbox_base, smsg_memlen*(mysize));
+    
+    status = GNI_MemRegister(nic_hndl, (uint64_t)smsg_mailbox_base,
+            smsg_memlen*(mysize), smsg_rx_cqh,
+            GNI_MEM_READWRITE,   
+            vmdh_index,
+            &my_smsg_mdh_mailbox);
+    register_memory_size += smsg_memlen*(mysize);
+    GNI_RC_CHECK("GNI_GNI_MemRegister mem buffer", status);
+
+    if (myrank == 0)  printf("Charm++> SMSG memory: %1.1fKB\n", 1.0*smsg_memlen*(mysize)/1024);
+    if (myrank == 0 && register_memory_size>=MAX_REG_MEM ) printf("Charm++> FATAL ERROR: your program risks hanging.\n Please set CHARM_UGNI_MEMPOOL_MAX to a larger value or use dynamic SMSG.\n");
+
+    base_infor.addr =  (uint64_t)smsg_mailbox_base;
+    base_infor.mdh =  my_smsg_mdh_mailbox;
+    base_addr_vec = malloc(mysize * sizeof(mdh_addr_t));
+
+    allgather(&base_infor, base_addr_vec,  sizeof(mdh_addr_t));
+    for(i=0; i<mysize; i++)
+    {
+        if(i==myrank)
+            continue;
+        smsg_attr[i].msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+        smsg_attr[i].mbox_maxcredit = SMSG_MAX_CREDIT;
+        smsg_attr[i].msg_maxsize = SMSG_MAX_MSG;
+        smsg_attr[i].mbox_offset = i*smsg_memlen;
+        smsg_attr[i].buff_size = smsg_memlen;
+        smsg_attr[i].msg_buffer = smsg_mailbox_base ;
+        smsg_attr[i].mem_hndl = my_smsg_mdh_mailbox;
+    }
+
+    for(i=0; i<mysize; i++)
+    {
+        if (myrank == i) continue;
+
+        remote_smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+        remote_smsg_attr.mbox_maxcredit = SMSG_MAX_CREDIT;
+        remote_smsg_attr.msg_maxsize = SMSG_MAX_MSG;
+        remote_smsg_attr.mbox_offset = myrank*smsg_memlen;
+        remote_smsg_attr.buff_size = smsg_memlen;
+        remote_smsg_attr.msg_buffer = (void*)base_addr_vec[i].addr;
+        remote_smsg_attr.mem_hndl = base_addr_vec[i].mdh;
+
+        /* initialize the smsg channel */
+        status = GNI_SmsgInit(ep_hndl_array[i], &smsg_attr[i], &remote_smsg_attr);
+        GNI_RC_CHECK("SMSG Init", status);
+    } //end initialization
+
+    free(base_addr_vec);
+    free(smsg_attr);
+
+    status = GNI_SmsgSetMaxRetrans(nic_hndl, 4096);
+    GNI_RC_CHECK("SmsgSetMaxRetrans Init", status);
+} 
+
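+/* Create the PCQueue(s) that buffer small messages which could not be sent
+ * immediately: a single shared queue with ONE_SEND_QUEUE, otherwise one queue
+ * per destination. */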
+inline
+static void _init_send_queue(SMSG_QUEUE *queue)
+{
+     int i;
+#if ONE_SEND_QUEUE
+     queue->sendMsgBuf = PCQueueCreate();
+     destpe_avail = (char*)malloc(mysize * sizeof(char));
+#else
+     queue->smsg_msglist_index = (MSG_LIST_INDEX*)malloc(mysize*sizeof(MSG_LIST_INDEX));
+#if SMP_LOCKS
+     queue->nonEmptyQueues = PCQueueCreate();
+#endif
+     for(i =0; i<mysize; i++)
+     {
+         queue->smsg_msglist_index[i].sendSmsgBuf = PCQueueCreate();
+#if SMP_LOCKS
+         queue->smsg_msglist_index[i].pushed = 0;
+         queue->smsg_msglist_index[i].lock = CmiCreateLock();
+         queue->smsg_msglist_index[i].destpe = i;
+#endif
+     }
+#endif
+}
+
+inline
+static void _init_smsg()
+{
+    if(mysize > 1) {
+        if (useDynamicSMSG)
+            _init_dynamic_smsg();
+        else
+            _init_static_smsg();
+    }
+
+    _init_send_queue(&smsg_queue);
+#if CMK_USE_OOB
+    _init_send_queue(&smsg_oob_queue);
+#endif
+}
+
+static void _init_static_msgq()
+{
+    gni_return_t status;
+    /* MSGQ is used to send and receive short messages for large jobs (exceeding 200,000 ranks); its performance scales with the node count rather than the rank count */
+    msgq_attrs.max_msg_sz = MSGQ_MAXSIZE;
+    msgq_attrs.smsg_q_sz = 1;
+    msgq_attrs.rcv_pool_sz = 1;
+    msgq_attrs.num_msgq_eps = 2;
+    msgq_attrs.nloc_insts = 8;
+    msgq_attrs.modes = 0;
+    msgq_attrs.rcv_cq_sz = REMOTE_QUEUE_ENTRIES ;
+
+    status = GNI_MsgqInit(nic_hndl, NULL, NULL, NULL, &msgq_attrs, &msgq_handle);
+    GNI_RC_CHECK("MSGQ Init", status);
+
+
+}
+
+
+static CmiUInt8 total_mempool_size = 0;
+static CmiUInt8 total_mempool_calls = 0;
+
+#if USE_LRTS_MEMPOOL
+
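+/* Allocate a new mempool block of at least the default (initial or expansion)
+ * size.  With LARGEPAGE the block comes from hugepages and is registered with
+ * the given CQ immediately; otherwise registration is deferred and the memory
+ * handle is zeroed. */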
+inline
+static void *_alloc_mempool_block(size_t *size, gni_mem_handle_t *mem_hndl, int expand_flag, gni_cq_handle_t cqh)
+{
+    void *pool;
+    int ret;
+    gni_return_t status = GNI_RC_SUCCESS;
+
+    size_t default_size =  expand_flag? _expand_mem : _mempool_size;
+    if (*size < default_size) *size = default_size;
+#if LARGEPAGE
+    // round up to be multiple of _tlbpagesize
+    //*size = (*size + _tlbpagesize - 1)/_tlbpagesize*_tlbpagesize;
+    *size = ALIGNHUGEPAGE(*size);
+#endif
+    total_mempool_size += *size;
+    total_mempool_calls += 1;
+#if   !LARGEPAGE
+    if ((*size > MAX_REG_MEM || *size > MAX_BUFF_SEND) && expand_flag) 
+    {
+        printf("Error: A mempool block with size %lld is allocated, which is greater than the maximum mempool allowed.\n Please increase the max pool size by using +gni-mempool-max or set enviorment variable CHARM_UGNI_MEMPOOL_MAX. (current=%lld, %lld)\n", *size, MAX_REG_MEM, MAX_BUFF_SEND);
+        CmiAbort("alloc_mempool_block");
+    }
+#endif
+#if LARGEPAGE
+    pool = my_get_huge_pages(*size);
+    ret = pool==NULL;
+#else
+    ret = posix_memalign(&pool, ALIGNBUF, *size);
+#endif
+    if (ret != 0) {
+      printf("Charm++> can not allocate memory pool of size %.2fMB. \n", 1.0*(*size)/1024/1024);
+      if (ret == ENOMEM)
+        CmiAbort("alloc_mempool_block: out of memory.");
+      else
+        CmiAbort("alloc_mempool_block: posix_memalign failed");
+    }
+#if LARGEPAGE
+    CmiMemLock();
+    register_count++;
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, pool, *size, mem_hndl, &omdh, cqh, status);
+    CmiMemUnlock();
+    if(status != GNI_RC_SUCCESS) {
+        printf("[%d, %d] memory reigstration %f G (%lld) ask for %lld\n", myrank, CmiMyRank(), register_memory_size/(1024*1024.0*1024),register_count, *size);
+sweep_mempool(CpvAccess(mempool));
+    }
+    GNI_RC_CHECK("MEMORY_REGISTER", status);
+#else
+    SetMemHndlZero((*mem_hndl));
+#endif
+    return pool;
+}
+
+inline
+static void *alloc_mempool_block(size_t *size, gni_mem_handle_t *mem_hndl, int expand_flag)
+{
+    return _alloc_mempool_block(size, mem_hndl, expand_flag, rdma_rx_cqh);
+}
+
+#if CMK_PERSISTENT_COMM_PUT
+inline
+static void *alloc_persistent_mempool_block(size_t *size, gni_mem_handle_t *mem_hndl, int expand_flag)
+{
+    return _alloc_mempool_block(size, mem_hndl, expand_flag, highpriority_rx_cqh);
+}
+#endif
+
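+/* Deregister the block if it was registered, then return the memory to the
+ * system (hugepages or free()). */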
+// ptr is a block head pointer
+void free_mempool_block(void *ptr, gni_mem_handle_t mem_hndl)
+{
+    if(!(IsMemHndlZero(mem_hndl)))
+    {
+        MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &mem_hndl, &omdh, GetSizeFromBlockHeader(ptr));
+    }
+#if LARGEPAGE
+    my_free_huge_pages(ptr, GetSizeFromBlockHeader(ptr));
+#else
+    free(ptr);
+#endif
+}
+#endif
+
+void LrtsPreCommonInit(int everReturn){
+#if USE_LRTS_MEMPOOL
+    CpvInitialize(mempool_type*, mempool);
+    CpvAccess(mempool) = mempool_init(_mempool_size, alloc_mempool_block, free_mempool_block, _mempool_size_limit);
+#if CMK_PERSISTENT_COMM_PUT
+    CpvInitialize(mempool_type*, persistent_mempool);
+    CpvAccess(persistent_mempool) = mempool_init(_mempool_size, alloc_persistent_mempool_block, free_mempool_block, _mempool_size_limit);
+#endif
+    MACHSTATE2(8, "mempool_init %d %p\n", CmiMyRank(), CpvAccess(mempool)) ; 
+#endif
+}
+
+void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
+{
+    register int            i;
+    int                     rc;
+    int                     device_id = 0;
+    unsigned int            remote_addr;
+    gni_cdm_handle_t        cdm_hndl;
+    gni_return_t            status = GNI_RC_SUCCESS;
+    uint32_t                vmdh_index = -1;
+    uint8_t                 ptag;
+    unsigned int            local_addr, *MPID_UGNI_AllAddr;
+    int                     first_spawned;
+    int                     physicalID;
+    char                   *env;
+
+    //void (*local_event_handler)(gni_cq_entry_t *, void *)       = &LocalEventHandle;
+    //void (*remote_smsg_event_handler)(gni_cq_entry_t *, void *) = &RemoteSmsgEventHandle;
+    //void (*remote_bte_event_handler)(gni_cq_entry_t *, void *)  = &RemoteBteEventHandle;
+   
+    status = PMI_Init(&first_spawned);
+    GNI_RC_CHECK("PMI_Init", status);
+
+    status = PMI_Get_size(&mysize);
+    GNI_RC_CHECK("PMI_Getsize", status);
+
+    status = PMI_Get_rank(&myrank);
+    GNI_RC_CHECK("PMI_getrank", status);
+
+    //physicalID = CmiPhysicalNodeID(myrank);
+    
+    //printf("Pysical Node ID:%d for PE:%d\n", physicalID, myrank);
+
+    *myNodeID = myrank;
+    *numNodes = mysize;
+  
+#if MULTI_THREAD_SEND
+    /* currently we only consider the case in which the comm. thread only receives messages */
+    Cmi_smp_mode_setting = COMM_WORK_THREADS_SEND_RECV;
+#endif
+
+#if CMI_EXERT_SEND_LARGE_CAP
+    CmiGetArgInt(*argv,"+useSendLargeCap", &SEND_large_cap);
+#endif
+
+#if CMI_EXERT_RECV_RDMA_CAP 
+    CmiGetArgInt(*argv,"+useRecvRdmaCap", &RDMA_cap);
+#endif
+  
+#if CMI_SENDBUFFERSMSG_CAP
+    CmiGetArgInt(*argv,"+useSendBufferCap", &SendBufferMsg_cap);
+#endif
+
+#if CMI_PUMPNETWORKSMSG_CAP 
+    CmiGetArgInt(*argv,"+usePumpSmsgCap", &PumpNetworkSmsg_cap);
+#endif
+
+    CmiGetArgInt(*argv,"+useRecvQueue", &REMOTE_QUEUE_ENTRIES);
+    
+    env = getenv("CHARM_UGNI_REMOTE_QUEUE_SIZE");
+    if (env) REMOTE_QUEUE_ENTRIES = atoi(env);
+    CmiGetArgInt(*argv,"+useRecvQueue", &REMOTE_QUEUE_ENTRIES);
+
+    env = getenv("CHARM_UGNI_LOCAL_QUEUE_SIZE");
+    if (env) LOCAL_QUEUE_ENTRIES = atoi(env);
+    CmiGetArgInt(*argv,"+useSendQueue", &LOCAL_QUEUE_ENTRIES);
+
+    env = getenv("CHARM_UGNI_DYNAMIC_SMSG");
+    if (env) useDynamicSMSG = 1;
+    if (!useDynamicSMSG)
+      useDynamicSMSG = CmiGetArgFlag(*argv, "+useDynamicSmsg");
+    CmiGetArgIntDesc(*argv, "+smsgConnection", &avg_smsg_connection,"Initial number of SMSGS connection per code");
+    if (avg_smsg_connection>mysize) avg_smsg_connection = mysize;
+    //useStaticMSGQ = CmiGetArgFlag(*argv, "+useStaticMsgQ");
+    
+    if(myrank == 0)
+    {
+        printf("Charm++> Running on Gemini (GNI) with %d processes\n", mysize);
+        printf("Charm++> %s SMSG\n", useDynamicSMSG?"dynamic":"static");
+    }
+#ifdef USE_ONESIDED
+    onesided_init(NULL, &onesided_hnd);
+
+    // this is a GNI test, so use the libonesided bypass functionality
+    onesided_gni_bypass_get_nih(onesided_hnd, &nic_hndl);
+    local_addr = gniGetNicAddress();
+#else
+    ptag = get_ptag();
+    cookie = get_cookie();
+#if 0
+    modes = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT;
+#endif
+    /* create and attach to the communication domain */
+    status = GNI_CdmCreate(myrank, ptag, cookie, modes, &cdm_hndl);
+    GNI_RC_CHECK("GNI_CdmCreate", status);
+    /* The device id is the minor number the system assigns to the device when
+       it is created.  To determine it, look in the /dev directory, which lists
+       the devices; a NIC appears as kgniX, where X is the device number
+       (0 by default). */
+    status = GNI_CdmAttach(cdm_hndl, device_id, &local_addr, &nic_hndl);
+    GNI_RC_CHECK("GNI_CdmAttach", status);
+    local_addr = get_gni_nic_address(0);
+#endif
+    MPID_UGNI_AllAddr = (unsigned int *)malloc(sizeof(unsigned int) * mysize);
+    _MEMCHECK(MPID_UGNI_AllAddr);
+    allgather(&local_addr, MPID_UGNI_AllAddr, sizeof(unsigned int));
+    /* create the local completion queue */
+    /* the third parameter : The number of events the NIC allows before generating an interrupt. Setting this parameter to zero results in interrupt delivery with every event. When using this parameter, the mode parameter must be set to GNI_CQ_BLOCKING*/
+    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &default_tx_cqh);
+    GNI_RC_CHECK("GNI_CqCreate (tx)", status);
+#if MULTI_THREAD_SEND
+    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_tx_cqh);
+    GNI_RC_CHECK("GNI_CqCreate RDMA (tx)", status);
+#endif
+
+#if CMK_USE_OOB
+    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &highprior_rdma_tx_cqh);
+    GNI_RC_CHECK("GNI_CqCreate high priority RDMA (tx)", status);
+#endif
+    /* create the destination completion queue for receiving micro-messages, make this queue considerably larger than the number of transfers */
+
+    status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &smsg_rx_cqh);
+    GNI_RC_CHECK("Create CQ (rx)", status);
+    
+    status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_rx_cqh);
+    GNI_RC_CHECK("Create Post CQ (rx)", status);
+   
+#if CMK_PERSISTENT_COMM_PUT
+    status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &highpriority_rx_cqh);
+    GNI_RC_CHECK("Create Post CQ (rx)", status);
+#endif
+    //status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_cqh);
+    //GNI_RC_CHECK("Create BTE CQ", status);
+
+    /* create the endpoints. they need to be bound to allow later CQWrites to them */
+    ep_hndl_array = (gni_ep_handle_t*)malloc(mysize * sizeof(gni_ep_handle_t));
+    _MEMCHECK(ep_hndl_array);
+#if MULTI_THREAD_SEND 
+    rx_cq_lock = global_gni_lock = default_tx_cq_lock = smsg_mailbox_lock = CmiCreateLock();
+    //default_tx_cq_lock = CmiCreateLock();
+    rdma_tx_cq_lock = CmiCreateLock();
+    smsg_rx_cq_lock = CmiCreateLock();
+    //global_gni_lock  = CmiCreateLock();
+    //rx_cq_lock  = CmiCreateLock();
+#endif
+    for (i=0; i<mysize; i++) {
+        if(i == myrank) continue;
+        status = GNI_EpCreate(nic_hndl, default_tx_cqh, &ep_hndl_array[i]);
+        GNI_RC_CHECK("GNI_EpCreate ", status);   
+        remote_addr = MPID_UGNI_AllAddr[i];
+        status = GNI_EpBind(ep_hndl_array[i], remote_addr, i);
+        GNI_RC_CHECK("GNI_EpBind ", status);   
+    }
+
+    /* SMSG is fastest but does not scale; MSGQ is scalable; FMA is our own implementation for small messages */
+    _init_smsg();
+    PMI_Barrier();
+
+#if     USE_LRTS_MEMPOOL
+    env = getenv("CHARM_UGNI_MAX_MEMORY_ON_NODE");
+    if (env) {
+        _totalmem = CmiReadSize(env);
+        if (myrank == 0)
+            printf("Charm++> total registered memory available per node is %.1fGB\n", (float)(_totalmem*1.0/oneGB));
+    }
+
+    env = getenv("CHARM_UGNI_MEMPOOL_INIT_SIZE");
+    if (env) _mempool_size = CmiReadSize(env);
+    if (CmiGetArgStringDesc(*argv,"+gni-mempool-init-size",&env,"Set the memory pool size")) 
+        _mempool_size = CmiReadSize(env);
+
+
+    env = getenv("CHARM_UGNI_MEMPOOL_MAX");
+    if (env) {
+        MAX_REG_MEM = CmiReadSize(env);
+        user_set_flag = 1;
+    }
+    if (CmiGetArgStringDesc(*argv,"+gni-mempool-max",&env,"Set the memory pool max size"))  {
+        MAX_REG_MEM = CmiReadSize(env);
+        user_set_flag = 1;
+    }
+
+    env = getenv("CHARM_UGNI_SEND_MAX");
+    if (env) {
+        MAX_BUFF_SEND = CmiReadSize(env);
+        user_set_flag = 1;
+    }
+    if (CmiGetArgStringDesc(*argv,"+gni-mempool-max-send",&env,"Set the memory pool max size for send"))  {
+        MAX_BUFF_SEND = CmiReadSize(env);
+        user_set_flag = 1;
+    }
+
+    env = getenv("CHARM_UGNI_MEMPOOL_SIZE_LIMIT");
+    if (env) {
+        _mempool_size_limit = CmiReadSize(env);
+    }
+
+    if (MAX_REG_MEM < _mempool_size) MAX_REG_MEM = _mempool_size;
+    if (MAX_BUFF_SEND > MAX_REG_MEM)  MAX_BUFF_SEND = MAX_REG_MEM;
+
+    if (myrank==0) {
+        printf("Charm++> memory pool init block size: %1.fMB, total memory pool limit %1.fMB (0 means no limit)\n", _mempool_size/1024.0/1024, _mempool_size_limit/1024.0/1024);
+        printf("Charm++> memory pool registered memory limit: %1.fMB, send limit: %1.fMB\n", MAX_REG_MEM/1024.0/1024, MAX_BUFF_SEND/1024.0/1024);
+        if (MAX_REG_MEM < BIG_MSG * 2 + oneMB)  {
+            /* memblock can expand to BIG_MSG * 2 size */
+            printf("Charm++ Error: The mempool maximum size is too small, please use command line option +gni-mempool-max or environment variable CHARM_UGNI_MEMPOOL_MAX to increase the value to at least %1.fMB.\n",  BIG_MSG * 2.0/1024/1024 + 1);
+            CmiAbort("mempool maximum size is too small. \n");
+        }
+#if MULTI_THREAD_SEND
+        printf("Charm++> worker thread sending messages\n");
+#elif COMM_THREAD_SEND
+        printf("Charm++> only comm thread send/recv messages\n");
+#endif
+    }
+
+#endif     /* end of USE_LRTS_MEMPOOL */
+
+    env = getenv("CHARM_UGNI_BIG_MSG_SIZE");
+    if (env) {
+        BIG_MSG = CmiReadSize(env);
+        if (BIG_MSG < ONE_SEG)
+          CmiAbort("BIG_MSG size is too small in the environment variable CHARM_UGNI_BIG_MSG_SIZE.");
+    }
+    env = getenv("CHARM_UGNI_BIG_MSG_PIPELINE_LEN");
+    if (env) {
+        BIG_MSG_PIPELINE = atoi(env);
+    }
+
+    env = getenv("CHARM_UGNI_NO_DEADLOCK_CHECK");
+    if (env) _checkProgress = 0;
+    if (mysize == 1) _checkProgress = 0;
+
+#if CMI_EXERT_RECV_RDMA_CAP
+    env = getenv("CHARM_UGNI_RDMA_MAX");
+    if (env)  {
+        RDMA_pending = atoi(env);
+        if (myrank == 0)
+            printf("Charm++> Max pending RDMA set to: %d\n", RDMA_pending);
+    }
+#endif
+    
+    /*
+    env = getenv("HUGETLB_DEFAULT_PAGE_SIZE");
+    if (env) 
+        _tlbpagesize = CmiReadSize(env);
+    */
+    /* real gethugepagesize() is only available when hugetlb module linked */
+    _tlbpagesize = gethugepagesize();
+    if (myrank == 0) {
+        printf("Charm++> Cray TLB page size: %1.fK\n", _tlbpagesize/1024.0);
+    }
+
+#if LARGEPAGE
+    if (_tlbpagesize == 4096) {
+        CmiAbort("Hugepage module, e.g. craype-hugepages8M must be loaded.");
+    }
+#endif
+
+      /* stats related arguments */
+#if CMK_WITH_STATS
+    CmiGetArgStringDesc(*argv,"+gni_stats_root",&counters_dirname,"counter directory name, default counters");
+
+    print_stats = CmiGetArgFlag(*argv, "+print_stats");
+    
+    stats_off = CmiGetArgFlag(*argv, "+stats_off");
+
+    init_comm_stats();
+#endif
+
+    /* init DMA buffer for medium message */
+
+    //_init_DMA_buffer();
+    
+    free(MPID_UGNI_AllAddr);
+
+    sendRdmaBuf = PCQueueCreate();
+    sendHighPriorBuf = PCQueueCreate();
+
+//    NTK_Init();
+//    ntk_return_t sts = NTK_System_GetSmpdCount(&_smpd_count);
+
+#if  REMOTE_EVENT
+    SHIFT = 1;
+    while (1<<SHIFT < mysize) SHIFT++;
+    CmiAssert(SHIFT < 31);
+    IndexPool_init(&ackPool);
+#if CMK_PERSISTENT_COMM_PUT
+    IndexPool_init(&persistPool);
+#endif
+#endif
+}
+
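+/* Message allocator: messages up to SMSG_MAX_MSG come straight from malloc;
+ * medium messages come from the registered mempool; messages of BIG_MSG or
+ * larger get their own aligned (or hugepage) allocation. */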
+void* LrtsAlloc(int n_bytes, int header)
+{
+    void *ptr = NULL;
+#if 0
+    printf("\n[PE:%d]Alloc Lrts for bytes=%d, head=%d %d\n", CmiMyPe(), n_bytes, header, SMSG_MAX_MSG);
+#endif
+    if(n_bytes <= SMSG_MAX_MSG)
+    {
+        int totalsize = n_bytes+header;
+        ptr = malloc(totalsize);
+    }
+    else {
+        CmiAssert(header+sizeof(mempool_header) <= ALIGNBUF);
+#if     USE_LRTS_MEMPOOL
+        n_bytes = ALIGN64(n_bytes);
+        if(n_bytes < BIG_MSG)
+        {
+            char *res = mempool_malloc(CpvAccess(mempool), ALIGNBUF+n_bytes-sizeof(mempool_header), 1);
+            if (res) ptr = res - sizeof(mempool_header) + ALIGNBUF - header;
+        }else 
+        {
+#if LARGEPAGE
+            //printf("[%d] LrtsAlloc a big_msg: %d %d\n", myrank, n_bytes, ALIGNHUGEPAGE(n_bytes+ALIGNBUF));
+            n_bytes = ALIGNHUGEPAGE(n_bytes+ALIGNBUF);
+            char *res = my_get_huge_pages(n_bytes);
+#else
+            char *res = memalign(ALIGNBUF, n_bytes+ALIGNBUF);
+#endif
+            if (res) ptr = res + ALIGNBUF - header;
+        }
+#else
+        n_bytes = ALIGN64(n_bytes);           /* make sure the size is aligned (ALIGN64) */
+        char *res = memalign(ALIGNBUF, n_bytes+ALIGNBUF);
+        ptr = res + ALIGNBUF - header;
+#endif
+    }
+    return ptr;
+}
+
+void  LrtsFree(void *msg)
+{
+    CmiUInt4 size = SIZEFIELD((char*)msg+sizeof(CmiChunkHeader));
+#if CMK_PERSISTENT_COMM_PUT
+    if (IS_PERSISTENT_MEMORY(msg)) return;
+#endif
+    if (size <= SMSG_MAX_MSG)
+        free(msg);
+    else {
+        size = ALIGN64(size);
+        if(size>=BIG_MSG)
+        {
+#if LARGEPAGE
+            int s = ALIGNHUGEPAGE(size+ALIGNBUF);
+            my_free_huge_pages((char*)msg + sizeof(CmiChunkHeader) - ALIGNBUF, s);
+#else
+            free((char*)msg + sizeof(CmiChunkHeader) - ALIGNBUF);
+#endif
+        }
+        else {
+#if    USE_LRTS_MEMPOOL
+#if CMK_SMP
+            mempool_free_thread((char*)msg + sizeof(CmiChunkHeader) - ALIGNBUF + sizeof(mempool_header));
+#else
+            mempool_free(CpvAccess(mempool), (char*)msg + sizeof(CmiChunkHeader) - ALIGNBUF + sizeof(mempool_header));
+#endif
+#else
+            free((char*)msg + sizeof(CmiChunkHeader) - ALIGNBUF);
+#endif
+        }
+    }
+}
+
+void LrtsExit()
+{
+#if CMK_WITH_STATS
+#if CMK_SMP
+    if(CmiMyRank() == CmiMyNodeSize())
+#endif
+    if (print_stats) print_comm_stats();
+#endif
+    /* free memory ? */
+#if USE_LRTS_MEMPOOL
+    //printf("FINAL [%d, %d]  register=%lld, send=%lld\n", myrank, CmiMyRank(), register_memory_size, buffered_send_msg); 
+    mempool_destroy(CpvAccess(mempool));
+#endif
+    PMI_Barrier();
+    PMI_Finalize();
+    exit(0);
+}
+
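+/* Keep pumping connections, incoming events, and local/remote transactions
+ * until every buffered SMSG and RDMA message has been sent, then synchronize
+ * with a PMI barrier. */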
+void LrtsDrainResources()
+{
+    if(mysize == 1) return;
+    while (
+#if CMK_USE_OOB
+           !SendBufferMsg(&smsg_oob_queue, NULL) ||
+#endif
+           !SendBufferMsg(&smsg_queue, NULL) 
+          )
+    {
+        if (useDynamicSMSG)
+            PumpDatagramConnection();
+        PumpNetworkSmsg();
+        PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);
+
+#if MULTI_THREAD_SEND
+        PumpLocalTransactions(rdma_tx_cqh, rdma_tx_cq_lock);
+#endif
+
+#if CMK_USE_OOB 
+        PumpLocalTransactions(highprior_rdma_tx_cqh, rdma_tx_cq_lock);
+#endif
+
+#if REMOTE_EVENT
+        PumpRemoteTransactions(rdma_rx_cqh);
+#endif
+        SendRdmaMsg(sendRdmaBuf);
+        SendRdmaMsg(sendHighPriorBuf);
+    }
+    PMI_Barrier();
+}
+
+void LrtsAbort(const char *message) {
+    fprintf(stderr, "[%d] CmiAbort: %s\n", myrank, message);
+    CmiPrintStackTrace(0);
+    PMI_Abort(-1, message);
+}
+
+/**************************  TIMER FUNCTIONS **************************/
+#if CMK_TIMER_USE_SPECIAL
+/* MPI calls are not threadsafe, even the timer on some machines */
+static CmiNodeLock  timerLock = 0;
+static int _absoluteTime = 0;
+static int _is_global = 0;
+static struct timespec start_ts;
+
+inline int CmiTimerIsSynchronized() {
+    return 0;
+}
+
+inline int CmiTimerAbsolute() {
+    return _absoluteTime;
+}
+
+double CmiStartTimer() {
+    return 0.0;
+}
+
+double CmiInitTime() {
+    return (double)(start_ts.tv_sec)+(double)start_ts.tv_nsec/1000000000.0;
+}
+
+void CmiTimerInit(char **argv) {
+    _absoluteTime = CmiGetArgFlagDesc(argv,"+useAbsoluteTime", "Use system's absolute time as wallclock time.");
+    if (_absoluteTime && CmiMyPe() == 0)
+        printf("Charm++> absolute  timer is used\n");
+    
+    _is_global = CmiTimerIsSynchronized();
+
+
+    if (_is_global) {
+        if (CmiMyRank() == 0) {
+            clock_gettime(CLOCK_MONOTONIC, &start_ts);
+        }
+    } else { /* we don't have a synchronous timer, set our own start time */
+        CmiBarrier();
+        CmiBarrier();
+        CmiBarrier();
+        clock_gettime(CLOCK_MONOTONIC, &start_ts);
+    }
+    CmiNodeAllBarrier();          /* for smp */
+}
+
+/**
+ * Since timerLock is never created and is always NULL,
+ * the lock-related if-conditions inside the timer functions
+ * could be disabled in the SMP case.
+ */
+double CmiTimer(void) {
+    struct timespec now_ts;
+    clock_gettime(CLOCK_MONOTONIC, &now_ts);
+    return _absoluteTime?((double)(now_ts.tv_sec)+(double)now_ts.tv_nsec/1000000000.0)
+        : (double)( now_ts.tv_sec - start_ts.tv_sec ) + (((double) now_ts.tv_nsec - (double) start_ts.tv_nsec)  / 1000000000.0);
+}
+
+double CmiWallTimer(void) {
+    struct timespec now_ts;
+    clock_gettime(CLOCK_MONOTONIC, &now_ts);
+    return _absoluteTime?((double)(now_ts.tv_sec)+(double)now_ts.tv_nsec/1000000000.0)
+        : ( now_ts.tv_sec - start_ts.tv_sec ) + ((now_ts.tv_nsec - start_ts.tv_nsec)  / 1000000000.0);
+}
+
+double CmiCpuTimer(void) {
+    struct timespec now_ts;
+    clock_gettime(CLOCK_MONOTONIC, &now_ts);
+    return _absoluteTime?((double)(now_ts.tv_sec)+(double)now_ts.tv_nsec/1000000000.0)
+        : (double)( now_ts.tv_sec - start_ts.tv_sec ) + (((double) now_ts.tv_nsec - (double) start_ts.tv_nsec)  / 1000000000.0);
+}
+
+#endif
+/************Barrier Related Functions****************/
+
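+/* Node-level barrier built on PMI_Barrier: in SMP mode only the comm thread
+ * calls PMI_Barrier, bracketed by CmiNodeAllBarrier so that no rank runs
+ * ahead. */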
+int CmiBarrier()
+{
+    gni_return_t status;
+
+#if CMK_SMP
+    /* make sure all ranks reach here; otherwise the comm thread may enter the barrier before the other ranks arrive */
+    CmiNodeAllBarrier();
+    if (CmiMyRank() == CmiMyNodeSize())
+#else
+    if (CmiMyRank() == 0)
+#endif
+    {
+        /**
+         *  CmiBarrier is usually called before the Charm++ trace module is
+         *  initialized, so START_EVENT and END_EVENT are disabled here. -Chao Mei
+         */
+        /*START_EVENT();*/
+        status = PMI_Barrier();
+        GNI_RC_CHECK("PMI_Barrier", status);
+        /*END_EVENT(10);*/
+    }
+    CmiNodeAllBarrier();
+    return 0;
+
+}
+#if CMK_DIRECT
+#include "machine-cmidirect.c"
+#endif
+#if CMK_PERSISTENT_COMM
+#include "machine-persistent.c"
+#endif
+
diff --git a/src/arch/mpi-crayxc/conv-mach-cuda.h b/src/arch/mpi-crayxc/conv-mach-cuda.h
new file mode 100644 (file)
index 0000000..ae853b9
--- /dev/null
@@ -0,0 +1,10 @@
+#undef  CMK_CUDA
+#define CMK_CUDA                                           1
+
+#undef CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT
+#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
+#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
+
+#undef CMK___int128_DEFINED
+#undef CMK___int128_t_DEFINED
diff --git a/src/arch/mpi-crayxc/conv-mach-cuda.sh b/src/arch/mpi-crayxc/conv-mach-cuda.sh
new file mode 100644 (file)
index 0000000..8319cd3
--- /dev/null
@@ -0,0 +1,5 @@
+BUILD_CUDA=1
+CMK_INCDIR="-I$CUDA_DIR/include $CMK_INCDIR "
+CMK_LIBDIR="-L$CUDA_DIR/lib64 $CMK_LIBDIR "
+CMK_LIBS="-lcuda -lcudart -lcudahybridapi $CMK_LIBS "
+
diff --git a/src/arch/mpi-crayxc/conv-mach-smp.h b/src/arch/mpi-crayxc/conv-mach-smp.h
new file mode 100644 (file)
index 0000000..c27b039
--- /dev/null
@@ -0,0 +1,11 @@
+#define CMK_SMP                                            1
+
+#undef CMK_NODE_QUEUE_AVAILABLE
+#define CMK_NODE_QUEUE_AVAILABLE                           1
+
+#undef CMK_SHARED_VARS_UNAVAILABLE
+#undef CMK_SHARED_VARS_POSIX_THREADS_SMP
+#define CMK_SHARED_VARS_UNAVAILABLE                        0
+#define CMK_SHARED_VARS_POSIX_THREADS_SMP                  1
+
+#define CMK_THREADS_USE_JCONTEXT                           1
diff --git a/src/arch/mpi-crayxc/conv-mach-smp.sh b/src/arch/mpi-crayxc/conv-mach-smp.sh
new file mode 100644 (file)
index 0000000..a0cc0e4
--- /dev/null
@@ -0,0 +1,10 @@
+CMK_DEFS=' -D_REENTRANT '
+CMK_CPP_C="$CMK_CPP_C $CMK_DEFS"
+CMK_CC="$CMK_CC $CMK_DEFS"
+CMK_CC_RELIABLE="$CMK_CC_RELIABLE $CMK_DEFS "
+CMK_CC_FASTEST="$CMK_CC_FASTEST $CMK_DEFS "
+CMK_CXX="$CMK_CXX  $CMK_DEFS "
+CMK_CXXPP="$CMK_CXXPP $CMK_DEFS "
+CMK_LD="$CMK_LD $CMK_DEFS "
+CMK_LDXX="$CMK_LDXX $CMK_DEFS "
+CMK_LIBS="-lpthread $CMK_LIBS "
diff --git a/src/arch/mpi-crayxc/conv-mach.h b/src/arch/mpi-crayxc/conv-mach.h
new file mode 100644 (file)
index 0000000..20e902c
--- /dev/null
@@ -0,0 +1,63 @@
+
+#ifndef _CONV_MACH_H
+#define _CONV_MACH_H
+
+#define CMK_CRAYXC                                         1
+
+#define CMK_CONVERSE_MPI                                   1
+
+#define CMK_MEMORY_PREALLOCATE_HACK                       0
+
+#define CMK_DEFAULT_MAIN_USES_COMMON_CODE                  1
+
+#define CMK_MALLOC_USE_GNU_MALLOC                          0
+#define CMK_MALLOC_USE_OS_BUILTIN                          1
+
+#define CMI_IO_BUFFER_EXPLICIT                             1
+#define CMI_IO_FLUSH_USER                                  1
+
+#define CMK_GETPAGESIZE_AVAILABLE                         1
+#define CMK_MEMORY_PAGESIZE                               8192
+#define CMK_MEMORY_PROTECTABLE                            0
+
+#define CMK_NODE_QUEUE_AVAILABLE                           0
+
+#define CMK_SHARED_VARS_UNAVAILABLE                        1
+#define CMK_SHARED_VARS_UNIPROCESSOR                       0
+
+#define CMK_SIGNAL_NOT_NEEDED                              0
+#define CMK_SIGNAL_USE_SIGACTION                           0
+#define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              1
+
+#define CMK_THREADS_USE_CONTEXT                            0
+#define CMK_THREADS_USE_PTHREADS                           0
+
+#define CMK_TIMER_USE_GETRUSAGE                            0
+#define CMK_TIMER_USE_SPECIAL                              1
+#define CMK_TIMER_USE_TIMES                                0
+#define CMK_TIMER_USE_XT3_DCLOCK                           0
+
+#define CMK_TYPEDEF_INT2 short
+#define CMK_TYPEDEF_INT4 int
+#define CMK_TYPEDEF_INT8 long
+#define CMK_TYPEDEF_UINT2 unsigned short
+#define CMK_TYPEDEF_UINT4 unsigned int
+#define CMK_TYPEDEF_UINT8 unsigned long
+#define CMK_TYPEDEF_FLOAT4 float
+#define CMK_TYPEDEF_FLOAT8 double
+
+#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
+
+#define CMK_64BIT                                         1
+#define CMK_AMD64                                         1
+
+#define CMK_WEB_MODE                                       1
+#define CMK_DEBUG_MODE                                     0
+
+#define CMK_LBDB_ON                                       1
+
+#define CMK_DISABLE_SYNC                                   1
+
+#endif
+
diff --git a/src/arch/mpi-crayxc/conv-mach.sh b/src/arch/mpi-crayxc/conv-mach.sh
new file mode 100644 (file)
index 0000000..a30928f
--- /dev/null
@@ -0,0 +1,68 @@
+#CMK_DEFS="-I/opt/xt-mpt/1.5.47/mpich2-64/T/include "
+#CMK_LD_DEFS="-lrca "
+
+CMK_BUILD_CRAY=1
+
+PGCC=`CC -V 2>&1 | grep pgCC`
+ICPC=`CC -V 2>&1 | grep Intel`
+GNU=`CC -V 2>&1 | grep 'g++'`
+
+CMK_CPP_CHARM="/lib/cpp -P"
+CMK_CPP_C="cc -E $CMK_DEFS "
+CMK_CXXPP="CC -E $CMK_DEFS "
+CMK_CC="cc $CMK_DEFS "
+CMK_CXX="CC  $CMK_DEFS "
+CMK_LD="$CMK_CC $CMK_LD_DEFS"
+CMK_LDXX="$CMK_CXX $CMK_LD_DEFS"
+# Swap these and set XT[45]_TOPOLOGY in conv-mach.h if doing topo work
+# on a Cray XT of known dimensions. See src/util/CrayNid.c for details
+#CMK_LIBS="-lckqt -lrca"
+CMK_LIBS="-lckqt"
+
+CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
+
+# compiler for compiling sequential programs
+if test -n "$PGCC"
+then
+CMK_CC="$CMK_CC -DCMK_FIND_FIRST_OF_PREDICATE=1 "
+CMK_CXX="$CMK_CXX -DCMK_FIND_FIRST_OF_PREDICATE=1 --no_using_std "
+# gcc is needed for building QT
+CMK_SEQ_CC="gcc -fPIC "
+CMK_SEQ_CXX="pgCC -fPIC --no_using_std "
+elif test -n "$ICPC"
+then
+CMK_SEQ_CC="cc -fPIC "
+CMK_SEQ_CXX="CC -fPIC "
+else
+CMK_SEQ_CC="gcc -fPIC"
+CMK_SEQ_CXX="g++ -fPIC "
+fi
+CMK_SEQ_LD="$CMK_SEQ_CC "
+CMK_SEQ_LDXX="$CMK_SEQ_CXX "
+CMK_SEQ_LIBS=""
+
+# compiler for native programs
+CMK_NATIVE_CC="gcc "
+CMK_NATIVE_LD="gcc "
+CMK_NATIVE_CXX="g++ "
+CMK_NATIVE_LDXX="g++ "
+CMK_NATIVE_LIBS=""
+
+CMK_RANLIB="ranlib"
+CMK_QT="generic64"
+
+# for F90 compiler
+CMK_CF77="ftn "
+CMK_CF90="ftn "
+if test -n "$GNU"
+then
+    CMK_CF77="$CMK_CF77 -ffree-line-length-none"
+    CMK_CF90="$CMK_CF90 -ffree-line-length-none"
+fi
+CMK_F90LIBS=""
+CMK_F90_USE_MODDIR=1
+CMK_F90_MODINC="-I"
+CMK_MOD_EXT="mod"
+
+CMK_NO_BUILD_SHARED=true
+
diff --git a/src/arch/mpi-crayxc/special.sh b/src/arch/mpi-crayxc/special.sh
new file mode 100755 (executable)
index 0000000..02ea682
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+export CHARMINC=../include
+. ./conv-config.sh
+
+if test -n "$BUILD_CUDA"
+then
+  echo "---------------------- special.sh for cuda executing ----------------"
+
+  ./gathertree ../../src/arch/cuda .
+
+# make links
+  test ! -f "../include/cuda-hybrid-api.h" && ./system_ln "../tmp/hybridAPI/cuda-hybrid-api.h" ../include && test ! -f "../include/wr.h" && ./system_ln "../tmp/hybridAPI/wr.h" ../include && test ! -f "../include/wrqueue.h" && ./system_ln "../tmp/hybridAPI/wrqueue.h" ../include
+
+#make library
+  export CHARMINC=../include
+  . ./conv-config.sh
+
+fi
index d3d33795c18f7536c94a3f898e16385e0d2e5c88..47f02c937b03f5b8b7940f8c80f76c527c44bb93 100644 (file)
@@ -241,7 +241,7 @@ void CmiInitPxshm(char **argv){
         if (_Cmi_mynode == 0)
             printf("Charm++> pxshm enabled: %d cores per node, buffer size: %.1fMB\n", pxshmContext->nodesize, SHMBUFLEN/1024.0/1024.0);
 
-#if CMK_CRAYXE
+#if CMK_CRAYXE || CMK_CRAYXC
         srand(getpid());
         int Cmi_charmrun_pid = rand();
         PMI_Bcast(&Cmi_charmrun_pid, sizeof(int));
@@ -561,7 +561,7 @@ void setupSharedBuffers(){
                }
        }
 
-#if CMK_SMP && CMK_CRAYXE
+#if CMK_SMP && ( CMK_CRAYXE || CMK_CRAYXC )
         if (PMI_Barrier() != GNI_RC_SUCCESS) return;
 #else
         if (CmiBarrier() != 0) return;
index 6de6b561c171abd2410bc2d7b79208213e4cc510..1ac390f5389171457660d16dbab211d61bef88ad 100644 (file)
@@ -227,7 +227,7 @@ void CmiInitXpmem(char **argv){
             CmiAbort("Opening /dev/xpmem");
         }
 
-#if CMK_CRAYXE
+#if CMK_CRAYXE || CMK_CRAYXC
         srand(getpid());
         int Cmi_charmrun_pid = rand();
         PMI_Bcast(&Cmi_charmrun_pid, sizeof(int));
index 3168728b6c93789a31f9941b0adea9435f6beae3..6fb937c651c462ae87ad58148df71692f841f469 100644 (file)
@@ -221,7 +221,7 @@ mempool_type *mempool_init(size_t pool_size, mempool_newblockfn allocfn, mempool
   mptr->block_tail = 0;
   mptr->limit = limit;
   mptr->size = pool_size;
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
   mptr->mempoolLock = CmiCreateLock();
 #endif
   mptr->block_head.mptr = pool;
@@ -230,7 +230,7 @@ mempool_type *mempool_init(size_t pool_size, mempool_newblockfn allocfn, mempool
   mptr->block_head.used = 0;
   mptr->block_head.block_prev = 0;
   mptr->block_head.block_next = 0;
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
   mptr->block_head.msgs_in_send= 0;
   mptr->block_head.msgs_in_recv= 0;
 #endif
@@ -266,7 +266,7 @@ void*  mempool_malloc(mempool_type *mptr, int size, int expand)
     slot_header   *head_free,*head_next;
     mem_handle_t  mem_hndl;
 
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
     CmiLock(mptr->mempoolLock);
 #endif
 
@@ -323,7 +323,7 @@ void*  mempool_malloc(mempool_type *mptr, int size, int expand)
       current->used = 0;
       current->size = expand_size;
       current->block_next = 0;
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
       current->msgs_in_send= 0;
       current->msgs_in_recv = 0;
 #endif
@@ -347,7 +347,7 @@ void*  mempool_malloc(mempool_type *mptr, int size, int expand)
 
       head_free->block_ptr = current;
       current->used += power;
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
       CmiUnlock(mptr->mempoolLock);
 #endif
       return (char*)head_free + sizeof(used_header);
@@ -357,7 +357,7 @@ void*  mempool_malloc(mempool_type *mptr, int size, int expand)
     return NULL;
 }
 
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
 void mempool_free_thread( void *ptr_free)
 {
     slot_header *to_free;
@@ -478,7 +478,7 @@ void mempool_free(mempool_type *mptr, void *ptr_free)
 #endif
 }
 
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
 inline void* getNextRegisteredPool(void *current)
 {
     
index 23e088dd683d701d0dd4ade2af224d11f1375e42..5697199f46c764c362ac65449c98a23a2cb2ea0c 100644 (file)
@@ -5,7 +5,7 @@
 #include "conv-config.h"
 #include "converse.h"
 
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
 #include "gni_pub.h"
 #include "pmi.h"
 typedef gni_mem_handle_t    mem_handle_t;
@@ -78,7 +78,7 @@ typedef struct block_header
   size_t              block_prev,block_next;   // offset to next memblock
   size_t              freelists[cutOffNum];
   struct mempool_type  *mptr;               // mempool_type
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
   int                 msgs_in_send;
   int                 msgs_in_recv;
 #endif
@@ -96,7 +96,7 @@ typedef struct mempool_type
   size_t                 block_tail;
   size_t                 limit;
   size_t                 size;
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
   CmiNodeLock           mempoolLock;
 #endif
 } mempool_type;
@@ -109,7 +109,7 @@ mempool_type *mempool_init(size_t pool_size, mempool_newblockfn newfn, mempool_f
 void  mempool_destroy(mempool_type *mptr);
 void*  mempool_malloc(mempool_type *mptr, int size, int expand);
 void mempool_free(mempool_type *mptr, void *ptr_free);
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
 void mempool_free_thread(void *ptr_free);
 #endif
 
@@ -117,7 +117,7 @@ void mempool_free_thread(void *ptr_free);
 }
 #endif
 
-#if CMK_CONVERSE_GEMINI_UGNI
+#if CMK_CONVERSE_UGNI
 void* getNextRegisteredPool();
 #endif
 
index bbc2cf60fab67a04702080b35836f59b5eb74b1b..f2bc13d9c29717965792ee884d038e16233a15cd 100644 (file)
@@ -1292,7 +1292,7 @@ void _initCharm(int unused_argc, char **argv)
         }
     }
 
-#if CMK_USE_PXSHM && CMK_CRAYXE && CMK_SMP
+#if CMK_USE_PXSHM && ( CMK_CRAYXE || CMK_CRAYXC ) && CMK_SMP
       // for SMP on Cray XE6 (hopper) it seems pxshm has to be initialized
       // again after cpuaffinity is done
     if (CkMyRank() == 0) {
index 4dad0549eba8f9126bd4d0edd9c31f1edb6dd96e..e031995e5eabb242690a79093d16886bc7048c20 100644 (file)
@@ -19,7 +19,7 @@ typedef struct {
 */
 #ifdef CMK_BLUEGENEP
 #include "dcmf.h"
-#elif  CMK_CONVERSE_GEMINI_UGNI
+#elif  CMK_CONVERSE_UGNI
 #include "gni_pub.h"
 #endif
 typedef struct infiDirectUserHandle{
@@ -38,7 +38,7 @@ typedef struct infiDirectUserHandle{
     DCMF_Memregion_t DCMF_recverMemregion;
     DCMF_Memregion_t DCMF_senderMemregion;
     DCMF_Callback_t DCMF_notify_cb;
-#elif  CMK_CONVERSE_GEMINI_UGNI
+#elif  CMK_CONVERSE_UGNI
     int localNode;
     int remoteRank;
     int remoteNode;
@@ -61,7 +61,7 @@ typedef struct infiDirectUserHandle{
        double initialValue;
 }CmiDirectUserHandle;
 
-#ifdef  CMK_CONVERSE_GEMINI_UGNI
+#ifdef  CMK_CONVERSE_UGNI
 typedef gni_mem_handle_t    CmiDirectMemoryHandler;
 CmiDirectMemoryHandler CmiDirect_registerMemory(void *buff, int size);
 struct infiDirectUserHandle CmiDirect_createHandle_mem(CmiDirectMemoryHandler *mem_hndl, void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData);
index 1b1d3f0e3906a1a3cb3fe09807dc6e45e48de5a9..cf04de86ff9617698f2dcecf33d77989fd15adf9 100644 (file)
 #define CMK_BROADCAST_USE_CMIREFERENCE                      0
 #endif
 
-#if CMK_CRAYXE && CMK_CONVERSE_GEMINI_UGNI && ! CMK_SMP
+#if (CMK_CRAYXE || CMK_CRAYXC) && CMK_CONVERSE_UGNI && ! CMK_SMP
 #include "conv-mach-pxshm.h"
 #endif
 
index a8af6481f42d360e447d5529d766318fb6f870a8..eb5e43a3db27217a543de98bea743f9d26d7acb1 100644 (file)
@@ -227,7 +227,7 @@ CpvDeclare(void *, CkGridObject);
 CpvDeclare(void *, CsdGridQueue);
 #endif
 
-#if CMK_CRAYXE
+#if CMK_CRAYXE || CMK_CRAYXC
 void* LrtsAlloc(int, int);
 void  LrtsFree(void*);
 #endif
@@ -2856,7 +2856,7 @@ void *CmiAlloc(int size)
   res = (char*) arena_malloc(size+sizeof(CmiChunkHeader));
 #elif CMK_USE_IBVERBS | CMK_USE_IBUD
   res = (char *) infi_CmiAlloc(size+sizeof(CmiChunkHeader));
-#elif CMK_CONVERSE_GEMINI_UGNI
+#elif CMK_CONVERSE_UGNI
   res =(char *) LrtsAlloc(size, sizeof(CmiChunkHeader));
 #elif CONVERSE_POOL
   res =(char *) CmiPoolAlloc(size+sizeof(CmiChunkHeader));
@@ -2961,7 +2961,7 @@ void CmiFree(void *blk)
       }
 #endif
     infi_CmiFree(BLKSTART(parentBlk));
-#elif CMK_CONVERSE_GEMINI_UGNI
+#elif CMK_CONVERSE_UGNI
     LrtsFree(BLKSTART(parentBlk));
 #elif CONVERSE_POOL
     CmiPoolFree(BLKSTART(parentBlk));
index d3c06441adb21b3ba5519c4c368c7dcda1c60df4..7d8279cb35abd4ba6cd2fb374527aa467c41b45f 100644 (file)
@@ -1962,7 +1962,7 @@ extern int *memCriticalEntries;
 
 double CmiReadSize(const char *str);
 
-#if  CMK_CONVERSE_GEMINI_UGNI
+#if  CMK_CONVERSE_UGNI
 void CmiTurnOnStats();
 void CmiTurnOffStats();
 #else
index ce7ab8c77d5e3349734a9157cf3b92d670b00193..395fb17f7d840683ab53e447262eb7e71484fb51 100644 (file)
@@ -539,7 +539,7 @@ static int search_pemap(char *pecoremap, int pe)
   return i;
 }
 
-#if CMK_CRAYXT || CMK_CRAYXE
+#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
 extern int getXTNodeID(int mpirank, int nummpiranks);
 #endif
 
@@ -639,7 +639,7 @@ void CmiInitCPUAffinity(char **argv)
     }
     else {
     /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
-#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_BLUEGENEQ
+#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ
       if (pemap == NULL) {
 #if CMK_MACHINE_PROGRESS_DEFINED
         while (affinity_doneflag < CmiMyNodeSize())  CmiNetworkProgress();
@@ -650,7 +650,7 @@ void CmiInitCPUAffinity(char **argv)
 #endif
       }
 #endif
-#if CMK_CRAYXT || CMK_CRAYXE
+#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
       /* if both pemap and commmap are NULL, will compute one */
       if (pemap != NULL)      
 #endif
@@ -676,7 +676,7 @@ void CmiInitCPUAffinity(char **argv)
     return;
   }
 
-#if CMK_CRAYXT || CMK_CRAYXE
+#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
   {
     int numPes = CmiNumPes();
     int numNodes = CmiNumNodes();
index 49f439d0fce9e99a8cebdde747331ab9cfb7512f..689e471890474824aebe400a2fd9de5690632d29 100644 (file)
@@ -35,7 +35,7 @@
 #include "TopoManager.h"
 #endif
 
-#if CMK_CRAYXT || CMK_CRAYXE
+#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
 extern "C" int getXTNodeID(int mpirank, int nummpiranks);
 #endif
 
@@ -485,7 +485,7 @@ extern "C" void LrtsInitCpuTopo(char **argv)
     if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
   }
   CmiNodeAllBarrier();
-#elif CMK_CRAYXT || CMK_CRAYXE
+#elif CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
   if(CmiMyRank() == 0) {
     int numPes = cpuTopo.numPes = CmiNumPes();
     int numNodes = CmiNumNodes();
index 2e0533717afbe83648473a1d4109c03d23f740e0..6be0f41b523b4214f97fedb2c0828ee9041d3959 100644 (file)
@@ -2693,7 +2693,7 @@ void CmiIsomallocBlockListPup(pup_er p,CmiIsomallocBlockList **lp, CthThread tid
         newblock = (char*)newblock + flags[0];
       }
     }
-#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_GEMINI_UGNI)
+#if CMK_USE_MEMPOOL_ISOMALLOC || (CMK_SMP && CMK_CONVERSE_UGNI)
     mptr->mempoolLock = CmiCreateLock();
 #endif  
   }
index b3899fbee5882e15eb8ff280c5d4aadcd1b14391..cfb86375f772fe81a6255661b765311c271f24b8 100644 (file)
@@ -389,7 +389,7 @@ static CMK_TYPEDEF_UINT8 MemusageMallinfo(){
     CMK_TYPEDEF_UINT8 memtotal2 = (CMK_TYPEDEF_UINT8) mi.usmblks;   /* unused */
     memtotal2 += (CMK_TYPEDEF_UINT8) mi.hblkhd;               /* mmap */
     /* printf("%lld %lld %lld %lld %lld\n", mi.uordblks, mi.usmblks,mi.hblkhd,mi.arena,mi.keepcost); */
-#if ! CMK_CRAYXT && ! CMK_CRAYXE
+#if ! CMK_CRAYXT && ! CMK_CRAYXE && ! CMK_CRAYXC
     if(memtotal2 > memtotal) memtotal = memtotal2;
 #endif
     return memtotal;
index ecf5943d294058a2d23be8249f08e29562d12309..961ee81bff1aa2edb89f0c52de567a4942fab11b 100644 (file)
@@ -119,7 +119,7 @@ CMK_LDXX="$CMK_LDXX $CMK_LIBDIR "
 #CMK_NATIVE_LD="$CMK_NATIVE_LD $CMK_LIBDIR "
 #CMK_NATIVE_LDXX="$CMK_NATIVE_LDXX $CMK_LIBDIR "
 
-if [ -n "$GEMINI_CRAYXE" -a -z "$CMK_SMP" ]
+if [ -n "$GNI_CRAYXE" -o -n "$GNI_CRAYXC" ] && [ -z "$CMK_SMP" ]
 then
   . $CHARMINC/conv-mach-pxshm.sh
 fi
index 82fcb97a7a2e91ee871aeaf002c91c6f6ab39014..e369c02452ee2261dbf87b5c08f6871eb24ce75e 100644 (file)
@@ -10,7 +10,7 @@
 #include <stdlib.h>
 #include "converse.h"
 
-#if CMK_CRAYXT || CMK_CRAYXE
+#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
 
 #if XT3_TOPOLOGY
 #else  /* if it is a XT4/5 or XE */
@@ -35,7 +35,7 @@ int getXTNodeID(int mpirank, int nummpiranks) {
   return nid;
 }
 
-#endif /* CMK_CRAYXT || CMK_CRAYXE */
+#endif /* CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC */
 
 #if XT4_TOPOLOGY || XT5_TOPOLOGY || XE6_TOPOLOGY