Merging
authorHarshitha <gplkrsh2@illinois.edu>
Mon, 2 Apr 2012 17:52:01 +0000 (12:52 -0500)
committerHarshitha <gplkrsh2@illinois.edu>
Mon, 2 Apr 2012 17:52:01 +0000 (12:52 -0500)
170 files changed:
CHANGES
README
examples/Makefile
examples/ParFUM/Makefile
examples/ampi/Makefile
examples/armci/Makefile
examples/bigsim/Makefile
examples/bigsim/sdag/Makefile
examples/bigsim/sdag/jacobi-no-redn/Makefile
examples/charm++/AllReduce/Makefile
examples/charm++/AllReduce/bench/Makefile
examples/charm++/Makefile
examples/charm++/PMEMimic/Makefile
examples/charm++/PMEMimic/PMEMimic/Makefile
examples/charm++/PMEMimic/PMEMimic_Parallel/Makefile
examples/charm++/RedExample/Makefile
examples/charm++/X10/Makefile
examples/charm++/X10/X10_test.decl.h [deleted file]
examples/charm++/X10/X10_test.def.h [deleted file]
examples/charm++/commlib/Makefile
examples/charm++/commlib/multicast/Makefile
examples/charm++/hello/1darray/Makefile
examples/charm++/hello/3darray/Makefile
examples/charm++/hello/Makefile
examples/charm++/hello/darray/Makefile
examples/charm++/hello/fancyarray/Makefile
examples/charm++/integrate2/Makefile
examples/charm++/integrateArray/Makefile
examples/charm++/load_balancing/Makefile
examples/charm++/piArray/Makefile
examples/charm++/ring/Makefile
examples/charm++/rings/Makefile
examples/charm++/speeds/Makefile
examples/converse/Makefile
examples/converse/pingpong/pingpong.C
examples/fem/Makefile
examples/fem/simple2D/Makefile
src/arch/bluegenep/machine.c
src/arch/gemini_gni-crayxe/conv-mach.h
src/arch/gemini_gni/conv-common.h
src/arch/gemini_gni/machine-persistent.c
src/arch/gemini_gni/machine-persistent.h
src/arch/gemini_gni/machine.c
src/arch/mpi-bluegeneq/cc-xlc.sh
src/arch/mpi-bluegeneq/conv-mach.sh
src/arch/mpi/conv-mach-causalft.h [new file with mode: 0644]
src/arch/mpi/machine.c
src/arch/net-linux-x86_64/conv-mach.sh
src/arch/net-sol-x86_64/conv-mach.sh
src/arch/net/charmrun/charmrun.c
src/arch/pami-bluegeneq/cc-xlc.sh [new file with mode: 0644]
src/arch/pami-bluegeneq/conv-mach-smp.h [new file with mode: 0644]
src/arch/pami-bluegeneq/conv-mach-smp.sh [new file with mode: 0644]
src/arch/pami-bluegeneq/conv-mach.h [new file with mode: 0644]
src/arch/pami-bluegeneq/conv-mach.sh [new file with mode: 0644]
src/arch/pami/Makefile.machine [new file with mode: 0644]
src/arch/pami/conv-common.h [new file with mode: 0644]
src/arch/pami/conv-common.sh [new file with mode: 0644]
src/arch/pami/machine.c [new file with mode: 0644]
src/arch/uth-linux-x86_64/conv-mach.h
src/arch/util/machine-broadcast.c
src/arch/util/machine-common-core.c
src/arch/util/machine-lrts.h
src/arch/util/machine-pxshm.c
src/arch/util/machine-smp.c
src/arch/util/mempool.c
src/arch/util/mempool.h
src/arch/util/persist-comm.c
src/ck-core/charm++.h
src/ck-core/ck.C
src/ck-core/ck.h
src/ck-core/ckarray.C
src/ck-core/ckarray.ci
src/ck-core/ckarray.h
src/ck-core/ckcausalmlog.C
src/ck-core/ckcausalmlog.h
src/ck-core/cklocation.C
src/ck-core/ckreduction.C
src/ck-core/ckreduction.h
src/ck-core/envelope.h
src/ck-core/init.C
src/ck-core/mpi-interoperate.C [new file with mode: 0644]
src/ck-core/mpi-interoperate.h [new file with mode: 0644]
src/ck-core/mpi-mainmodule.C [new file with mode: 0644]
src/ck-core/mpi-mainmodule.ci [new file with mode: 0644]
src/ck-ldb/CentralLB.C
src/ck-ldb/CentralLB.h
src/ck-ldb/CommAwareRefineLB.C [new file with mode: 0644]
src/ck-ldb/CommAwareRefineLB.ci [new file with mode: 0644]
src/ck-ldb/CommAwareRefineLB.h [new file with mode: 0644]
src/ck-ldb/CommonLBs.ci
src/ck-ldb/EveryLB.ci
src/ck-ldb/Make.lb
src/ck-ldb/Makefile_lb.sh
src/ck-ldb/MetisLB.C
src/ck-ldb/RefineSwapLB.C
src/ck-ldb/ScotchLB.C
src/ck-ldb/ScotchRefineLB.C
src/ck-perf/trace-projections.C
src/ck-perf/trace-projections.h
src/conv-core/conv-trace.h
src/conv-core/convcore.c
src/conv-core/converse.h
src/conv-core/cpuaffinity.c
src/conv-core/cputopology.C
src/conv-core/isomalloc.c
src/conv-core/persistent.h
src/langs/bluegene/bigsim_logs.C
src/langs/bluegene/bigsim_read.C
src/langs/charj/share/charj.vim
src/langs/charj/src/charj/libs/Array.h
src/langs/charj/src/charj/translator/Charj.g
src/langs/charj/src/charj/translator/Charj.stg
src/langs/charj/src/charj/translator/CharjEmitter.g
src/langs/charj/src/charj/translator/CharjPostAnalysis.g
src/langs/charj/src/charj/translator/CharjPreAnalysis.g
src/langs/charj/src/charj/translator/ClassSymbol.java
src/langs/charj/src/charj/translator/MethodSymbol.java
src/langs/charj/src/charj/translator/PackageScope.java
src/langs/charj/src/charj/translator/Pair.java [new file with mode: 0644]
src/langs/charj/src/charj/translator/SymbolDefiner.g
src/langs/charj/src/charj/translator/SymbolResolver.g
src/langs/charj/src/charj/translator/SymbolTable.java
src/langs/charj/tests/functional/EntryInvocation.cj
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.ci
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.h
src/libs/ck-libs/ampi/ampi.C
src/libs/ck-libs/ampi/ampiimpl.h
src/libs/ck-libs/multicast/ckmulticast.C
src/libs/ck-libs/parmetis/METISLib/util.c
src/libs/ck-libs/parmetis/ParMETISLib/util.c
src/libs/ck-libs/pose/gvt.ci
src/libs/ck-libs/pose/sim.C
src/libs/ck-libs/pose/sim.ci
src/libs/ck-libs/search/problem.C
src/scripts/Make.cidepends
src/scripts/Make.depends
src/scripts/Makefile
src/scripts/charmc
src/scripts/configure
src/scripts/configure.in
src/scripts/conv-autoconfig.h.in
src/util/BGQTorus.h [new file with mode: 0644]
src/util/CrayNid.c
src/util/TopoManager.C
src/util/TopoManager.h
src/util/XT3Torus.h
src/util/XTTorus.h
src/xlat-i/sdag/CEntry.C
src/xlat-i/sdag/CEntry.h
src/xlat-i/sdag/CParsedFile.h
src/xlat-i/sdag/CSdagConstruct.C
src/xlat-i/xi-symbol.C
src/xlat-i/xi-symbol.h
src/xlat-i/xi-util.h
tests/Makefile
tests/ampi/Makefile
tests/ampi/stacksize/Makefile
tests/charm++/array4d/Makefile
tests/charm++/delegation/Makefile
tests/charm++/delegation/multicast/Makefile
tests/charm++/hello-crosscorruption/Makefile
tests/charm++/load_balancing/Makefile
tests/charm++/pingpong/Makefile
tests/charm++/sdag/Makefile
tests/charm++/sdag/template/Makefile [new file with mode: 0644]
tests/charm++/sdag/template/pgm.C [new file with mode: 0644]
tests/charm++/sdag/template/pgm.ci [new file with mode: 0644]
tests/charm++/simplearrayhello/Makefile
tests/fem/Makefile

diff --git a/CHANGES b/CHANGES
index 0821c467fa00d8f9d527fb5e5a8194eb87ba4b65..5f4f78cfc494f768a108c0fbcf4d78d46d4e2954 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -13,9 +13,9 @@ Platform Support
 - Cray XE and XK systems using the Gemini network via either MPI
   (mpi-crayxe) or the native uGNI (gemini_gni-crayxe)
 
-- IBM Blue Gene Q, using MPI (mpi-bluegeneq)
+- IBM Blue Gene Q, using MPI (mpi-bluegeneq) or PAMI (pami-bluegeneq)
 
-- Fujitsu and Clang compilers
+- Clang, Cray, and Fujitsu compilers
 
 - MPI-based machine layers can now run on >64k PEs
 
@@ -28,7 +28,11 @@ General Changes
 
 - Enabled pipelining of large messages in CkMulticast by default
 
-- New load balancers added: TreeMatch, Zoltan, Scotch{Refine,Topo}, RefineSwap
+- New load balancers added:
+  * TreeMatch
+  * Zoltan
+  * Scotch graph partitioning based: ScotchLB and Refine and Topo variants
+  * RefineSwap
 
 - Load balancing improvements:
 
@@ -38,8 +42,22 @@ General Changes
   * User code can request a callback when migration is complete
   * More balancers properly consider object migratability and PE
     availability and speed
+  * Instrumentation records multicasts
 
-- Array options
+- Chare arrays support options that can enable some optimizations
+
+- New 'completion detection' library for parallel process termination
+  detection, when the need for modularity excludes full quiescence
+  detection
+
+- New 'mesh streamer' library for fine-grain many-to-many collectives,
+  handling message bundling and network topology
+
+- Memory pooling allocator performance and resource usage improved
+  substantially
+
+- AMPI: More routines support MPI_IN_PLACE, and those that don't
+  support it check for it
 
 ================================================================================
 What's new in Charm++ 6.2.1 (since 6.2.0)
diff --git a/README b/README
index 0be12f5c37d274b2329935bb031cd61241aa9632..d601d08a6657f77447eb950d0012eaccca277b19 100644 (file)
--- a/README
+++ b/README
@@ -60,9 +60,9 @@ the build script will compile Charm++ under this directory.
 
 For example, on an ordinary Linux PC:
 
-   ./build charm++ net-linux
+   ./build charm++ net-linux-x86_64
 
-will build charm++ in the directory: net-linux/.  The communication
+will build charm++ in the directory: net-linux-x86_64/. The communication
 defaults to UDP packets and the compiler to gcc.
 
 For a more complex example, on a Scyld workstation with the Intel C++ 
@@ -103,10 +103,6 @@ mpi-linux            PC Linux       MPI           GNU compiler
 mpi-ppc-darwin       MacOS X        MPI           GNU C++ compiler
 mpi-linux-ia64       IA64 Linux     MPI           GNU compiler
 mpi-linux-x86_64     Opteron Linux  MPI           GNU compiler
-mpi-axp              Alpha          MPI           GNU compiler
-mpi-linux-axp        Alpha Linux    MPI           GNU compiler
-origin2000           Origin2000  shared-mem       SGI C++ compiler
-t3e                  Cray T3E    shared-mem       Cray C++ compiler
 
 
 To choose <version>, your choice is determined by three options:
@@ -121,7 +117,7 @@ development and testing.
        "mpi-" Charm++ communicates using MPI calls.  Use this for
 machines with a good MPI implementation (such as the Origin 2000).
 
-       "gemini_gni-", "bluegene[lpq]-", Charm++
+       "gemini_gni-", "bluegene[lp]-", "pami-bluegeneq-" Charm++
 communicates using direct calls to the machine's communication primitives.
 
        "multicore-" Charm++ communicates using shared memory within a
@@ -194,7 +190,7 @@ need to choose <options> from the following list:
         performance. Try your application to decide if enabling smp mode 
         improves performance.
 
-* bluegene - compile Charm++ as running on Blue Gene emulator.
+* bigsim - compile Charm++ as running on the BigSim emulator.
 * help - show supported options for a version. For example, for net-linux, 
          running:
          > ./build charm++ net-linux help
@@ -229,12 +225,12 @@ Common compile time options such as -g, -O, -Ipath, -Lpath, -llib are
 accepted.
 
 For example, on a Linux machine, you would run
-     > ./build charm++ net-linux -O
+     > ./build charm++ net-linux-x86_64 -O
 
-This will construct a net-linux directory, link over all
-the Charm++ source code into net-linux/tmp, build the entire
-Charm++ runtime system in net-linux/tmp, and link sample programs 
-into net-linux/pgms.
+This will construct a net-linux-x86_64 directory, link over all
+the Charm++ source code into net-linux-x86_64/tmp, build the entire
+Charm++ runtime system in net-linux-x86_64/tmp, and link example programs
+into net-linux-x86_64/examples.
 
 Several #define's control the compilation of Charm++.  Some of these
 #define's can be found in src/<version>/conv-mach.h.  #define's can
@@ -242,15 +238,17 @@ also be specified on the command line, using the -D option.  For
 example,
     > ./build charm++ net-linux -O -DCMK_OPTIMIZE=1
 
-CMK_OPTIMIZE: Turn on optimizations in Charm++/Converse. This disables most of
-the run-time checking performed by Converse and Charm++ runtime. This option
-should be used only after the program has been debugged. Also, this option
-disables Converse/Charm++ tracing mechanisms such as projections and summary.
+Production optimizations: Pass the configure option --with-production
+to ./build to turn on optimizations in Charm++/Converse. This disables
+most of the run-time checking performed by Converse and Charm++
+runtime. This option should be used only after the program has been
+debugged. Also, this option disables Converse/Charm++ tracing
+mechanisms such as projections and summary.
 
 When Charm++ is built successfully, the diretory structure under the
 target directory will look like:
 
-net-linux/
+net-linux-x86_64/
    |
    ---  bin/                   # all executables
    |
index e9fd298308f88e2ba3a23efa8f13837d9e04384c..c7e54b725ba69ce958e1d9f7a51a5df80b0f941e 100644 (file)
@@ -13,7 +13,7 @@ test:
 
 bgtest:
        for d in charm++ ampi fem armci; do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index ec3f8fd450ead53197cb0185019c334747db7505..d60eac6bdda43f30c0ed963e9499a3446177ce44 100644 (file)
@@ -13,7 +13,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index 0271085842b62695eef8090bf019f648c004c6fd..e5b86ea6229aee6b9d650b9e73b11b061844fb33 100755 (executable)
@@ -12,7 +12,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index cbb7b13c57d223cc62391d5ef8905e9c91b83a06..e0dd46b2798cc909c0a313ae36984e029f7008bf 100755 (executable)
@@ -12,7 +12,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index d281059658b31ccd9e4a6ebca841a80ebcf03ff1..31008f1240296627bef232231d0bcc6b1c41cf89 100644 (file)
@@ -18,7 +18,7 @@ test:
 
 bgtest:
        for d in sdag; do \
-               (cd $$d; $(MAKE) test OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) test OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index e088ae89f219bcba859441fbc88c2bf6f3e23650..23bf0aca8401e719ea086f6fc23ef3ffde61a207 100644 (file)
@@ -12,7 +12,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-                (cd $$d; $(MAKE) test OPTS='$(OPTS)' || exit 1) || exit 1; \
+                (cd $$d; $(MAKE) test OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
         done
 
 clean:
index 08fd8fc6036c290b9f6d3be57dd06d78a1c548da..d71ccd486f1e00f84925b9cf64dafd138380c6b5 100644 (file)
@@ -44,7 +44,7 @@ bglog: jacobi
 #      ./charmrun +p4 ./jacobi 16 10 4 +x2 +y2 +z2 +cth1 +wth1 +bglog
 
 bgtest: jacobi
-       ./charmrun +p4 ./jacobi 64 10 32 +bgconfig ./bg_config +bgstacksize 102400
+       ./charmrun +p4 ./jacobi 64 10 32 +bgconfig ./bg_config +bgstacksize 102400 $(TESTOPTS)
 
 clean:
        rm -f *.sts *.log C *.o *.def.h *.decl.h *~ jacobi bgTraceFile
index 3573d1f97051163d2202a76b07c0bcf784b6d08b..f3bd0d7624d55beffd25947ef9dc92010469a682 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./AllReduce 20 100000 +p4 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./AllReduce 20 100000 +p4 +x2 +y2 +z2
+       ./charmrun ./AllReduce 20 100000 +p4 +x2 +y2 +z2 $(TESTOPTS)
index 3786b0230c8e44c8362e2f5d6ca0e86b41b72026..230885443922bfc73b12cace181eacbec7884f8e 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./AllReduce 20 100000 +p4 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./AllReduce 20 100000 +p4 +x2 +y2 +z2
+       ./charmrun ./AllReduce 20 100000 +p4 +x2 +y2 +z2 $(TESTOPTS)
index 1a43ea04b029b48cc4e674ab5d49c3dba0a2e2ef..271b309427bcefcecd95a1765b63117588b0f395 100644 (file)
@@ -13,7 +13,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index cca7bf8a9f4b14367aa703c332a33f2067200958..88735e563069eb1419e663ea74b7b92a9edfea4b 100644 (file)
@@ -23,4 +23,4 @@ test: all
        ./charmrun +p12 pgm 3 2 2 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun pgm +p4 10 +x2 +y2 +z1
+       ./charmrun pgm +p4 10 +x2 +y2 +z1 $(TESTOPTS)
index b694cdf9e3fb71a21bfa1ea094b4a22e0da09aac..f8cc2c59186820bfbea063fe6d633b18eb735e60 100644 (file)
@@ -23,4 +23,4 @@ test: all
        ./charmrun +p12 pgm 3 2 2 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun pgm +p4 10 +x2 +y2 +z1
+       ./charmrun pgm +p4 10 +x2 +y2 +z1 $(TESTOPTS)
index 38bd73d5709b5049419cf6e672b07c643fb13d95..29b84e8b6ba4c3293b8e9dc8457ed34c9c8a6f7a 100644 (file)
@@ -7,7 +7,7 @@ all: pgm pgm.prj
 pgm: $(OBJS)
        $(CHARMC) -language charm++ -o pgm $(OBJS) 
 #$(CHARMC) -language charm++ -o pgm $(OBJS) -module CkMulticast
-       
+
 
 pgm.prj: $(OBJS)
        $(CHARMC) -language charm++ -tracemode projections -o pgm.prj $(OBJS) 
@@ -26,4 +26,4 @@ test: all
        ./charmrun +p12 pgm 3 2 2 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun pgm +p4 10 +x2 +y2 +z1
+       ./charmrun pgm +p4 10 +x2 +y2 +z1 $(TESTOPTS)
index 3f44bf3dd16e6984b0b779764aaecb7f3fd09e3a..4cb29fc96f749eeb4b2ba53cd71002c74daee2bb 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./RedExample 20 4.5 10.3 +p4 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./RedExample 20 4.5 10.3 +p4 +x2 +y2 +z2
+       ./charmrun ./RedExample 20 4.5 10.3 +p4 +x2 +y2 +z2 $(TESTOPTS)
index e6fa0af8e3d0bfce28becdee26c51120fa1bc506..a84211adb774d8cd3fab83d396b7dd4ce24ed761 100644 (file)
@@ -16,4 +16,4 @@ X10_lib.o : X10_lib.C X10_lib.decl.h
 
 
 clean : 
-       rm -f a.out *~  X10_test2 charmrun X10_lib.o X10_lib.decl.h X10_lib.def.h
+       rm -f a.out *~  X10_test2 charmrun X10_lib.o *.decl.h *.def.h
diff --git a/examples/charm++/X10/X10_test.decl.h b/examples/charm++/X10/X10_test.decl.h
deleted file mode 100644 (file)
index 6a73d18..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-#ifndef _DECL_X10_test_H_
-#define _DECL_X10_test_H_
-#include "charm++.h"
-/* DECLS: message asyncMsg;
- */
-class asyncMsg;
-class CMessage_asyncMsg:public CkMessage{
-  public:
-    static int __idx;
-    void* operator new(size_t, void*p) { return p; }
-    void* operator new(size_t);
-    void* operator new(size_t, int*, const int);
-    void* operator new(size_t, int*);
-#if CMK_MULTIPLE_DELETE
-    void operator delete(void*p, void*){CkFreeMsg(p);}
-    void operator delete(void*p){ CkFreeMsg(p);}
-    void operator delete(void*p, int*, const int){CkFreeMsg(p);}
-    void operator delete(void*p, int*){CkFreeMsg(p);}
-#endif
-    void operator delete(void*p, size_t){CkFreeMsg(p);}
-    static void* alloc(int,size_t, int*, int);
-    CMessage_asyncMsg() {};
-    static void *pack(asyncMsg *p);
-    static asyncMsg* unpack(void* p);
-    void *operator new(size_t, const int);
-#if CMK_MULTIPLE_DELETE
-    void operator delete(void *p, const int){CkFreeMsg(p);}
-#endif
-    static void __register(const char *s, size_t size, CkPackFnPtr pack, CkUnpackFnPtr unpack) {
-      __idx = CkRegisterMsg(s, pack, unpack, size);
-    }
-};
-
-/* DECLS: readonly CProxy_Main mainProxy;
- */
-
-/* DECLS: readonly int nPlaces;
- */
-
-/* DECLS: readonly CProxy_Places placesProxy;
- */
-
-/* DECLS: mainchare Main: Chare{
-Main(CkArgMsg* impl_msg);
-threaded void libThread(void);
-};
- */
- class Main;
- class CkIndex_Main;
- class CProxy_Main;
-/* --------------- index object ------------------ */
-class CkIndex_Main:public CProxy_Chare{
-  public:
-    typedef Main local_t;
-    typedef CkIndex_Main index_t;
-    typedef CProxy_Main proxy_t;
-    typedef CProxy_Main element_t;
-
-    static int __idx;
-    static void __register(const char *s, size_t size);
-/* DECLS: Main(CkArgMsg* impl_msg);
- */
-    static int __idx_Main_CkArgMsg;
-    static int ckNew(CkArgMsg* impl_msg) { return __idx_Main_CkArgMsg; }
-    static void _call_Main_CkArgMsg(void* impl_msg,Main* impl_obj);
-
-/* DECLS: threaded void libThread(void);
- */
-    static int __idx_libThread_void;
-    static int libThread(void) { return __idx_libThread_void; }
-    static void _call_libThread_void(void* impl_msg,Main* impl_obj);
-    static void _callthr_libThread_void(CkThrCallArg *);
-};
-/* --------------- element proxy ------------------ */
-class CProxy_Main:public CProxy_Chare{
-  public:
-    typedef Main local_t;
-    typedef CkIndex_Main index_t;
-    typedef CProxy_Main proxy_t;
-    typedef CProxy_Main element_t;
-
-    CProxy_Main(void) {};
-    CProxy_Main(CkChareID __cid) : CProxy_Chare(__cid){  }
-    CProxy_Main(const Chare *c) : CProxy_Chare(c){  }
-    CK_DISAMBIG_CHARE(CProxy_Chare)
-    void ckDelegate(CkDelegateMgr *dTo,CkDelegateData *dPtr=NULL) {
-      CProxy_Chare::ckDelegate(dTo,dPtr);
-    }
-    void ckUndelegate(void) {
-      CProxy_Chare::ckUndelegate();
-    }
-    void pup(PUP::er &p) {
-      CProxy_Chare::pup(p);
-    }
-    void ckSetChareID(const CkChareID &c) {
-      CProxy_Chare::ckSetChareID(c);
-    }
-    Main *ckLocal(void) const
-     { return (Main *)CkLocalChare(&ckGetChareID()); }
-/* DECLS: Main(CkArgMsg* impl_msg);
- */
-    static CkChareID ckNew(CkArgMsg* impl_msg, int onPE=CK_PE_ANY);
-    static void ckNew(CkArgMsg* impl_msg, CkChareID* pcid, int onPE=CK_PE_ANY);
-    CProxy_Main(CkArgMsg* impl_msg, int onPE=CK_PE_ANY);
-
-/* DECLS: threaded void libThread(void);
- */
-    void libThread(void);
-};
-PUPmarshall(CProxy_Main);
-typedef CBaseT<Chare,CProxy_Main>  CBase_Main;
-
-/* DECLS: array Places: ArrayElement{
-Places(CkMigrateMessage* impl_msg);
-void Places(void);
-threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
-};
- */
- class Places;
- class CkIndex_Places;
- class CProxy_Places;
- class CProxyElement_Places;
- class CProxySection_Places;
-/* --------------- index object ------------------ */
-class CkIndex_Places:public CProxyElement_ArrayElement{
-  public:
-    typedef Places local_t;
-    typedef CkIndex_Places index_t;
-    typedef CProxy_Places proxy_t;
-    typedef CProxyElement_Places element_t;
-    typedef CProxySection_Places section_t;
-
-    static int __idx;
-    static void __register(const char *s, size_t size);
-/* DECLS: Places(CkMigrateMessage* impl_msg);
- */
-    static int __idx_Places_CkMigrateMessage;
-    static int ckNew(CkMigrateMessage* impl_msg) { return __idx_Places_CkMigrateMessage; }
-    static void _call_Places_CkMigrateMessage(void* impl_msg,Places* impl_obj);
-
-/* DECLS: void Places(void);
- */
-    static int __idx_Places_void;
-    static int ckNew(void) { return __idx_Places_void; }
-    static void _call_Places_void(void* impl_msg,Places* impl_obj);
-
-/* DECLS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-    static int __idx_startAsync_marshall2;
-    static int startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src) { return __idx_startAsync_marshall2; }
-    static void _call_startAsync_marshall2(void* impl_msg,Places* impl_obj);
-    static void _callthr_startAsync_marshall2(CkThrCallArg *);
-    static void _marshallmessagepup_startAsync_marshall2(PUP::er &p,void *msg);
-};
-/* --------------- element proxy ------------------ */
- class CProxyElement_Places : public CProxyElement_ArrayElement{
-  public:
-    typedef Places local_t;
-    typedef CkIndex_Places index_t;
-    typedef CProxy_Places proxy_t;
-    typedef CProxyElement_Places element_t;
-    typedef CProxySection_Places section_t;
-
-    CProxyElement_Places(void) {}
-    CProxyElement_Places(const ArrayElement *e) : CProxyElement_ArrayElement(e){  }
-    void ckDelegate(CkDelegateMgr *dTo,CkDelegateData *dPtr=NULL) {
-      CProxyElement_ArrayElement::ckDelegate(dTo,dPtr);
-    }
-    void ckUndelegate(void) {
-      CProxyElement_ArrayElement::ckUndelegate();
-    }
-    void pup(PUP::er &p) {
-      CProxyElement_ArrayElement::pup(p);
-    }
-    CK_DISAMBIG_ARRAY_ELEMENT(CProxyElement_ArrayElement)
-    Places *ckLocal(void) const
-      { return (Places *)CProxyElement_ArrayElement::ckLocal(); }
-    CProxyElement_Places(const CkArrayID &aid,const CkArrayIndex1D &idx,CK_DELCTOR_PARAM)
-        :CProxyElement_ArrayElement(aid,idx,CK_DELCTOR_ARGS) {}
-    CProxyElement_Places(const CkArrayID &aid,const CkArrayIndex1D &idx)
-        :CProxyElement_ArrayElement(aid,idx) {}
-/* DECLS: Places(CkMigrateMessage* impl_msg);
- */
-
-/* DECLS: void Places(void);
- */
-    void insert(int onPE=-1);
-/* DECLS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-    void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts=NULL) ;
-};
-PUPmarshall(CProxyElement_Places);
-/* ---------------- collective proxy -------------- */
- class CProxy_Places : public CProxy_ArrayElement{
-  public:
-    typedef Places local_t;
-    typedef CkIndex_Places index_t;
-    typedef CProxy_Places proxy_t;
-    typedef CProxyElement_Places element_t;
-    typedef CProxySection_Places section_t;
-
-    CProxy_Places(void) {}
-    CProxy_Places(const ArrayElement *e) : CProxy_ArrayElement(e){  }
-    void ckDelegate(CkDelegateMgr *dTo,CkDelegateData *dPtr=NULL) {
-      CProxy_ArrayElement::ckDelegate(dTo,dPtr);
-    }
-    void ckUndelegate(void) {
-      CProxy_ArrayElement::ckUndelegate();
-    }
-    void pup(PUP::er &p) {
-      CProxy_ArrayElement::pup(p);
-    }
-    CK_DISAMBIG_ARRAY(CProxy_ArrayElement)
-    static CkArrayID ckNew(void) {return ckCreateEmptyArray();}
-//Generalized array indexing:
-    CProxyElement_Places operator [] (const CkArrayIndex1D &idx) const
-        {return CProxyElement_Places(ckGetArrayID(), idx, CK_DELCTOR_CALL);}
-    CProxyElement_Places operator() (const CkArrayIndex1D &idx) const
-        {return CProxyElement_Places(ckGetArrayID(), idx, CK_DELCTOR_CALL);}
-    CProxyElement_Places operator [] (int idx) const 
-        {return CProxyElement_Places(ckGetArrayID(), CkArrayIndex1D(idx), CK_DELCTOR_CALL);}
-    CProxyElement_Places operator () (int idx) const 
-        {return CProxyElement_Places(ckGetArrayID(), CkArrayIndex1D(idx), CK_DELCTOR_CALL);}
-    CProxy_Places(const CkArrayID &aid,CK_DELCTOR_PARAM) 
-        :CProxy_ArrayElement(aid,CK_DELCTOR_ARGS) {}
-    CProxy_Places(const CkArrayID &aid) 
-        :CProxy_ArrayElement(aid) {}
-/* DECLS: Places(CkMigrateMessage* impl_msg);
- */
-
-/* DECLS: void Places(void);
- */
-    static CkArrayID ckNew(const CkArrayOptions &opts);
-
-/* DECLS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-    void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts=NULL) ;
-};
-PUPmarshall(CProxy_Places);
-/* ---------------- section proxy -------------- */
- class CProxySection_Places : public CProxySection_ArrayElement{
-  public:
-    typedef Places local_t;
-    typedef CkIndex_Places index_t;
-    typedef CProxy_Places proxy_t;
-    typedef CProxyElement_Places element_t;
-    typedef CProxySection_Places section_t;
-
-    CProxySection_Places(void) {}
-    void ckDelegate(CkDelegateMgr *dTo,CkDelegateData *dPtr=NULL) {
-      CProxySection_ArrayElement::ckDelegate(dTo,dPtr);
-    }
-    void ckUndelegate(void) {
-      CProxySection_ArrayElement::ckUndelegate();
-    }
-    void pup(PUP::er &p) {
-      CProxySection_ArrayElement::pup(p);
-    }
-    CK_DISAMBIG_ARRAY_SECTION(CProxySection_ArrayElement)
-//Generalized array indexing:
-    CProxyElement_Places operator [] (const CkArrayIndex1D &idx) const
-        {return CProxyElement_Places(ckGetArrayID(), idx, CK_DELCTOR_CALL);}
-    CProxyElement_Places operator() (const CkArrayIndex1D &idx) const
-        {return CProxyElement_Places(ckGetArrayID(), idx, CK_DELCTOR_CALL);}
-    CProxyElement_Places operator [] (int idx) const 
-        {return CProxyElement_Places(ckGetArrayID(), *(CkArrayIndex1D*)&ckGetArrayElements()[idx], CK_DELCTOR_CALL);}
-    CProxyElement_Places operator () (int idx) const 
-        {return CProxyElement_Places(ckGetArrayID(), *(CkArrayIndex1D*)&ckGetArrayElements()[idx], CK_DELCTOR_CALL);}
-    static CkSectionID ckNew(const CkArrayID &aid, CkArrayIndex1D *elems, int nElems) {
-      return CkSectionID(aid, elems, nElems);
-    } 
-    static CkSectionID ckNew(const CkArrayID &aid, int l, int u, int s) {
-      CkVec<CkArrayIndex1D> al;
-      for (int i=l; i<=u; i+=s) al.push_back(CkArrayIndex1D(i));
-      return CkSectionID(aid, al.getVec(), al.size());
-    } 
-    CProxySection_Places(const CkArrayID &aid, CkArrayIndex *elems, int nElems, CK_DELCTOR_PARAM) 
-        :CProxySection_ArrayElement(aid,elems,nElems,CK_DELCTOR_ARGS) {}
-    CProxySection_Places(const CkArrayID &aid, CkArrayIndex *elems, int nElems) 
-        :CProxySection_ArrayElement(aid,elems,nElems) {}
-    CProxySection_Places(const CkSectionID &sid)       :CProxySection_ArrayElement(sid) {}
-    static CkSectionID ckNew(const CkArrayID &aid, CkArrayIndex *elems, int nElems) {
-      return CkSectionID(aid, elems, nElems);
-    } 
-/* DECLS: Places(CkMigrateMessage* impl_msg);
- */
-
-/* DECLS: void Places(void);
- */
-
-/* DECLS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-    void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts=NULL) ;
-};
-PUPmarshall(CProxySection_Places);
-typedef CBaseT<ArrayElementT<CkIndex1D>,CProxy_Places>  CBase_Places;
-
-extern void _registerX10_test(void);
-extern "C" void CkRegisterMainModule(void);
-#endif
diff --git a/examples/charm++/X10/X10_test.def.h b/examples/charm++/X10/X10_test.def.h
deleted file mode 100644 (file)
index 6994502..0000000
+++ /dev/null
@@ -1,362 +0,0 @@
-/* DEFS: message asyncMsg;
- */
-#ifndef CK_TEMPLATES_ONLY
-void *CMessage_asyncMsg::operator new(size_t s){
-  return asyncMsg::alloc(__idx, s, 0, 0);
-}
-void *CMessage_asyncMsg::operator new(size_t s, int* sz){
-  return asyncMsg::alloc(__idx, s, sz, 0);
-}
-void *CMessage_asyncMsg::operator new(size_t s, int* sz,const int pb){
-  return asyncMsg::alloc(__idx, s, sz, pb);
-}
-void *CMessage_asyncMsg::operator new(size_t s, const int p) {
-  return asyncMsg::alloc(__idx, s, 0, p);
-}
-void* CMessage_asyncMsg::alloc(int msgnum, size_t sz, int *sizes, int pb) {
-  int offsets[1];
-  offsets[0] = ALIGN8(sz);
-  asyncMsg *newmsg = (asyncMsg *) CkAllocMsg(msgnum, offsets[0], pb);
-  return (void *) newmsg;
-}
-void* CMessage_asyncMsg::pack(asyncMsg *msg) {
-  return (void *) msg;
-}
-asyncMsg* CMessage_asyncMsg::unpack(void* buf) {
-  asyncMsg *msg = (asyncMsg *) buf;
-  return msg;
-}
-int CMessage_asyncMsg::__idx=0;
-#endif
-
-/* DEFS: readonly CProxy_Main mainProxy;
- */
-extern CProxy_Main mainProxy;
-#ifndef CK_TEMPLATES_ONLY
-extern "C" void __xlater_roPup_mainProxy(void *_impl_pup_er) {
-  PUP::er &_impl_p=*(PUP::er *)_impl_pup_er;
-  _impl_p|mainProxy;
-}
-#endif
-
-/* DEFS: readonly int nPlaces;
- */
-extern int nPlaces;
-#ifndef CK_TEMPLATES_ONLY
-extern "C" void __xlater_roPup_nPlaces(void *_impl_pup_er) {
-  PUP::er &_impl_p=*(PUP::er *)_impl_pup_er;
-  _impl_p|nPlaces;
-}
-#endif
-
-/* DEFS: readonly CProxy_Places placesProxy;
- */
-extern CProxy_Places placesProxy;
-#ifndef CK_TEMPLATES_ONLY
-extern "C" void __xlater_roPup_placesProxy(void *_impl_pup_er) {
-  PUP::er &_impl_p=*(PUP::er *)_impl_pup_er;
-  _impl_p|placesProxy;
-}
-#endif
-
-/* DEFS: mainchare Main: Chare{
-Main(CkArgMsg* impl_msg);
-threaded void libThread(void);
-};
- */
-#ifndef CK_TEMPLATES_ONLY
- int CkIndex_Main::__idx=0;
-#endif
-#ifndef CK_TEMPLATES_ONLY
-/* DEFS: Main(CkArgMsg* impl_msg);
- */
-CkChareID CProxy_Main::ckNew(CkArgMsg* impl_msg, int impl_onPE)
-{
-  CkChareID impl_ret;
-  CkCreateChare(CkIndex_Main::__idx, CkIndex_Main::__idx_Main_CkArgMsg, impl_msg, &impl_ret, impl_onPE);
-  return impl_ret;
-}
-void CProxy_Main::ckNew(CkArgMsg* impl_msg, CkChareID* pcid, int impl_onPE)
-{
-  CkCreateChare(CkIndex_Main::__idx, CkIndex_Main::__idx_Main_CkArgMsg, impl_msg, pcid, impl_onPE);
-}
-  CProxy_Main::CProxy_Main(CkArgMsg* impl_msg, int impl_onPE)
-{
-  CkChareID impl_ret;
-  CkCreateChare(CkIndex_Main::__idx, CkIndex_Main::__idx_Main_CkArgMsg, impl_msg, &impl_ret, impl_onPE);
-  ckSetChareID(impl_ret);
-}
- int CkIndex_Main::__idx_Main_CkArgMsg=0;
-void CkIndex_Main::_call_Main_CkArgMsg(void* impl_msg,Main * impl_obj)
-{
-  new (impl_obj) Main((CkArgMsg*)impl_msg);
-}
-
-/* DEFS: threaded void libThread(void);
- */
-void CProxy_Main::libThread(void)
-{
-  ckCheck();
-  void *impl_msg = CkAllocSysMsg();
-  if (ckIsDelegated()) {
-    int destPE=CkChareMsgPrep(CkIndex_Main::__idx_libThread_void, impl_msg, &ckGetChareID());
-    if (destPE!=-1) ckDelegatedTo()->ChareSend(ckDelegatedPtr(),CkIndex_Main::__idx_libThread_void, impl_msg, &ckGetChareID(),destPE);
-  }
-  else CkSendMsg(CkIndex_Main::__idx_libThread_void, impl_msg, &ckGetChareID(),0);
-}
- int CkIndex_Main::__idx_libThread_void=0;
-void CkIndex_Main::_call_libThread_void(void* impl_msg,Main * impl_obj)
-{
-  CthThread tid = CthCreate((CthVoidFn)_callthr_libThread_void, new CkThrCallArg(impl_msg,impl_obj), 0);
-  ((Chare *)impl_obj)->CkAddThreadListeners(tid,impl_msg);
-  CthAwaken(tid);
-}
-void CkIndex_Main::_callthr_libThread_void(CkThrCallArg *impl_arg)
-{
-  void *impl_msg = impl_arg->msg;
-  Main *impl_obj = (Main *) impl_arg->obj;
-  delete impl_arg;
-  CkFreeSysMsg(impl_msg);
-  impl_obj->libThread();
-}
-#endif /*CK_TEMPLATES_ONLY*/
-#ifndef CK_TEMPLATES_ONLY
-void CkIndex_Main::__register(const char *s, size_t size) {
-  __idx = CkRegisterChare(s, size);
-  CkRegisterBase(__idx, CkIndex_Chare::__idx);
-// REG: Main(CkArgMsg* impl_msg);
-  __idx_Main_CkArgMsg = CkRegisterEp("Main(CkArgMsg* impl_msg)",
-     (CkCallFnPtr)_call_Main_CkArgMsg, CMessage_CkArgMsg::__idx, __idx, 0);
-  CkRegisterMainChare(__idx, __idx_Main_CkArgMsg);
-
-// REG: threaded void libThread(void);
-  __idx_libThread_void = CkRegisterEp("libThread(void)",
-     (CkCallFnPtr)_call_libThread_void, 0, __idx, 0);
-}
-#endif
-
-/* DEFS: array Places: ArrayElement{
-Places(CkMigrateMessage* impl_msg);
-void Places(void);
-threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
-};
- */
-#ifndef CK_TEMPLATES_ONLY
- int CkIndex_Places::__idx=0;
-#endif
-#ifndef CK_TEMPLATES_ONLY
-/* DEFS: Places(CkMigrateMessage* impl_msg);
- */
-
-/* DEFS: void Places(void);
- */
-void CProxyElement_Places::insert(int onPE)
-{ 
-  void *impl_msg = CkAllocSysMsg();
-   ckInsert((CkArrayMessage *)impl_msg,CkIndex_Places::__idx_Places_void,onPE);
-}
-
-/* DEFS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-void CProxyElement_Places::startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts) 
-{
-  ckCheck();
-  //Marshall: int which_statement, const CkFutureID &ftHandle, int pe_src
-  int impl_off=0;
-  { //Find the size of the PUP'd data
-    PUP::sizer implP;
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-    impl_off+=implP.size();
-  }
-  CkMarshallMsg *impl_msg=CkAllocateMarshallMsg(impl_off,impl_e_opts);
-  { //Copy over the PUP'd data
-    PUP::toMem implP((void *)impl_msg->msgBuf);
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-  }
-  CkArrayMessage *impl_amsg=(CkArrayMessage *)impl_msg;
-  impl_amsg->array_setIfNotThere(CkArray_IfNotThere_buffer);
-  ckSend(impl_amsg, CkIndex_Places::__idx_startAsync_marshall2,0);
-}
-/* DEFS: Places(CkMigrateMessage* impl_msg);
- */
- int CkIndex_Places::__idx_Places_CkMigrateMessage=0;
-void CkIndex_Places::_call_Places_CkMigrateMessage(void* impl_msg,Places * impl_obj)
-{
-  new (impl_obj) Places((CkMigrateMessage*)impl_msg);
-}
-
-/* DEFS: void Places(void);
- */
-CkArrayID CProxy_Places::ckNew(const CkArrayOptions &opts)
-{ 
-  void *impl_msg = CkAllocSysMsg();
-   return ckCreateArray((CkArrayMessage *)impl_msg,CkIndex_Places::__idx_Places_void,opts);
-}
- int CkIndex_Places::__idx_Places_void=0;
-void CkIndex_Places::_call_Places_void(void* impl_msg,Places * impl_obj)
-{
-  CkFreeSysMsg(impl_msg);
-  new (impl_obj) Places();
-}
-
-/* DEFS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-void CProxy_Places::startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts) 
-{
-  ckCheck();
-  //Marshall: int which_statement, const CkFutureID &ftHandle, int pe_src
-  int impl_off=0;
-  { //Find the size of the PUP'd data
-    PUP::sizer implP;
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-    impl_off+=implP.size();
-  }
-  CkMarshallMsg *impl_msg=CkAllocateMarshallMsg(impl_off,impl_e_opts);
-  { //Copy over the PUP'd data
-    PUP::toMem implP((void *)impl_msg->msgBuf);
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-  }
-  CkArrayMessage *impl_amsg=(CkArrayMessage *)impl_msg;
-  impl_amsg->array_setIfNotThere(CkArray_IfNotThere_buffer);
-  ckBroadcast(impl_amsg, CkIndex_Places::__idx_startAsync_marshall2,0);
-}
- int CkIndex_Places::__idx_startAsync_marshall2=0;
-void CkIndex_Places::_call_startAsync_marshall2(void* impl_msg,Places * impl_obj)
-{
-  CthThread tid = CthCreate((CthVoidFn)_callthr_startAsync_marshall2, new CkThrCallArg(impl_msg,impl_obj), 0);
-  ((Chare *)impl_obj)->CkAddThreadListeners(tid,impl_msg);
-  CthAwaken(tid);
-}
-void CkIndex_Places::_callthr_startAsync_marshall2(CkThrCallArg *impl_arg)
-{
-  void *impl_msg = impl_arg->msg;
-  Places *impl_obj = (Places *) impl_arg->obj;
-  delete impl_arg;
-  char *impl_buf=((CkMarshallMsg *)impl_msg)->msgBuf;
-  /*Unmarshall pup'd fields: int which_statement, const CkFutureID &ftHandle, int pe_src*/
-  PUP::fromMem implP(impl_buf);
-  int which_statement; implP|which_statement;
-  CkFutureID ftHandle; implP|ftHandle;
-  int pe_src; implP|pe_src;
-  impl_buf+=CK_ALIGN(implP.size(),16);
-  /*Unmarshall arrays:*/
-  impl_obj->startAsync(which_statement, ftHandle, pe_src);
-  delete (CkMarshallMsg *)impl_msg;
-}
-void CkIndex_Places::_marshallmessagepup_startAsync_marshall2(PUP::er &implDestP,void *impl_msg) {
-  char *impl_buf=((CkMarshallMsg *)impl_msg)->msgBuf;
-  /*Unmarshall pup'd fields: int which_statement, const CkFutureID &ftHandle, int pe_src*/
-  PUP::fromMem implP(impl_buf);
-  int which_statement; implP|which_statement;
-  CkFutureID ftHandle; implP|ftHandle;
-  int pe_src; implP|pe_src;
-  impl_buf+=CK_ALIGN(implP.size(),16);
-  /*Unmarshall arrays:*/
-  if (implDestP.hasComments()) implDestP.comment("which_statement");
-  implDestP|which_statement;
-  if (implDestP.hasComments()) implDestP.comment("ftHandle");
-  implDestP|ftHandle;
-  if (implDestP.hasComments()) implDestP.comment("pe_src");
-  implDestP|pe_src;
-}
-/* DEFS: Places(CkMigrateMessage* impl_msg);
- */
-
-/* DEFS: void Places(void);
- */
-
-/* DEFS: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
- */
-void CProxySection_Places::startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src, const CkEntryOptions *impl_e_opts) 
-{
-  ckCheck();
-  //Marshall: int which_statement, const CkFutureID &ftHandle, int pe_src
-  int impl_off=0;
-  { //Find the size of the PUP'd data
-    PUP::sizer implP;
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-    impl_off+=implP.size();
-  }
-  CkMarshallMsg *impl_msg=CkAllocateMarshallMsg(impl_off,impl_e_opts);
-  { //Copy over the PUP'd data
-    PUP::toMem implP((void *)impl_msg->msgBuf);
-    implP|which_statement;
-    //Have to cast away const-ness to get pup routine
-    implP|(CkFutureID &)ftHandle;
-    implP|pe_src;
-  }
-  CkArrayMessage *impl_amsg=(CkArrayMessage *)impl_msg;
-  impl_amsg->array_setIfNotThere(CkArray_IfNotThere_buffer);
-  ckSend(impl_amsg, CkIndex_Places::__idx_startAsync_marshall2,0);
-}
-#endif /*CK_TEMPLATES_ONLY*/
-#ifndef CK_TEMPLATES_ONLY
-void CkIndex_Places::__register(const char *s, size_t size) {
-  __idx = CkRegisterChare(s, size);
-  CkRegisterBase(__idx, CkIndex_ArrayElement::__idx);
-// REG: Places(CkMigrateMessage* impl_msg);
-  __idx_Places_CkMigrateMessage = CkRegisterEp("Places(CkMigrateMessage* impl_msg)",
-     (CkCallFnPtr)_call_Places_CkMigrateMessage, 0, __idx, 0);
-  CkRegisterMigCtor(__idx, __idx_Places_CkMigrateMessage);
-
-// REG: void Places(void);
-  __idx_Places_void = CkRegisterEp("Places(void)",
-     (CkCallFnPtr)_call_Places_void, 0, __idx, 0);
-  CkRegisterDefaultCtor(__idx, __idx_Places_void);
-
-// REG: threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
-  __idx_startAsync_marshall2 = CkRegisterEp("startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src)",
-     (CkCallFnPtr)_call_startAsync_marshall2, CkMarshallMsg::__idx, __idx, 0);
-  CkRegisterMessagePupFn(__idx_startAsync_marshall2,(CkMessagePupFn)_marshallmessagepup_startAsync_marshall2);
-}
-#endif
-
-#ifndef CK_TEMPLATES_ONLY
-void _registerX10_test(void)
-{
-  static int _done = 0; if(_done) return; _done = 1;
-/* REG: message asyncMsg;
-*/
-CMessage_asyncMsg::__register("asyncMsg", sizeof(asyncMsg),(CkPackFnPtr) asyncMsg::pack,(CkUnpackFnPtr) asyncMsg::unpack);
-
-  CkRegisterReadonly("mainProxy","CProxy_Main",sizeof(mainProxy),(void *) &mainProxy,__xlater_roPup_mainProxy);
-
-  CkRegisterReadonly("nPlaces","int",sizeof(nPlaces),(void *) &nPlaces,__xlater_roPup_nPlaces);
-
-  CkRegisterReadonly("placesProxy","CProxy_Places",sizeof(placesProxy),(void *) &placesProxy,__xlater_roPup_placesProxy);
-
-/* REG: mainchare Main: Chare{
-Main(CkArgMsg* impl_msg);
-threaded void libThread(void);
-};
-*/
-  CkIndex_Main::__register("Main", sizeof(Main));
-
-/* REG: array Places: ArrayElement{
-Places(CkMigrateMessage* impl_msg);
-void Places(void);
-threaded void startAsync(int which_statement, const CkFutureID &ftHandle, int pe_src);
-};
-*/
-  CkIndex_Places::__register("Places", sizeof(Places));
-
-}
-extern "C" void CkRegisterMainModule(void) {
-  _registerX10_test();
-}
-#endif
index 38241dca42e0834943f3a55adcafe9abe897dddc..e011f7e89b7f3db8aba60fb225698f13f0effcf7 100644 (file)
@@ -7,12 +7,12 @@ all:
 
 test:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) test OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) test OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index ba1151633af9a2f56434ce4a5d9852f582e2465b..e0ec8c6eb228cb35c069a9096b8122001acfaaae 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun pgm +p4 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun pgm +p4 +x2 +y2 +z1
+       ./charmrun pgm +p4 +x2 +y2 +z1 $(TESTOPTS)
index 19843dff97ad1c2eac9c35f996cd3e9ae5d0b934..a2cd44ab732ba9f315cf2593090698b55eb6d73e 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun hello +p4 10 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun hello +p4 10 +x2 +y2 +z1
+       ./charmrun hello +p4 10 +x2 +y2 +z1 $(TESTOPTS)
index 2eada50804b8088f4e89da5c343b9c446e6176fb..6d54da97038d5587303f821edd8502c69741cf58 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./hello +p4 10 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./hello +p4 10 +x2 +y2 +z2
+       ./charmrun ./hello +p4 10 +x2 +y2 +z2 $(TESTOPTS)
index c70536c888e385a907b685dad36136d9858b2635..0fbdf49c9d1ac985d304f89e8dece11235134ce4 100644 (file)
@@ -15,7 +15,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index a771281b56d375f61cc77cc3d4476648272d8470..eccf5511c9024a8c496e914a8975947ac5c9bd9d 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./hello +p2 10 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./hello +p2 10 +x2 +y2 +z1
+       ./charmrun ./hello +p2 10 +x2 +y2 +z1 $(TESTOPTS)
index 2eada50804b8088f4e89da5c343b9c446e6176fb..6d54da97038d5587303f821edd8502c69741cf58 100644 (file)
@@ -20,4 +20,4 @@ test: all
        ./charmrun ./hello +p4 10 $(TESTOPTS)
 
 bgtest: all
-       ./charmrun ./hello +p4 10 +x2 +y2 +z2
+       ./charmrun ./hello +p4 10 +x2 +y2 +z2 $(TESTOPTS)
index 68c53ffb47c8afa805f8c8e08d64ba03e73e92f8..c7ce4e1b3dfad0f367b1cce457801876d8936a90 100644 (file)
@@ -18,5 +18,5 @@ test: pgm
        ./charmrun ./pgm +p2 1000 10 $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun ./pgm +p2 1000 10 +x2 +y2 +z1
+       ./charmrun ./pgm +p2 1000 10 +x2 +y2 +z1 $(TESTOPTS)
 
index c6141bb469b89cdd7983a0657726b2c07a8bf8b9..06efba17dd5f453df3513aaded52eff04180cce8 100644 (file)
@@ -10,7 +10,7 @@ test: pgm
        ./charmrun +p4 ./pgm 1000000 100 $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun +p4 ./pgm 1000000 100 +x2 +y2 +z2
+       ./charmrun +p4 ./pgm 1000000 100 +x2 +y2 +z2 $(TESTOPTS)
 
 
 # compile program with trace projections - performance trace library
index 5d2f23e47f4cf35448c5166c6909f2be9cc3f0da..77f1587fc2ab6aac94eea8045a88a761835e6e13 100644 (file)
@@ -12,7 +12,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index d4b8eeceb4953ddd7fae72f0754fca6f0da1bb8d..02c174a528ba496e54e8b3ddf402acdaca4836af 100644 (file)
@@ -9,7 +9,7 @@ test: pgm
        ./charmrun +p4 ./pgm 100000 100 $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun +p4 ./pgm 100000 100 +x2 +y2 +z1
+       ./charmrun +p4 ./pgm 100000 100 +x2 +y2 +z1 $(TESTOPTS)
 
 pgm.o : pgm.C pgm.h pgm.decl.h
        $(CHARMC) -c pgm.C
index 71c1af39303a634b6753b1f48abbe24047dc5738..8df4585aae49e62b0dcd4a0343ea4fff84c54162 100644 (file)
@@ -12,7 +12,7 @@ test: pgm
        ./charmrun +p4 ./pgm $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun +p4 ./pgm +x2 +y2 +z2
+       ./charmrun +p4 ./pgm +x2 +y2 +z2 $(TESTOPTS)
 
 clean:
        rm -f conv-host pgm *.def.h *.decl.h *.o *~ charmrun charmrun.exe pgm.exe pgm.pdb pgm.ilk
index 2ab74008f497cc248bf3051f61f2d7506e494afe..a8f3fbb1dc237ad40ea1a60c55d8e2028cd62d63 100644 (file)
@@ -12,7 +12,7 @@ test: pgm
        ./charmrun +p4 ./pgm $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun +p4 ./pgm +x2 +y2 +z2
+       ./charmrun +p4 ./pgm +x2 +y2 +z2 $(TESTOPTS)
 
 clean:
        rm -f conv-host pgm *.o *.decl.h *.def.h *~ charmrun charmrun.exe pgm.exe pgm.pdb pgm.ilk
index 2fb0ca7e8dadcb2bfee87b077aafa7a6b4a12c25..c26dbc0b732c747d9155ca63d8b8d38225f61762 100644 (file)
@@ -12,7 +12,7 @@ test: pgm
        ./charmrun +p4 ./pgm 8 $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun +p4 ./pgm 8 +x2 +y2 +z1
+       ./charmrun +p4 ./pgm 8 +x2 +y2 +z1 $(TESTOPTS)
 
 clean:
        rm -f conv-host pgm *.def.h *.decl.h *.o *~ charmrun charmrun.exe pgm.exe pgm.pdb pgm.ilk
index 31af8012357964039189f0c1b05e78c6aaa50f40..ae72278983f249eabfc383ba3585dc02676341ce 100644 (file)
@@ -7,7 +7,7 @@ all:
 
 test:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) test OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) test OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index 2b92a09754d61bff39a52dc82f1e72a19188e126..630751e28a755695b00baa2bd051d25c579c34e6 100644 (file)
@@ -135,6 +135,7 @@ CmiStartFn mymain()
     int otherPe = CmiMyPe() ^ 1;
     
 #if USE_PERSISTENT
+    if (CmiMyPe() < CmiNumPes())
     h = CmiCreatePersistent(otherPe, maxMsgSize+1024);
 #endif
     
index 44f0371066a5c5cb5c103cc41199e2651ed42d90..3b380bc28b051ceb888cab9f1ecfd1d08c8941f4 100644 (file)
@@ -12,7 +12,7 @@ test:
 
 bgtest:
        for d in $(DIRS); do \
-               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' || exit 1) || exit 1; \
+               (cd $$d; $(MAKE) bgtest OPTS='$(OPTS)' TESTOPTS='$(TESTOPTS)' || exit 1) || exit 1; \
        done
 
 clean:
index 9ef43269aa8a8eb2d32e13520e0cf9abdb215a65..d2be84b7243c9b87c91dbb0b7b1cf681a37ed63e 100644 (file)
@@ -16,7 +16,7 @@ test: pgm
        ./charmrun ./pgm +vp4 +p2 $(TESTOPTS)
 
 bgtest: pgm
-       ./charmrun ./pgm +vp4 +p2 +x2 +y2 +z1
+       ./charmrun ./pgm +vp4 +p2 +x2 +y2 +z1 $(TESTOPTS)
 
 clean:
        rm -f pgm fpgm *.o conv-host charmrun charmrun.exe pgm.exe pgm.pdb pgm.ilk
index 15d3ffd1ecebcc9c42fabc27bbab3a817e5c0591..38c8b7a8f515d93dcb41703cc70030c54a92c244 100644 (file)
@@ -897,9 +897,8 @@ void LrtsAbort(const char *message) {
  * In relations to some flags, some other delivery functions may be needed.
  */
 
-#if !CMK_MULTICAST_LIST_USE_COMMON_CODE
 
-void CmiSyncListSendFn(int npes, int *pes, int size, char *msg) {
+void LrtsSyncListSendFn(int npes, int *pes, int size, char *msg) {
     char *copymsg = CopyMsg(msg, size);
     CmiFreeListSendFn(npes, pes, size, copymsg);
 }
@@ -914,7 +913,7 @@ void CmiSyncListSendFn(int npes, int *pes, int size, char *msg) {
 #warning "Using Optimized Multicast"
 #endif
 
-void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
+void LrtsFreeListSendFn(int npes, int *pes, int size, char *msg) {
     CmiAssert(npes>=1);
     if (npes==1) {
         CmiFreeSendFn(pes[0], size, msg);
@@ -993,7 +992,12 @@ void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
 #endif
 #endif /* end of #if OPTIMIZED_MULTICAST */
 }
-#endif /* end of #if !CMK_MULTICAST_LIST_USE_COMMON_CODE */
+
+CmiCommHandle LrtsAsyncListSendFn(int npes, int *pes, int len, char *msg)
+{   /* Asynchronous list send is not implemented: every call aborts with the message below. */
+    CmiAbort("LrtsAsyncListSendFn not implemented.");
+    return 0;   /* placeholder handle; presumably unreachable if CmiAbort does not return */
+}
 
 /*********** End of MULTICAST/VECTOR SENDING FUNCTIONS **************/
 
index 4be2921f33093adaff94eab7fe796067f328af8b..85d83abdb6a211e9b24213b43215daee4da793a5 100644 (file)
@@ -9,7 +9,7 @@
    used in the memory files of converse */
 #define CMK_GETPAGESIZE_AVAILABLE                          1
 #define CMK_MEMORY_PAGESIZE                                8192
-#define CMK_MEMORY_PROTECTABLE                             1
+#define CMK_MEMORY_PROTECTABLE                             0
 
 /* defines which version of memory handlers should be used.
    used in conv-core/machine.c */
@@ -27,7 +27,7 @@
    one of them must be 1, all the others 0. The different implementations are in
    convserve.h Typically used are UNAVAILABLE for non SMP versions and
    POSIX_THREADS_SMP for SMP versions. The others are used only in special
-   cases: UNIPROCESSOR in sim and uth, PTHREADS in origin, EXEMPLAR in exemplar,
+   cases: UNIPROCESSOR in sim and uth, PTHREADS in origin,
    and NT_THREADS in windows. */
 #define CMK_SHARED_VARS_UNAVAILABLE                        1 /* non SMP versions */
 #define CMK_SHARED_VARS_POSIX_THREADS_SMP                  0 /* SMP versions */
@@ -52,6 +52,8 @@
 #define CMK_THREADS_USE_JCONTEXT                           0
 #define CMK_THREADS_USE_PTHREADS                           0
 
+#define CMK_USE_SPINLOCK                                   0
+
 /* Specifies what kind of timer to use, and the correspondent headers will be
    included in convcore.c. If none is selected, then the machine.c file needs to
    implement the timer primitives. */
index 7b84b7f6d432c4437974c170b6cd64bfd5513bc9..34a915a8557ea3df94552fddef56c910a472e241 100644 (file)
@@ -9,7 +9,11 @@
 
 #define CMK_HANDLE_SIGUSR                                  0
 
+#if CMK_ERROR_CHECKING
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt2 seq; unsigned char cksum, magic; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; 
+#else
 #define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; 
+#endif
 
 #define CMK_MSG_HEADER_BASIC  CMK_MSG_HEADER_EXT
 #define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
index 5bcb72e577d4eeefca9b6db2a830a0b2e00f037a..8401e97154aa140005c065756b5c83812c01edcf 100644 (file)
   * persist_machine_init  // machine specific initialization call
 */
 
-#define LRTS_GNI_RDMA_PUT_THRESHOLD  2048
-void LrtsSendPersistentMsg(PersistentHandle h, int destPE, int size, void *m)
+void LrtsSendPersistentMsg(PersistentHandle h, int destNode, int size, void *m)
 {
     gni_post_descriptor_t *pd;
     gni_return_t status;
     RDMA_REQUEST        *rdma_request_msg;
     
-    CmiAssert(h!=NULL);
     PersistentSendsTable *slot = (PersistentSendsTable *)h;
-    CmiAssert(slot->used == 1);
-    CmiAssert(slot->destPE == destPE);
+    if (h==NULL) {
+        printf("[%d] LrtsSendPersistentMsg: handle from node %d to node %d is NULL. \n", CmiMyPe(), myrank, destNode);
+        CmiAbort("LrtsSendPersistentMsg: not a valid PersistentHandle");
+    }
+    CmiAssert(CmiNodeOf(slot->destPE) == destNode);
     if (size > slot->sizeMax) {
-        CmiPrintf("size: %d sizeMax: %d\n", size, slot->sizeMax);
+        CmiPrintf("size: %d sizeMax: %d mype=%d destPe=%d\n", size, slot->sizeMax, CmiMyPe(), destNode);
         CmiAbort("Abort: Invalid size\n");
     }
 
-    /* CmiPrintf("[%d] LrtsSendPersistentMsg h=%p hdl=%d destPE=%d destAddress=%p size=%d\n", CmiMyPe(), h, CmiGetHandler(m), destPE, slot->destBuf[0].destAddress, size); */
-
     if (slot->destBuf[0].destAddress) {
+        // CmiPrintf("[%d] LrtsSendPersistentMsg h=%p hdl=%d destNode=%d destAddress=%p size=%d\n", CmiMyPe(), h, CmiGetHandler(m), destNode, slot->destBuf[0].destAddress, size);
+
         // uGNI part
         MallocPostDesc(pd);
-#if USE_LRTS_MEMPOOL
-        if(size <= 2048){
-#else
-        if(size <= 16384){
-#endif
+        if(size <= LRTS_GNI_RDMA_THRESHOLD) {
             pd->type            = GNI_POST_FMA_PUT;
         }
         else
         {
             pd->type            = GNI_POST_RDMA_PUT;
-#if USE_LRTS_MEMPOOL
-            pd->local_mem_hndl  = GetMemHndl(m);
-#else
-            status = MEMORY_REGISTER(onesided_hnd, nic_hndl,  m, size, &(pd->local_mem_hndl), &omdh);
-#endif
-            GNI_RC_CHECK("Mem Register before post", status);
         }
         pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
         pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
-        pd->length          = size;
+        pd->length          = ALIGN64(size);
         pd->local_addr      = (uint64_t) m;
        
         pd->remote_addr     = (uint64_t)slot->destBuf[0].destAddress;
         pd->remote_mem_hndl = slot->destBuf[0].mem_hndl;
         pd->src_cq_hndl     = 0;//post_tx_cqh;     /* smsg_tx_cqh;  */
         pd->rdma_mode       = 0;
+        pd->cqwrite_value   = PERSIST_SEQ;
+        pd->amo_cmd         = 0;
+
+#if CMK_WITH_STATS 
+        pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
+        SetMemHndlZero(pd->local_mem_hndl);
 
-        if(pd->type == GNI_POST_RDMA_PUT) 
-            status = GNI_PostRdma(ep_hndl_array[destPE], pd);
+        //TRACE_COMM_CREATION(CpvAccess(projTraceStart), (void*)pd->local_addr);
+         /* always buffer */
+#if CMK_SMP || 1
+#if REMOTE_EVENT
+        bufferRdmaMsg(destNode, pd, (int)(size_t)(slot->destHandle));
+#else
+        bufferRdmaMsg(destNode, pd, -1);
+#endif
+
+#else                      /* non smp */
+
+#if REMOTE_EVENT
+        pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+        int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, PERSIST_EVENT((int)(size_t)(slot->destHandle)));
+        GNI_RC_CHECK("GNI_EpSetEventData", sts);
+#endif
+        status = registerMessage((void*)(pd->local_addr), pd->length, pd->cqwrite_value, &pd->local_mem_hndl);
+        if (status == GNI_RC_SUCCESS) 
+        {
+#if CMK_WITH_STATS
+            RDMA_TRY_SEND(pd->type)
+#endif
+            if(pd->type == GNI_POST_RDMA_PUT) 
+                status = GNI_PostRdma(ep_hndl_array[destNode], pd);
+            else
+                status = GNI_PostFma(ep_hndl_array[destNode],  pd);
+        }
         else
-            status = GNI_PostFma(ep_hndl_array[destPE],  pd);
+            status = GNI_RC_ERROR_RESOURCE;
         if(status == GNI_RC_ERROR_RESOURCE|| status == GNI_RC_ERROR_NOMEM )
         {
-            MallocRdmaRequest(rdma_request_msg);
-            rdma_request_msg->destNode = destPE;
-            rdma_request_msg->pd = pd;
-            PCQueuePush(sendRdmaBuf, (char*)rdma_request_msg);
-        }else
+#if REMOTE_EVENT
+            bufferRdmaMsg(destNode, pd, (int)(size_t)(slot->destHandle));
+#else
+            bufferRdmaMsg(destNode, pd, -1);
+#endif
+        }
+        else {
             GNI_RC_CHECK("AFter posting", status);
-    }
+#if  CMK_WITH_STATS
+            pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+            RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
+        }
+#endif
+  }
   else {
 #if 1
     if (slot->messageBuf != NULL) {
@@ -179,15 +210,32 @@ int PumpPersistent()
 
 #endif
 
+#if ! LARGEPAGE
+#error "Persistent communication must be compiled with LARGEPAGE on"
+#endif
+
 void *PerAlloc(int size)
 {
-  return CmiAlloc(size);
+//  return CmiAlloc(size);
+  gni_return_t status;
+  void *res = NULL;
+  char *ptr;
+  size = ALIGN64(size + sizeof(CmiChunkHeader));
+  //printf("[%d] PerAlloc %p %p %d. \n", myrank, res, ptr, size);
+  res = mempool_malloc(CpvAccess(mempool), ALIGNBUF+size-sizeof(mempool_header), 1);
+  if (res) ptr = (char*)res - sizeof(mempool_header) + ALIGNBUF;
+  SIZEFIELD(ptr)=size;
+  REFFIELD(ptr)= PERSIST_SEQ;
+  return ptr;
 }
                                                                                 
 void PerFree(char *msg)
 {
-  //elan_CmiStaticFree(msg);
-  CmiFree(msg);
+#if CMK_SMP
+  mempool_free_thread((char*)msg - ALIGNBUF + sizeof(mempool_header));   /* undo PerAlloc's offset to recover the pointer mempool_malloc returned */
+#else
+  mempool_free(CpvAccess(mempool), (char*)msg - ALIGNBUF + sizeof(mempool_header));   /* same pointer recovery, released via CpvAccess(mempool) */
+#endif
 }
 
 /* machine dependent init call */
@@ -195,25 +243,86 @@ void persist_machine_init(void)
 {
 }
 
+void initSendSlot(PersistentSendsTable *slot)
+{   /* Reset every field of a send-table slot to its empty state. */
+  int i;   /* only referenced by the disabled (#if 0) loop below */
+  slot->destPE = -1;   /* -1: presumably "no destination yet" -- confirm against callers */
+  slot->sizeMax = 0;
+  slot->destHandle = 0; 
+#if 0
+  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
+    slot->destAddress[i] = NULL;
+    slot->destSizeAddress[i] = NULL;
+  }
+#endif
+  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);   /* zero all destBuf entries at once */
+  slot->messageBuf = 0;
+  slot->messageSize = 0;
+  slot->prev = slot->next = NULL;   /* no neighbours linked */
+}
+
+void initRecvSlot(PersistentReceivesTable *slot)
+{   /* Reset every field of a receive-table slot to its empty state. */
+  int i;   /* only referenced by the disabled (#if 0) loop below */
+#if 0
+  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
+    slot->messagePtr[i] = NULL;
+    slot->recvSizePtr[i] = NULL;
+  }
+#endif
+  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);   /* zero all destBuf entries at once */
+  slot->sizeMax = 0;
+  slot->index = -1;   /* index is declared size_t, so this stores the maximum size_t value */
+  slot->prev = slot->next = NULL;   /* no neighbours linked */
+}
+
 void setupRecvSlot(PersistentReceivesTable *slot, int maxBytes)
 {
   int i;
-  gni_return_t status;
   for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
     char *buf = PerAlloc(maxBytes+sizeof(int)*2);
     _MEMCHECK(buf);
     memset(buf, 0, maxBytes+sizeof(int)*2);
+      /* buf comes from the large-page mempool, so its memory is always registered already */
+    slot->destBuf[i].mem_hndl = GetMemHndl(buf);
     slot->destBuf[i].destAddress = buf;
-    /* note: assume first integer in elan converse header is the msg size */
+      /* note: assumes the first integer in the converse header is the msg size */
     slot->destBuf[i].destSizeAddress = (unsigned int*)buf;
-#if USE_LRTS_MEMPOOL
-    slot->destBuf[i].mem_hndl = GetMemHndl(buf);
-#else
-    status = MEMORY_REGISTER(onesided_hnd, nic_hndl,  buf, maxBytes+sizeof(int)*2 , &(slot->destBuf[i].mem_hndl), &omdh);
-    GNI_RC_CHECK("Mem Register before post", status);
-#endif
   }
   slot->sizeMax = maxBytes;
+#if REMOTE_EVENT
+#if !MULTI_THREAD_SEND
+  CmiLock(persistPool.lock);    /* lock taken here; with MULTI_THREAD_SEND the pool call presumably locks internally -- confirm */
+#endif
+  slot->index = IndexPool_getslot(&persistPool, slot, 2);   /* remember the pool index; clearRecvSlot releases it */
+#if !MULTI_THREAD_SEND
+  CmiUnlock(persistPool.lock);
+#endif
+#endif
 }
 
+void clearRecvSlot(PersistentReceivesTable *slot)
+{   /* Hand the slot's index back to persistPool; the body is empty unless REMOTE_EVENT is set. */
+#if REMOTE_EVENT
+#if !MULTI_THREAD_SEND
+  CmiLock(persistPool.lock);   /* explicit lock only when MULTI_THREAD_SEND is off */
+#endif
+  IndexPool_freeslot(&persistPool, slot->index);   /* releases the index stored by setupRecvSlot */
+#if !MULTI_THREAD_SEND
+  CmiUnlock(persistPool.lock);
+#endif
+#endif
+}
 
+PersistentHandle getPersistentHandle(PersistentHandle h, int toindex)
+{   /* Translate between a receive-slot pointer and its persistPool index; identity without REMOTE_EVENT. */
+#if REMOTE_EVENT
+  if (toindex)
+    return (PersistentHandle)(((PersistentReceivesTable*)h)->index);   /* pointer -> index, carried in the handle type */
+  else {
+    return (PersistentHandle)GetIndexAddress(persistPool, (int)(size_t)h);   /* index -> address looked up in persistPool */
+  }
+#else
+  return h;   /* no translation needed */
+#endif
+}
index 9ea6516ed5e20533164bbc3e66c34523198319bf..31a9bfa47f9637ab7e84a1e705531bc9acc4b953 100644 (file)
 
 #include "gni_pub.h"
 
+#define PERSIST_MIN_SIZE                SMSG_MAX_MSG
+
 #define PERSIST_BUFFERS_NUM             1
 
+#define PERSIST_SEQ                     0xFFFFFFF
+
+#define IS_PERSISTENT_MEMORY(ptr)          (REFFIELD(ptr) > PERSIST_SEQ/2)   /* nonzero when ptr's REFFIELD is above PERSIST_SEQ/2, i.e. carries the persistent tag */
+
 typedef struct  _PersistentBuf {
   void *destAddress;
   void *destSizeAddress;
@@ -25,7 +31,7 @@ typedef struct _PersistentSendsTable {
   PersistentBuf     destBuf[PERSIST_BUFFERS_NUM];
   void *messageBuf;
   int messageSize;
-  char used;
+  struct _PersistentSendsTable *prev, *next;
 } PersistentSendsTable;
 
 typedef struct _PersistentReceivesTable {
@@ -35,21 +41,25 @@ typedef struct _PersistentReceivesTable {
 #endif
   PersistentBuf     destBuf[PERSIST_BUFFERS_NUM];
   int sizeMax;
+  size_t               index;
   struct _PersistentReceivesTable *prev, *next;
 } PersistentReceivesTable;
 
-extern PersistentReceivesTable *persistentReceivesTableHead;
-extern PersistentReceivesTable *persistentReceivesTableTail;
+CpvExtern(PersistentReceivesTable *, persistentReceivesTableHead);
+CpvExtern(PersistentReceivesTable *, persistentReceivesTableTail);
 
-extern PersistentHandle  *phs;
-extern int phsSize;
-extern int curphs;
+CpvExtern(PersistentHandle *, phs);
+CpvExtern(int, phsSize);
+CpvExtern(int, curphs);
 
+PersistentHandle getPersistentHandle(PersistentHandle h, int toindex);
 void *PerAlloc(int size);
 void PerFree(char *msg);
 int PumpPersistent();
 void swapSendSlotBuffers(PersistentSendsTable *slot);
 void swapRecvSlotBuffers(PersistentReceivesTable *slot);
 void setupRecvSlot(PersistentReceivesTable *slot, int maxBytes);
+void clearRecvSlot(PersistentReceivesTable *slot);
 
 /*@}*/
+
index 92fc72347b46736ab0cf6b5816c6191b1b6c827b..01f84e35e9ee4b93a9b7284100fd240f2608180c 100644 (file)
@@ -24,6 +24,7 @@
     export CHARM_UGNI_MAX_MEMORY_ON_NODE=0.8G  # max memory per node for mempool
     export CHARM_UGNI_BIG_MSG_SIZE=4M          # set big message size protocol
     export CHARM_UGNI_BIG_MSG_PIPELINE_LEN=4   # set big message pipe len
+    export CHARM_UGNI_RDMA_MAX=100             # max pending RDMA operations
  */
 /*@{*/
 
 #include <malloc.h>
 #include <unistd.h>
 #include <time.h>
-
+#include <sys/dir.h>
+#include <sys/stat.h>
 #include <gni_pub.h>
 #include <pmi.h>
 //#include <numatoolkit.h>
 
 #include "converse.h"
 
-#define     LARGEPAGE              0
-
-#if LARGEPAGE
-#include <hugetlbfs.h>
-#endif
-
 #if CMK_DIRECT
 #include "cmidirect.h"
 #endif
 
+#define     LARGEPAGE              0
+
 #if CMK_SMP
 #define MULTI_THREAD_SEND          0
-#define COMM_THREAD_SEND           1
+#define COMM_THREAD_SEND           (!MULTI_THREAD_SEND)
 #endif
 
-#if CMK_SMP && COMM_THREAD_SEND
-#define PIGGYBACK_ACK              0
+#if MULTI_THREAD_SEND
+#define CMK_WORKER_SINGLE_TASK     0
 #endif
 
+#define REMOTE_EVENT               1
+#define CQWRITE                    0
+
 #define CMI_EXERT_SEND_CAP     0
 #define        CMI_EXERT_RECV_CAP      0
+#define CMI_EXERT_RDMA_CAP      0
 
 #if CMI_EXERT_SEND_CAP
-#define SEND_CAP 16
+int SEND_large_cap = 100;
+int SEND_large_pending = 0;
 #endif
 
 #if CMI_EXERT_RECV_CAP
-#define RECV_CAP 2
+#define RECV_CAP  4                  /* cap <= 2 sometimes hang */
 #endif
 
-#define USE_LRTS_MEMPOOL                  1
+#if CMI_EXERT_RDMA_CAP
+int   RDMA_cap =   100;
+int   RDMA_pending = 0;
+#endif
 
-#define REMOTE_EVENT                      0
+#define USE_LRTS_MEMPOOL                  1
 
-#define PRINT_SYH  0
+#define PRINT_SYH                         0
 
 // Trace communication thread
 #if CMK_TRACE_ENABLED && CMK_SMP_TRACE_COMMTHREAD
@@ -118,11 +124,11 @@ static CmiInt8 _mempool_size_limit = 0;
 static CmiInt8 _totalmem = 0.8*oneGB;
 
 #if LARGEPAGE
-static int BIG_MSG  =  16*oneMB;
-static int ONE_SEG  =  4*oneMB;
+static CmiInt8 BIG_MSG  =  16*oneMB;
+static CmiInt8 ONE_SEG  =  4*oneMB;
 #else
-static int BIG_MSG  =  4*oneMB;
-static int ONE_SEG  =  2*oneMB;
+static CmiInt8 BIG_MSG  =  4*oneMB;
+static CmiInt8 ONE_SEG  =  2*oneMB;
 #endif
 #if MULTI_THREAD_SEND
 static int BIG_MSG_PIPELINE = 1;
@@ -152,14 +158,16 @@ static CmiInt8  MAX_REG_MEM    =  25*oneMB;
 
 #endif     /* end USE_LRTS_MEMPOOL */
 
-#if MULTI_THREAD_SEND
-#define     CMI_GNI_LOCK        CmiLock(tx_cq_lock);
-#define     CMI_GNI_UNLOCK        CmiUnlock(tx_cq_lock);
+#if MULTI_THREAD_SEND 
+#define     CMI_GNI_LOCK(x)       CmiLock(x);
+#define     CMI_GNI_TRYLOCK(x)       CmiTryLock(x)
+#define     CMI_GNI_UNLOCK(x)        CmiUnlock(x);
 #define     CMI_PCQUEUEPOP_LOCK(Q)   CmiLock((Q)->lock);
 #define     CMI_PCQUEUEPOP_UNLOCK(Q)    CmiUnlock((Q)->lock);
 #else
-#define     CMI_GNI_LOCK
-#define     CMI_GNI_UNLOCK
+#define     CMI_GNI_LOCK(x)
+#define     CMI_GNI_TRYLOCK(x)         (0)
+#define     CMI_GNI_UNLOCK(x)
 #define     CMI_PCQUEUEPOP_LOCK(Q)   
 #define     CMI_PCQUEUEPOP_UNLOCK(Q)
 #endif
@@ -176,7 +184,7 @@ static int _detected_hang = 0;
 #define             SMSG_ATTR_SIZE      sizeof(gni_smsg_attr_t)
 
 // dynamic SMSG
-static int useDynamicSMSG  =0;               /* dynamic smsgs setup */
+static int useDynamicSMSG = 0;               /* dynamic smsgs setup */
 
 static int avg_smsg_connection = 32;
 static int                 *smsg_connected_flag= 0;
@@ -227,20 +235,21 @@ onesided_md_t    omdh;
 
 #else
 uint8_t   onesided_hnd, omdh;
-#if REMOTE_EVENT
-#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, status)    if(register_memory_size+size>= MAX_REG_MEM) { \
-         status = GNI_RC_ERROR_NOMEM;} \
-        else {status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, smsg_rx_cqh,  GNI_MEM_READWRITE, -1, mem_hndl); \
-                if(status == GNI_RC_SUCCESS) register_memory_size += size; }  
+
+#if REMOTE_EVENT || CQWRITE 
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdhh, cqh, status) \
+    if(register_memory_size+size>= MAX_REG_MEM) { \
+        status = GNI_RC_ERROR_NOMEM;} \
+    else {status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, cqh,  GNI_MEM_READWRITE, -1, mem_hndl); \
+        if(status == GNI_RC_SUCCESS) register_memory_size += size; }  
 #else
-#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, status ) \
-    do {   \
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, cqh, status ) \
         if (register_memory_size + size >= MAX_REG_MEM) { \
             status = GNI_RC_ERROR_NOMEM; \
         } else { status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, NULL,  GNI_MEM_READWRITE, -1, mem_hndl); \
-            if(status == GNI_RC_SUCCESS) register_memory_size += size; } \
-    } while(0)
+            if(status == GNI_RC_SUCCESS) register_memory_size += size; } 
 #endif
+
 #define  MEMORY_DEREGISTER(handler, nic_hndl, mem_hndl, myomdh, size)  \
     do { if (GNI_MemDeregister(nic_hndl, (mem_hndl) ) == GNI_RC_SUCCESS) \
              register_memory_size -= size; \
@@ -248,23 +257,23 @@ uint8_t   onesided_hnd, omdh;
     } while (0)
 #endif
 
-#define   GetMempoolBlockPtr(x)  (((mempool_header*)((char*)(x)-ALIGNBUF))->block_ptr)
-#define   GetMempoolPtr(x)        GetMempoolBlockPtr(x)->mptr
-#define   GetMempoolsize(x)       GetMempoolBlockPtr(x)->size
-#define   GetMemHndl(x)           GetMempoolBlockPtr(x)->mem_hndl
-#define   IncreaseMsgInRecv(x)    (GetMempoolBlockPtr(x)->msgs_in_recv)++
-#define   DecreaseMsgInRecv(x)    (GetMempoolBlockPtr(x)->msgs_in_recv)--
-#define   IncreaseMsgInSend(x)    (GetMempoolBlockPtr(x)->msgs_in_send)++
-#define   DecreaseMsgInSend(x)    (GetMempoolBlockPtr(x)->msgs_in_send)--
-#define   NoMsgInSend(x)          GetMempoolBlockPtr(x)->msgs_in_send == 0
-#define   NoMsgInRecv(x)          GetMempoolBlockPtr(x)->msgs_in_recv == 0
-#define   NoMsgInFlight(x)        (GetMempoolBlockPtr(x)->msgs_in_send + GetMempoolBlockPtr(x)->msgs_in_recv  == 0)
+#define   GetMempoolBlockPtr(x)   MEMPOOL_GetBlockPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMempoolPtr(x)        MEMPOOL_GetMempoolPtr(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMempoolsize(x)       MEMPOOL_GetSize(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   GetMemHndl(x)           MEMPOOL_GetMemHndl(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   IncreaseMsgInRecv(x)    MEMPOOL_IncMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   DecreaseMsgInRecv(x)    MEMPOOL_DecMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   IncreaseMsgInSend(x)    MEMPOOL_IncMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   DecreaseMsgInSend(x)    MEMPOOL_DecMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF))
+#define   NoMsgInSend(x)          MEMPOOL_GetMsgInSend(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) == 0
+#define   NoMsgInRecv(x)          MEMPOOL_GetMsgInRecv(MEMPOOL_GetMempoolHeader(x,ALIGNBUF)) == 0
+#define   NoMsgInFlight(x)        (NoMsgInSend(x) && NoMsgInRecv(x))
 #define   IsMemHndlZero(x)        ((x).qword1 == 0 && (x).qword2 == 0)
 #define   SetMemHndlZero(x)       do {(x).qword1 = 0;(x).qword2 = 0;} while (0)
-#define   NotRegistered(x)        IsMemHndlZero(((block_header*)x)->mem_hndl)
+#define   NotRegistered(x)        IsMemHndlZero(GetMemHndl(x))
 
-#define   GetMemHndlFromBlockHeader(x) ((block_header*)x)->mem_hndl
-#define   GetSizeFromBlockHeader(x)    ((block_header*)x)->size
+#define   GetMemHndlFromBlockHeader(x) MEMPOOL_GetBlockMemHndl(x)
+#define   GetSizeFromBlockHeader(x)    MEMPOOL_GetBlockSize(x)
 
 #define CmiGetMsgSize(m)     ((CmiMsgHeaderExt*)m)->size
 #define CmiSetMsgSize(m,s)   ((((CmiMsgHeaderExt*)m)->size)=(s))
@@ -281,11 +290,17 @@ uint8_t   onesided_hnd, omdh;
 
 /* If SMSG is used */
 static int  SMSG_MAX_MSG = 1024;
-#define SMSG_MAX_CREDIT 72 
+#define SMSG_MAX_CREDIT    72
 
 #define MSGQ_MAXSIZE       2048
+
 /* large message transfer with FMA or BTE */
+#if ! REMOTE_EVENT
 #define LRTS_GNI_RDMA_THRESHOLD  1024 
+#else
+   /* remote events only work with RDMA */
+#define LRTS_GNI_RDMA_THRESHOLD  0 
+#endif
 
 #if CMK_SMP
 static int  REMOTE_QUEUE_ENTRIES=163840; 
@@ -301,17 +316,15 @@ static int LOCAL_QUEUE_ENTRIES=20480;
 #define ACK_TAG                 0x30
 /* SMSG is data message */
 #define SMALL_DATA_TAG          0x31
-#define SMALL_DATA_ACK_TAG      0x32
 /* SMSG is a control message to initialize a BTE */
 #define LMSG_INIT_TAG           0x39 
-#define LMSG_INIT_ACK_TAG       0x3a 
 
 #define DEBUG
 #ifdef GNI_RC_CHECK
 #undef GNI_RC_CHECK
 #endif
 #ifdef DEBUG
-#define GNI_RC_CHECK(msg,rc) do { if(rc != GNI_RC_SUCCESS) {           printf("[%d] %s; err=%s\n",CmiMyPe(),msg,gni_err_str[rc]); CmiAbort("GNI_RC_CHECK"); } } while(0)
+#define GNI_RC_CHECK(msg,rc) do { if(rc != GNI_RC_SUCCESS) {           printf("[%d] %s; err=%s\n",CmiMyPe(),msg,gni_err_str[rc]); fflush(stdout); CmiAbort("GNI_RC_CHECK"); } } while(0)
 #else
 #define GNI_RC_CHECK(msg,rc)
 #endif
@@ -357,17 +370,21 @@ gni_msgq_ep_attr_t      msgq_ep_attrs_size;
 static int cookie;
 static int modes = 0;
 static gni_cq_handle_t       smsg_rx_cqh = NULL;
-static gni_cq_handle_t       smsg_tx_cqh = NULL;
-static gni_cq_handle_t       post_rx_cqh = NULL;
+static gni_cq_handle_t       default_tx_cqh = NULL;
+static gni_cq_handle_t       rdma_tx_cqh = NULL;
+static gni_cq_handle_t       rdma_rx_cqh = NULL;
 static gni_cq_handle_t       post_tx_cqh = NULL;
 static gni_ep_handle_t       *ep_hndl_array;
-#if MULTI_THREAD_SEND
+
 static CmiNodeLock           *ep_lock_array;
-static CmiNodeLock           tx_cq_lock; 
+static CmiNodeLock           default_tx_cq_lock; 
+static CmiNodeLock           rdma_tx_cq_lock; 
+static CmiNodeLock           global_gni_lock; 
 static CmiNodeLock           rx_cq_lock;
+static CmiNodeLock           smsg_mailbox_lock;
+static CmiNodeLock           smsg_rx_cq_lock;
 static CmiNodeLock           *mempool_lock;
-#endif
-
+//#define     CMK_WITH_STATS      1
 typedef struct msg_list
 {
     uint32_t destNode;
@@ -377,6 +394,9 @@ typedef struct msg_list
 #if !CMK_SMP
     struct msg_list *next;
 #endif
+#if CMK_WITH_STATS
+    double  creation_time;
+#endif
 }MSG_LIST;
 
 
@@ -386,6 +406,9 @@ typedef struct control_msg
     uint64_t            dest_addr;      /* address from the start of buffer */
     int                 total_length;   /* total length */
     int                 length;         /* length of this packet */
+#if REMOTE_EVENT
+    int                 ack_index;      /* index from integer to address */
+#endif
     uint8_t             seq_id;         //big message   0 meaning single message
     gni_mem_handle_t    source_mem_hndl;
     struct control_msg *next;
@@ -435,6 +458,9 @@ void CmiDirectInit()
 typedef struct  rmda_msg
 {
     int                   destNode;
+#if REMOTE_EVENT
+    int                   ack_index;
+#endif
     gni_post_descriptor_t *pd;
 #if !CMK_SMP
     struct  rmda_msg      *next;
@@ -443,7 +469,7 @@ typedef struct  rmda_msg
 
 
 #if CMK_SMP
-#define SMP_LOCKS               0
+#define SMP_LOCKS                       0
 #define ONE_SEND_QUEUE                  0
 PCQueue sendRdmaBuf;
 typedef struct  msg_list_index
@@ -484,9 +510,6 @@ typedef struct smsg_queue
 #endif
 
 SMSG_QUEUE                  smsg_queue;
-#if PIGGYBACK_ACK
-SMSG_QUEUE                  smsg_ack_queue;
-#endif
 #if CMK_USE_OOB
 SMSG_QUEUE                  smsg_oob_queue;
 #endif
@@ -628,45 +651,363 @@ static MSG_LIST *buffered_fma_tail = 0;
 
 CpvDeclare(mempool_type*, mempool);
 
+#if REMOTE_EVENT
+/* ack pool for remote events */
+
+static int  SHIFT   =           18;
+#define INDEX_MASK              ((1<<(32-SHIFT-1)) - 1)
+#define RANK_MASK               ((1<<SHIFT) - 1)
+#define ACK_EVENT(idx)          ((((idx) & INDEX_MASK)<<SHIFT) | myrank)
+
+#define GET_TYPE(evt)           (((evt) >> 31) & 1)
+#define GET_RANK(evt)           ((evt) & RANK_MASK)
+#define GET_INDEX(evt)          (((evt) >> SHIFT) & INDEX_MASK)
+
+#define PERSIST_EVENT(idx)      ( (1<<31) | (((idx) & INDEX_MASK)<<SHIFT) | myrank)
+
+#if CMK_SMP
+#define INIT_SIZE                4096
+#else
+#define INIT_SIZE                1024
+#endif
+
+struct IndexStruct {     /* one slot of an IndexPool */
+void *addr;              /* address registered for this slot by IndexPool_getslot() */
+int next;                /* index of the next free slot, -1 at the end of the free list */
+int type;     // 0: free   1: ACK   2: Persistent
+};
+
+typedef struct IndexPool {              /* growable table mapping small integer indexes to addresses */
+    struct IndexStruct   *indexes;      /* slot array, holds `size` entries */
+    int                   size;         /* current number of slots */
+    int                   freehead;     /* index of the first free slot, -1 when none is free */
+    CmiNodeLock           lock;         /* created in IndexPool_init() under MULTI_THREAD_SEND or CMK_PERSISTENT_COMM, otherwise 0 */
+} IndexPool;
+
+static IndexPool  ackPool;
+#if CMK_PERSISTENT_COMM
+static IndexPool  persistPool;
+#endif
+
+#define  GetIndexType(pool, s)             (pool.indexes[s].type)
+#define  GetIndexAddress(pool, s)          (pool.indexes[s].addr)
+
+static void IndexPool_init(IndexPool *pool)   /* set up pool with INIT_SIZE free slots chained through .next */
+{
+    int i;
+    if ((1<<SHIFT) < mysize)      /* a rank has to fit in the low SHIFT bits of an event id (see RANK_MASK) */
+        CmiAbort("Charm++ Error: Remote event's rank field overflow.");
+    pool->size = INIT_SIZE;
+    if ( (1<<(31-SHIFT)) < pool->size) CmiAbort("IndexPool_init: pool initial size is too big.");
+    pool->indexes = (struct IndexStruct *)malloc(pool->size*sizeof(struct IndexStruct));
+    for (i=0; i<pool->size; i++) {
+        pool->indexes[i].next = i+1;
+        pool->indexes[i].type = 0;     /* every slot starts free, including the last one, which IndexPool_getslot() asserts on */
+    }
+    pool->indexes[pool->size-1].next = -1;   /* -1 terminates the free list */
+    pool->freehead = 0;
+#if MULTI_THREAD_SEND || CMK_PERSISTENT_COMM
+    pool->lock  = CmiCreateLock();
+#else
+    pool->lock  = 0;
+#endif
+}
+
+static
+inline int IndexPool_getslot(IndexPool *pool, void *addr, int type)   /* take a free slot, record addr and type (1 or 2) in it, return its index */
+{
+    int s, i;
+#if MULTI_THREAD_SEND  
+    CmiLock(pool->lock);     /* in other builds this function takes no lock; setupRecvSlot() locks around its call */
+#endif
+    s = pool->freehead;
+    if (s == -1) {           /* free list is empty: double the table */
+        int newsize = pool->size * 2;
+        //printf("[%d] IndexPool_getslot %p expand to: %d\n", myrank, pool, newsize);
+        if (newsize > (1<<(32-SHIFT-1))) CmiAbort("IndexPool too large");   /* same bound as INDEX_MASK + 1 */
+        struct IndexStruct *old_ackpool = pool->indexes;
+        pool->indexes = (struct IndexStruct *)malloc(newsize*sizeof(struct IndexStruct));
+        memcpy(pool->indexes, old_ackpool, pool->size*sizeof(struct IndexStruct));   /* existing slots keep their indexes */
+        for (i=pool->size; i<newsize-1; i++) {   /* chain the new upper half into a free list */
+            pool->indexes[i].next = i+1;
+            pool->indexes[i].type = 0;
+        }
+        pool->indexes[i].next = -1;
+        pool->indexes[i].type = 0;
+        pool->freehead = pool->size;
+        s = pool->size;          /* first slot of the new half is the one handed out */
+        pool->size = newsize;
+        free(old_ackpool);
+    }
+    pool->freehead = pool->indexes[s].next;   /* unlink s from the free list */
+    pool->indexes[s].addr = addr;
+    CmiAssert(pool->indexes[s].type == 0 && (type == 1 || type == 2));   /* slot must be free and the requested type valid */
+    pool->indexes[s].type = type;
+#if MULTI_THREAD_SEND
+    CmiUnlock(pool->lock);
+#endif
+    return s;
+}
+
+static
+inline  void IndexPool_freeslot(IndexPool *pool, int s)   /* push slot s back onto the head of pool's free list */
+{
+    CmiAssert(s>=0 && s<pool->size);
+#if MULTI_THREAD_SEND
+    CmiLock(pool->lock);     /* in other builds no lock is taken here; clearRecvSlot() locks around its call */
+#endif
+    pool->indexes[s].next = pool->freehead;
+    pool->indexes[s].type = 0;     /* 0 marks the slot free; the stored addr is left as is */
+    pool->freehead = s;
+#if MULTI_THREAD_SEND
+    CmiUnlock(pool->lock);
+#endif
+}
+
+
+#endif
+
+/* =====Beginning of Definitions of Message-Corruption Related Macros=====*/
+#define CMI_MAGIC(msg)                   ((CmiMsgHeaderBasic *)msg)->magic
+#define CHARM_MAGIC_NUMBER               126
+
+#if CMK_ERROR_CHECKING
+extern unsigned char computeCheckSum(unsigned char *data, int len);
+static int checksum_flag = 0;
+#define CMI_SET_CHECKSUM(msg, len)      \
+        if (checksum_flag)  {   \
+          ((CmiMsgHeaderBasic *)msg)->cksum = 0;        \
+          ((CmiMsgHeaderBasic *)msg)->cksum = computeCheckSum((unsigned char*)msg, len);        \
+        }
+#define CMI_CHECK_CHECKSUM(msg, len)    \
+        if (checksum_flag)      \
+          if (computeCheckSum((unsigned char*)msg, len) != 0)   \
+            CmiAbort("Fatal error: checksum doesn't agree!\n");
+#else
+#define CMI_SET_CHECKSUM(msg, len)
+#define CMI_CHECK_CHECKSUM(msg, len)
+#endif
+/* =====End of Definitions of Message-Corruption Related Macros=====*/
+
+static int print_stats = 0;
+static int stats_off = 0;
+void CmiTurnOnStats()   /* clear stats_off so the counting macros gated on (print_stats && !stats_off) record again */
+{
+    stats_off = 0;
+    //CmiPrintf("[%d][%d:%d]+++++++++++ turning on stats \n", CmiMyNode(), CmiMyPe(), CmiMyRank());
+}
+
+void CmiTurnOffStats()   /* set stats_off so the counting macros gated on (print_stats && !stats_off) stop recording */
+{
+    stats_off = 1;
+}
+
+#define IS_PUT(type)    (type == GNI_POST_FMA_PUT || type == GNI_POST_RDMA_PUT)
+
 #if CMK_WITH_STATS
+FILE *counterLog = NULL;
 typedef struct comm_thread_stats
 {
-int      count_in_send_buffered_ack;
-double   time_in_send_buffered_ack;
-double   max_time_in_send_buffered_ack;
-int      count_in_send_buffered_smsg;
-double   time_in_send_buffered_smsg;
-double   max_time_in_send_buffered_smsg;
+    uint64_t  smsg_data_count;
+    uint64_t  lmsg_init_count;
+    uint64_t  ack_count;
+    uint64_t  big_msg_ack_count;
+    uint64_t  smsg_count;
+    uint64_t  direct_put_done_count;
+    uint64_t  put_done_count;
+    //times of calling SmsgSend
+    uint64_t  try_smsg_data_count;
+    uint64_t  try_lmsg_init_count;
+    uint64_t  try_ack_count;
+    uint64_t  try_big_msg_ack_count;
+    uint64_t  try_direct_put_done_count;
+    uint64_t  try_put_done_count;
+    uint64_t  try_smsg_count;
+    
+    double    max_time_in_send_buffered_smsg;
+    double    all_time_in_send_buffered_smsg;
+
+    uint64_t  rdma_get_count, rdma_put_count;
+    uint64_t  try_rdma_get_count, try_rdma_put_count;
+    double    max_time_from_control_to_rdma_init;
+    double    all_time_from_control_to_rdma_init;
+
+    double    max_time_from_rdma_init_to_rdma_done;
+    double    all_time_from_rdma_init_to_rdma_done;
+
+    int      count_in_PumpNetwork;
+    double   time_in_PumpNetwork;
+    double   max_time_in_PumpNetwork;
+    int      count_in_SendBufferMsg_smsg;
+    double   time_in_SendBufferMsg_smsg;
+    double   max_time_in_SendBufferMsg_smsg;
+    int      count_in_SendRdmaMsg;
+    double   time_in_SendRdmaMsg;
+    double   max_time_in_SendRdmaMsg;
+    int      count_in_PumpRemoteTransactions;
+    double   time_in_PumpRemoteTransactions;
+    double   max_time_in_PumpRemoteTransactions;
+    int      count_in_PumpLocalTransactions_rdma;
+    double   time_in_PumpLocalTransactions_rdma;
+    double   max_time_in_PumpLocalTransactions_rdma;
+    int      count_in_PumpDatagramConnection;
+    double   time_in_PumpDatagramConnection;
+    double   max_time_in_PumpDatagramConnection;
 } Comm_Thread_Stats;
 
 static Comm_Thread_Stats   comm_stats;
 
+static char *counters_dirname = "counters";
+
 static void init_comm_stats()
 {
   memset(&comm_stats, 0, sizeof(Comm_Thread_Stats));
+  if (print_stats){
+      char ln[200];
+      int code = mkdir(counters_dirname, 00777); 
+      sprintf(ln,"%s/statistics.%d.%d", counters_dirname, mysize, myrank);
+      counterLog=fopen(ln,"w");
+      if (counterLog == NULL) CmiAbort("Counter files open failed");
+  }
 }
 
-#define STATS_ACK_TIME(x)   \
+#define SMSG_CREATION( x ) if(print_stats) { x->creation_time = CmiWallTimer(); }
+
+#define SMSG_SENT_DONE(creation_time, tag)  \
+        if (print_stats && !stats_off) {   if( tag == SMALL_DATA_TAG) comm_stats.smsg_data_count++;  \
+            else  if( tag == LMSG_INIT_TAG) comm_stats.lmsg_init_count++;  \
+            else  if( tag == ACK_TAG) comm_stats.ack_count++;  \
+            else  if( tag == BIG_MSG_TAG) comm_stats.big_msg_ack_count++;  \
+            else  if( tag == PUT_DONE_TAG ) comm_stats.put_done_count++;  \
+            else  if( tag == DIRECT_PUT_DONE_TAG ) comm_stats.direct_put_done_count++;  \
+            comm_stats.smsg_count++; \
+            double inbuff_time = CmiWallTimer() - creation_time;   \
+            if(inbuff_time > comm_stats.max_time_in_send_buffered_smsg) comm_stats.max_time_in_send_buffered_smsg= inbuff_time; \
+            comm_stats.all_time_in_send_buffered_smsg += inbuff_time;  \
+        }
+
+#define SMSG_TRY_SEND(tag)  \
+        if (print_stats && !stats_off){   if( tag == SMALL_DATA_TAG) comm_stats.try_smsg_data_count++;  \
+            else  if( tag == LMSG_INIT_TAG) comm_stats.try_lmsg_init_count++;  \
+            else  if( tag == ACK_TAG) comm_stats.try_ack_count++;  \
+            else  if( tag == BIG_MSG_TAG) comm_stats.try_big_msg_ack_count++;  \
+            else  if( tag == PUT_DONE_TAG ) comm_stats.try_put_done_count++;  \
+            else  if( tag == DIRECT_PUT_DONE_TAG ) comm_stats.try_direct_put_done_count++;  \
+            comm_stats.try_smsg_count++; \
+        }
+
+#define  RDMA_TRY_SEND(type)        if (print_stats && !stats_off) {IS_PUT(type)?comm_stats.try_rdma_put_count++:comm_stats.try_rdma_get_count++;}
+
+#define  RDMA_TRANS_DONE(x)      \
+         if (print_stats && !stats_off) {  double rdma_trans_time = CmiWallTimer() - x ; \
+             if(rdma_trans_time > comm_stats.max_time_from_rdma_init_to_rdma_done) comm_stats.max_time_from_rdma_init_to_rdma_done = rdma_trans_time; \
+             comm_stats.all_time_from_rdma_init_to_rdma_done += rdma_trans_time; \
+         }
+
+#define  RDMA_TRANS_INIT(type, x)      \
+         if (print_stats && !stats_off) {   IS_PUT(type)?comm_stats.rdma_put_count++:comm_stats.rdma_get_count++;  \
+             double rdma_trans_time = CmiWallTimer() - x ; \
+             if(rdma_trans_time > comm_stats.max_time_from_control_to_rdma_init) comm_stats.max_time_from_control_to_rdma_init = rdma_trans_time; \
+             comm_stats.all_time_from_control_to_rdma_init += rdma_trans_time; \
+         }
+
+#define STATS_PUMPNETWORK_TIME(x)   \
         { double t = CmiWallTimer(); \
           x;        \
           t = CmiWallTimer() - t;          \
-          comm_stats.count_in_send_buffered_ack ++;        \
-          comm_stats.time_in_send_buffered_ack += t;   \
-          if (t>comm_stats.max_time_in_send_buffered_ack)      \
-              comm_stats.max_time_in_send_buffered_ack = t;    \
+          comm_stats.count_in_PumpNetwork++;        \
+          comm_stats.time_in_PumpNetwork += t;   \
+          if (t>comm_stats.max_time_in_PumpNetwork)      \
+              comm_stats.max_time_in_PumpNetwork = t;    \
+        }
+
+#define STATS_PUMPREMOTETRANSACTIONS_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpRemoteTransactions ++;        \
+          comm_stats.time_in_PumpRemoteTransactions += t;   \
+          if (t>comm_stats.max_time_in_PumpRemoteTransactions)      \
+              comm_stats.max_time_in_PumpRemoteTransactions = t;    \
+        }
+
+#define STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpLocalTransactions_rdma ++;        \
+          comm_stats.time_in_PumpLocalTransactions_rdma += t;   \
+          if (t>comm_stats.max_time_in_PumpLocalTransactions_rdma)      \
+              comm_stats.max_time_in_PumpLocalTransactions_rdma = t;    \
+        }
+
+#define STATS_SEND_SMSGS_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_SendBufferMsg_smsg ++;        \
+          comm_stats.time_in_SendBufferMsg_smsg += t;   \
+          if (t>comm_stats.max_time_in_SendBufferMsg_smsg)      \
+              comm_stats.max_time_in_SendBufferMsg_smsg = t;    \
+        }
+
+#define STATS_SENDRDMAMSG_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_SendRdmaMsg ++;        \
+          comm_stats.time_in_SendRdmaMsg += t;   \
+          if (t>comm_stats.max_time_in_SendRdmaMsg)      \
+              comm_stats.max_time_in_SendRdmaMsg = t;    \
+        }
+
+#define STATS_PUMPDATAGRAMCONNECTION_TIME(x)   \
+        { double t = CmiWallTimer(); \
+          x;        \
+          t = CmiWallTimer() - t;          \
+          comm_stats.count_in_PumpDatagramConnection ++;        \
+          comm_stats.time_in_PumpDatagramConnection += t;   \
+          if (t>comm_stats.max_time_in_PumpDatagramConnection)      \
+              comm_stats.max_time_in_PumpDatagramConnection = t;    \
         }
 
 static void print_comm_stats()
 {
-    printf("PE[%d]  count/time in send buffered ack:   %d %f\n",  myrank, comm_stats.count_in_send_buffered_ack, comm_stats.time_in_send_buffered_ack);
-    printf("PE[%d]  max time in send buffered ack:     %f\n",  myrank, comm_stats.max_time_in_send_buffered_ack);
+    fprintf(counterLog, "Node[%d] SMSG time in buffer\t[total:%f\tmax:%f\tAverage:%f](milisecond)\n", myrank, 1000.0*comm_stats.all_time_in_send_buffered_smsg, 1000.0*comm_stats.max_time_in_send_buffered_smsg, 1000.0*comm_stats.all_time_in_send_buffered_smsg/comm_stats.smsg_count);
+    fprintf(counterLog, "Node[%d] Smsg  Msgs  \t[Total:%lld\t Data:%lld\t Lmsg_Init:%lld\t ACK:%lld\t BIG_MSG_ACK:%lld Direct_put_done:%lld\t Persistent_put_done:%lld]\n", myrank, 
+            comm_stats.smsg_count, comm_stats.smsg_data_count, comm_stats.lmsg_init_count, 
+            comm_stats.ack_count, comm_stats.big_msg_ack_count, comm_stats.direct_put_done_count, comm_stats.put_done_count);
+    
+    fprintf(counterLog, "Node[%d] SmsgSendCalls\t[Total:%lld\t Data:%lld\t Lmsg_Init:%lld\t ACK:%lld\t BIG_MSG_ACK:%lld Direct_put_done:%lld\t Persistent_put_done:%lld]\n\n", myrank, 
+            comm_stats.try_smsg_count, comm_stats.try_smsg_data_count, comm_stats.try_lmsg_init_count, 
+            comm_stats.try_ack_count, comm_stats.try_big_msg_ack_count, comm_stats.try_direct_put_done_count, comm_stats.try_put_done_count);
+
+    fprintf(counterLog, "Node[%d] Rdma Transaction [count (GET/PUT):%lld %lld\t calls (GET/PUT):%lld %lld]\n", myrank, comm_stats.rdma_get_count, comm_stats.rdma_put_count, comm_stats.try_rdma_get_count, comm_stats.try_rdma_put_count);
+    fprintf(counterLog, "Node[%d] Rdma time from control arrives to rdma init [Total:%f\tMAX:%f\t Average:%f](milisecond)\n", myrank, 1000.0*comm_stats.all_time_from_control_to_rdma_init, 1000.0*comm_stats.max_time_from_control_to_rdma_init, 1000.0*comm_stats.all_time_from_control_to_rdma_init/(comm_stats.rdma_get_count+comm_stats.rdma_put_count)); 
+    fprintf(counterLog, "Node[%d] Rdma time from init to rdma done [Total:%f\tMAX:%f\t Average:%f](milisecond)\n\n", myrank,1000.0*comm_stats.all_time_from_rdma_init_to_rdma_done, 1000.0*comm_stats.max_time_from_rdma_init_to_rdma_done, 1000.0*comm_stats.all_time_from_rdma_init_to_rdma_done/(comm_stats.rdma_get_count+comm_stats.rdma_put_count));
+
+
+    fprintf(counterLog, "                             count\ttotal(s)\tmax(s)\taverage(us)\n");
+    fprintf(counterLog, "PumpNetworkSmsg:              %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpNetwork, comm_stats.time_in_PumpNetwork, comm_stats.max_time_in_PumpNetwork, comm_stats.time_in_PumpNetwork*1e6/comm_stats.count_in_PumpNetwork);
+    fprintf(counterLog, "PumpRemoteTransactions:       %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpRemoteTransactions, comm_stats.time_in_PumpRemoteTransactions, comm_stats.max_time_in_PumpRemoteTransactions, comm_stats.time_in_PumpRemoteTransactions*1e6/comm_stats.count_in_PumpRemoteTransactions);
+    fprintf(counterLog, "PumpLocalTransactions(RDMA):  %d\t%.6f\t%.6f\t%.6f\n", comm_stats.count_in_PumpLocalTransactions_rdma, comm_stats.time_in_PumpLocalTransactions_rdma, comm_stats.max_time_in_PumpLocalTransactions_rdma, comm_stats.time_in_PumpLocalTransactions_rdma*1e6/comm_stats.count_in_PumpLocalTransactions_rdma);
+    fprintf(counterLog, "SendBufferMsg (SMSG):         %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_SendBufferMsg_smsg, comm_stats.time_in_SendBufferMsg_smsg, comm_stats.max_time_in_SendBufferMsg_smsg, comm_stats.time_in_SendBufferMsg_smsg*1e6/comm_stats.count_in_SendBufferMsg_smsg);
+    fprintf(counterLog, "SendRdmaMsg:                  %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_SendRdmaMsg, comm_stats.time_in_SendRdmaMsg, comm_stats.max_time_in_SendRdmaMsg, comm_stats.time_in_SendRdmaMsg*1e6/comm_stats.count_in_SendRdmaMsg);
+    if (useDynamicSMSG)
+    fprintf(counterLog, "PumpDatagramConnection:                  %d\t%.6f\t%.6f\t%.6f\n",  comm_stats.count_in_PumpDatagramConnection, comm_stats.time_in_PumpDatagramConnection, comm_stats.max_time_in_PumpDatagramConnection, comm_stats.time_in_PumpDatagramConnection*1e6/comm_stats.count_in_PumpDatagramConnection);
+
+    fclose(counterLog);
 }
+
 #else
-#define STATS_ACK_TIME(x)            x
+#define STATS_PUMPNETWORK_TIME(x)                  x
+#define STATS_SEND_SMSGS_TIME(x)                   x
+#define STATS_PUMPREMOTETRANSACTIONS_TIME(x)       x
+#define STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(x)   x
+#define STATS_SENDRDMAMSG_TIME(x)                  x
+#define STATS_PUMPDATAGRAMCONNECTION_TIME(x)       x
 #endif
 
-static int print_stats = 0;
-
 static void
 allgather(void *in,void *out, int len)
 {
@@ -814,6 +1155,7 @@ static uint32_t get_cookie(void)
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <hugetlbfs.h>
 
 // size must be _tlbpagesize aligned
 void *my_get_huge_pages(size_t size)
@@ -860,10 +1202,16 @@ void CmiMachineProgressImpl() {
 }
 #endif
 
+static int SendBufferMsg(SMSG_QUEUE *queue);
 static void SendRdmaMsg();
 static void PumpNetworkSmsg();
-static void PumpLocalRdmaTransactions();
-static int SendBufferMsg(SMSG_QUEUE *queue);
+static void PumpLocalTransactions(gni_cq_handle_t tx_cqh, CmiNodeLock cq_lock);
+#if CQWRITE
+static void PumpCqWriteTransactions();
+#endif
+#if REMOTE_EVENT
+static void PumpRemoteTransactions();
+#endif
 
 #if MACHINE_DEBUG_LOG
 FILE *debugLog = NULL;
@@ -879,7 +1227,7 @@ static void sweep_mempool(mempool_type *mptr)
 
     printf("[n %d %d] sweep_mempool slot START.\n", myrank, n++);
     while( current!= NULL) {
-        printf("[n %d %d] sweep_mempool slot %p size: %d (%d %d) %lld %lld.\n", myrank, n++, current, current->size, current->msgs_in_send, current->msgs_in_recv, current->mem_hndl.qword1, current->mem_hndl.qword2);
+        printf("[n %d %d] sweep_mempool slot %p size: %lld used: %d (%d %d) %lld %lld.\n", myrank, n++, current, current->size, 1<<current->used, current->msgs_in_send, current->msgs_in_recv, current->mem_hndl.qword1, current->mem_hndl.qword2);
         current = current->block_next?(block_header *)((char*)mptr+current->block_next):NULL;
     }
     printf("[n %d] sweep_mempool slot END.\n", myrank);
@@ -904,7 +1252,7 @@ static  gni_return_t deregisterMemory(mempool_type *mptr, block_header **from)
 }
 
 inline 
-static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, size_t size, gni_mem_handle_t  *memhndl)
+static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, size_t size, gni_mem_handle_t  *memhndl, gni_cq_handle_t cqh )
 {
     gni_return_t status = GNI_RC_SUCCESS;
     //int size = GetMempoolsize(msg);
@@ -922,14 +1270,14 @@ static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, siz
     MACHSTATE3(8, "mempool (%lld,%lld,%d) \n", buffered_send_msg, buffered_recv_msg, register_memory_size); 
     while(1)
     {
-        MEMORY_REGISTER(onesided_hnd, nic_hndl, blockaddr, size, memhndl, &omdh, status);
+        MEMORY_REGISTER(onesided_hnd, nic_hndl, blockaddr, size, memhndl, &omdh, cqh, status);
         if(status == GNI_RC_SUCCESS)
         {
             break;
         }
         else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
         {
-            CmiAbort("Memory registor for mempool fails\n");
+            GNI_RC_CHECK("registerFromMempool", status);
         }
         else
         {
@@ -941,7 +1289,7 @@ static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, siz
 }
 
 inline 
-static gni_return_t registerMemory(void *msg, size_t size, gni_mem_handle_t *t)
+static gni_return_t registerMemory(void *msg, size_t size, gni_mem_handle_t *t, gni_cq_handle_t cqh )
 {
     static int rank = -1;
     int i;
@@ -950,14 +1298,14 @@ static gni_return_t registerMemory(void *msg, size_t size, gni_mem_handle_t *t)
     //mempool_type *mptr1 = (mempool_type*)GetMempoolPtr(msg);
     mempool_type *mptr;
 
-    status = registerFromMempool(mptr1, msg, size, t);
+    status = registerFromMempool(mptr1, msg, size, t, cqh);
     if (status == GNI_RC_SUCCESS) return status;
 #if CMK_SMP 
     for (i=0; i<CmiMyNodeSize()+1; i++) {
       rank = (rank+1)%(CmiMyNodeSize()+1);
       mptr = CpvAccessOther(mempool, rank);
       if (mptr == mptr1) continue;
-      status = registerFromMempool(mptr, msg, size, t);
+      status = registerFromMempool(mptr, msg, size, t, cqh);
       if (status == GNI_RC_SUCCESS) return status;
     }
 #endif
@@ -973,7 +1321,9 @@ static void buffer_small_msgs(SMSG_QUEUE *queue, void *msg, int size, int destNo
     msg_tmp->size   = size;
     msg_tmp->msg    = msg;
     msg_tmp->tag    = tag;
-
+#if CMK_WITH_STATS
+    SMSG_CREATION(msg_tmp)
+#endif
 #if !CMK_SMP
     if (queue->smsg_msglist_index[destNode].sendSmsgBuf == 0 ) {
         queue->smsg_msglist_index[destNode].next = queue->smsg_head_index;
@@ -1047,13 +1397,14 @@ static void setup_smsg_connection(int destNode)
     smsg_available_slot++;
     MallocPostDesc(pd);
     pd->type            = GNI_POST_FMA_PUT;
-    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT |  GNI_CQMODE_REMOTE_EVENT;
+    //pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT |  GNI_CQMODE_REMOTE_EVENT;
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT ;
     pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
     pd->length          = sizeof(gni_smsg_attr_t);
     pd->local_addr      = (uint64_t) smsg_attr;
     pd->remote_addr     = (uint64_t)&((((gni_smsg_attr_t*)(smsg_connection_vec[destNode].addr))[myrank]));
     pd->remote_mem_hndl = smsg_connection_vec[destNode].mdh;
-    pd->src_cq_hndl     = 0;
+    pd->src_cq_hndl     = rdma_tx_cqh;
     pd->rdma_mode       = 0;
     status = GNI_PostFma(ep_hndl_array[destNode],  pd);
     print_smsg_attr(smsg_attr);
@@ -1118,9 +1469,9 @@ static int connect_to(int destNode)
     alloc_smsg_attr(smsg_attr_vector_local[destNode]);
     smsg_attr_vector_remote[destNode] = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
     
-    CMI_GNI_LOCK
+    CMI_GNI_LOCK(global_gni_lock)
     status = GNI_EpPostDataWId (ep_hndl_array[destNode], smsg_attr_vector_local[destNode], sizeof(gni_smsg_attr_t),smsg_attr_vector_remote[destNode] ,sizeof(gni_smsg_attr_t), destNode+mysize);
-    CMI_GNI_UNLOCK
+    CMI_GNI_UNLOCK(global_gni_lock)
     if (status == GNI_RC_ERROR_RESOURCE) {
       /* possibly destNode is making connection at the same time */
       free(smsg_attr_vector_local[destNode]);
@@ -1128,64 +1479,21 @@ static int connect_to(int destNode)
       free(smsg_attr_vector_remote[destNode]);
       smsg_attr_vector_remote[destNode] = NULL;
       mailbox_list->offset -= smsg_memlen;
+#if PRINT_SYH
+    printf("[%d] send connect_to request to %d failed\n", myrank, destNode);
+#endif
       return 0;
     }
     GNI_RC_CHECK("GNI_Post", status);
     smsg_connected_flag[destNode] = 1;
+#if PRINT_SYH
+    printf("[%d] send connect_to request to %d done\n", myrank, destNode);
+#endif
     return 1;
 }
 
-#if PIGGYBACK_ACK
-static void * piggyback_ack(int destNode, int msgsize, int *count)
-{
-    int i;
-    if (PCQueueEmpty(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf)) return NULL;
-    int len = PCQueueLength(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf);
-    int piggycount = (SMSG_MAX_MSG - msgsize)/sizeof(uint64_t);
-    if (piggycount > len+1) piggycount = len + 1;
-    if (piggycount <= 5) return NULL;
-    uint64_t * buf = (uint64_t*)CmiTmpAlloc(piggycount * sizeof(uint64_t));
-    CmiAssert(buf != NULL);
-//printf("[%d] piggyback_ack: %d\n", myrank, piggycount);
-    for (i=0; i<piggycount-1; i++) {
-        CMI_PCQUEUEPOP_LOCK(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf)
-        MSG_LIST *ptr = (MSG_LIST*)PCQueuePop(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf);
-        CMI_PCQUEUEPOP_UNLOCK(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf)
-        ACK_MSG *msg = ptr->msg;
-        buf[i+1] = msg->source_addr;
-        FreeAckMsg(msg);
-        FreeMsgList(ptr);
-    }
-    buf[0] = i;
-    *count = i + 1;
-    return buf;
-}
-
-
-static void piggyback_ack_done(int destNode, uint64_t *buf, int done)
-{
-    if (!done)
-    {
-        int i;
-        for (i=0; i<buf[0]; i++) {
-            MSG_LIST *msg_tmp;
-            MallocMsgList(msg_tmp);
-            ACK_MSG  *ack_msg;
-            MallocAckMsg(ack_msg);
-            ack_msg->source_addr = buf[i+1];
-            msg_tmp->size = ACK_MSG_SIZE;
-            msg_tmp->msg = ack_msg;
-            msg_tmp->tag = ACK_TAG;
-            msg_tmp->destNode = destNode;
-            PCQueuePush(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
-        }
-    }
-    CmiTmpFree(buf);
-}
-#endif
-
 inline 
-static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg, int size, uint8_t tag, int inbuff )
+static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg, int size, uint8_t tag, int inbuff, MSG_LIST *ptr )
 {
     unsigned int          remote_address;
     uint32_t              remote_id;
@@ -1215,19 +1523,8 @@ static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg
     if(queue->smsg_msglist_index[destNode].sendSmsgBuf == 0 || inbuff==1)
     {
 #endif
-        uint64_t *buf = NULL;
-        int bufsize = 0;
-#if PIGGYBACK_ACK
-        if (tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG) {
-            int nack = 0;
-            buf = piggyback_ack(destNode, size, &nack);
-            if (buf) {
-                tag = (tag == SMALL_DATA_TAG) ? SMALL_DATA_ACK_TAG : LMSG_INIT_ACK_TAG;
-                bufsize = nack * sizeof(uint64_t);
-            }
-        }
-#endif
-        CMI_GNI_LOCK
+        //CMI_GNI_LOCK(smsg_mailbox_lock)
+        CMI_GNI_LOCK(default_tx_cq_lock)
 #if CMK_SMP_TRACE_COMMTHREAD
         int oldpe = -1;
         int oldeventid = -1;
@@ -1242,28 +1539,44 @@ static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg
             TRACE_COMM_SET_COMM_MSGID(real_data);
         }
 #endif
-        status = GNI_SmsgSendWTag(ep_hndl_array[destNode], buf, bufsize, msg, size, 0, tag);
+#if REMOTE_EVENT
+        if (tag == LMSG_INIT_TAG) {
+            CONTROL_MSG *control_msg_tmp = (CONTROL_MSG*)msg;
+            if (control_msg_tmp->seq_id == 0 && control_msg_tmp->ack_index == -1)
+                control_msg_tmp->ack_index = IndexPool_getslot(&ackPool, (void*)control_msg_tmp->source_addr, 1);
+        }
+        // GNI_EpSetEventData(ep_hndl_array[destNode], destNode, myrank);
+#endif
+#if     CMK_WITH_STATS
+        SMSG_TRY_SEND(tag)
+#endif
+#if CMK_WITH_STATS
+    double              creation_time;
+    if (ptr == NULL)
+        creation_time = CmiWallTimer();
+    else
+        creation_time = ptr->creation_time;
+#endif
+
+    status = GNI_SmsgSendWTag(ep_hndl_array[destNode], NULL, 0, msg, size, 0, tag);
 #if CMK_SMP_TRACE_COMMTHREAD
         if (oldpe != -1)  TRACE_COMM_SET_MSGID(real_data, oldpe, oldeventid);
 #endif
-        CMI_GNI_UNLOCK
+        CMI_GNI_UNLOCK(default_tx_cq_lock)
+        //CMI_GNI_UNLOCK(smsg_mailbox_lock)
         if(status == GNI_RC_SUCCESS)
         {
+#if     CMK_WITH_STATS
+            SMSG_SENT_DONE(creation_time,tag) 
+#endif
 #if CMK_SMP_TRACE_COMMTHREAD
-            if(tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG || tag == SMALL_DATA_ACK_TAG || tag == LMSG_INIT_ACK_TAG)
+            if(tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG )
             { 
                 TRACE_COMM_CREATION(CpvAccess(projTraceStart), real_data);
             }
 #endif
-            smsg_send_count ++;
         }else
             status = GNI_RC_ERROR_RESOURCE;
-#if PIGGYBACK_ACK
-        if (buf) {
-            piggyback_ack_done(destNode, buf, status==GNI_RC_SUCCESS);
-            tag = (tag == SMALL_DATA_ACK_TAG) ? SMALL_DATA_TAG : LMSG_INIT_TAG;
-        }
-#endif
     }
     if(status != GNI_RC_SUCCESS && inbuff ==0)
         buffer_small_msgs(queue, msg, size, destNode, tag);
@@ -1279,6 +1592,9 @@ static CONTROL_MSG* construct_control_msg(int size, char *msg, uint8_t seqno)
     control_msg_tmp->source_addr = (uint64_t)msg;
     control_msg_tmp->seq_id    = seqno;
     control_msg_tmp->total_length = control_msg_tmp->length = ALIGN64(size); //for GET 4 bytes aligned 
+#if REMOTE_EVENT
+    control_msg_tmp->ack_index    =  -1;
+#endif
 #if     USE_LRTS_MEMPOOL
     if(size < BIG_MSG)
     {
@@ -1300,8 +1616,7 @@ static CONTROL_MSG* construct_control_msg(int size, char *msg, uint8_t seqno)
 
 // Large message, send control to receiver, receiver register memory and do a GET, 
 // return 1 - send no success
-inline
-static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL_MSG  *control_msg_tmp, int inbuff)
+inline static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL_MSG  *control_msg_tmp, int inbuff, MSG_LIST *smsg_ptr)
 {
     gni_return_t        status  =  GNI_RC_ERROR_NOMEM;
     uint32_t            vmdh_index  = -1;
@@ -1333,7 +1648,7 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
                 return GNI_RC_ERROR_NOMEM;
             }
             //register the corresponding mempool
-            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)));
+            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)), rdma_rx_cqh);
             if(status == GNI_RC_SUCCESS)
             {
                 control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
@@ -1343,8 +1658,8 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
             control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
             status = GNI_RC_SUCCESS;
         }
-        if(NoMsgInSend( control_msg_tmp->source_addr))
-            register_size = GetMempoolsize((void*)(control_msg_tmp->source_addr));
+        if(NoMsgInSend(source_addr))
+            register_size = GetMempoolsize((void*)(source_addr));
         else
             register_size = 0;
     }else if(control_msg_tmp->seq_id >0)    // BIG_MSG
@@ -1365,7 +1680,7 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
                     buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
                 return GNI_RC_ERROR_NOMEM;
             }
-            status = registerMemory((void*)source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl));
+            status = registerMemory((void*)source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), NULL);
             if(status == GNI_RC_SUCCESS) buffered_send_msg += ALIGN64(size);
         }
         else
@@ -1375,11 +1690,21 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
         register_size = 0;  
     }
 
+#if CMI_EXERT_SEND_CAP
+    if(SEND_large_pending >= SEND_large_cap)
+    {
+        status = GNI_RC_ERROR_NOMEM;
+    }
+#endif
     if(status == GNI_RC_SUCCESS)
     {
-        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, inbuff);  
+       status = send_smsg_message( queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, inbuff, smsg_ptr); 
         if(status == GNI_RC_SUCCESS)
         {
+#if CMI_EXERT_SEND_CAP
+            SEND_large_pending++;
+#endif
             buffered_send_msg += register_size;
             if(control_msg_tmp->seq_id == 0)
             {
@@ -1401,10 +1726,10 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
     }
     return status;
 #else
-    MEMORY_REGISTER(onesided_hnd, nic_hndl,msg, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh, status)
+    MEMORY_REGISTER(onesided_hnd, nic_hndl,msg, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh, NULL, status)
     if(status == GNI_RC_SUCCESS)
     {
-        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, 0);  
+        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, 0, NULL);  
         if(status == GNI_RC_SUCCESS)
         {
             FreeControlMsg(control_msg_tmp);
@@ -1423,6 +1748,7 @@ static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL
 inline void LrtsPrepareEnvelope(char *msg, int size)
 {
     CmiSetMsgSize(msg, size);
+    CMI_SET_CHECKSUM(msg, size);  /* macro defined outside this view; presumably the counterpart of the CMI_CHECK_CHECKSUM calls on the receive path -- TODO confirm */
 }
 
 CmiCommHandle LrtsSendFunc(int destNode, int size, char *msg, int mode)
@@ -1445,7 +1771,7 @@ CmiCommHandle LrtsSendFunc(int destNode, int size, char *msg, int mode)
 #if PRINT_SYH
     printf("LrtsSendFn %d==>%d, size=%d\n", myrank, destNode, size);
 #endif 
-#if CMK_SMP && COMM_THREAD_SEND
+#if CMK_SMP 
     if(size <= SMSG_MAX_MSG)
         buffer_small_msgs(queue, msg, size, destNode, SMALL_DATA_TAG);
     else if (size < BIG_MSG) {
@@ -1460,35 +1786,108 @@ CmiCommHandle LrtsSendFunc(int destNode, int size, char *msg, int mode)
 #else   //non-smp, smp(worker sending)
     if(size <= SMSG_MAX_MSG)
     {
-        if (GNI_RC_SUCCESS == send_smsg_message(queue, destNode,  msg, size, SMALL_DATA_TAG, 0))
+        if (GNI_RC_SUCCESS == send_smsg_message(queue, destNode,  msg, size, SMALL_DATA_TAG, 0, NULL))
             CmiFree(msg);
     }
     else if (size < BIG_MSG) {
         control_msg_tmp =  construct_control_msg(size, msg, 0);
-        send_large_messages(queue, destNode, control_msg_tmp, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL);
     }
     else {
 #if     USE_LRTS_MEMPOOL
         CmiSetMsgSeq(msg, 0);
         control_msg_tmp =  construct_control_msg(size, msg, 1);
-        send_large_messages(queue, destNode, control_msg_tmp, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL);
 #else
         control_msg_tmp =  construct_control_msg(size, msg, 0);
-        send_large_messages(queue, destNode, control_msg_tmp, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0, NULL);
 #endif
     }
 #endif
     return 0;
 }
 
+void LrtsSyncListSendFn(int npes, int *pes, int len, char *msg)  /* send msg (len bytes) to each of the npes PEs in pes[]; this function never calls CmiFree on msg */
+{
+  int i;
+#if CMK_BROADCAST_USE_CMIREFERENCE
+  for(i=0;i<npes;i++) {
+    if (pes[i] == CmiMyPe())
+      CmiSyncSend(pes[i], len, msg);  /* destination is the calling PE: plain send, no extra reference taken */
+    else {
+      CmiReference(msg);  /* one extra reference per remote destination ... */
+      CmiSyncSendAndFree(pes[i], len, msg);  /* ... presumably consumed by the send-and-free -- TODO confirm */
+    }
+  }
+#else
+  for(i=0;i<npes;i++) {
+    CmiSyncSend(pes[i], len, msg);  /* one non-freeing send per destination */
+  }
+#endif
+}
+
+CmiCommHandle LrtsAsyncListSendFn(int npes, int *pes, int len, char *msg)  /* "async" list send implemented by delegating to the synchronous list send */
+{
+  /* A better asynchronous implementation may be wanted, but at least it works */
+  CmiSyncListSendFn(npes, pes, len, msg);
+  return (CmiCommHandle) 0;  /* always returns a zero handle */
+}
+
+void LrtsFreeListSendFn(int npes, int *pes, int len, char *msg)  /* list send in which every path ends by handing msg to CmiSyncSendAndFree or CmiFree */
+{
+  if (npes == 1) {  /* single destination: pass msg straight to send-and-free */
+      CmiSyncSendAndFree(pes[0], len, msg);
+      return;
+  }
+#if CMK_PERSISTENT_COMM
+  if (CpvAccess(phs) && len > PERSIST_MIN_SIZE) {  /* phs is set and len exceeds PERSIST_MIN_SIZE; presumably a persistent channel is active -- TODO confirm */
+      int i;
+      for(i=0;i<npes;i++) {
+        if (pes[i] == CmiMyPe())
+          CmiSyncSend(pes[i], len, msg);
+        else {
+          CmiReference(msg);  /* one extra reference per remote destination */
+          CmiSyncSendAndFree(pes[i], len, msg);
+        }
+      }
+      CmiFree(msg);  /* free msg once after the loop (presumably releasing the caller's own reference -- TODO confirm) */
+      return;
+  }
+#endif
+  
+#if CMK_BROADCAST_USE_CMIREFERENCE
+  CmiSyncListSendFn(npes, pes, len, msg);
+  CmiFree(msg);
+#else
+  int i;
+  for(i=0;i<npes-1;i++) {  /* every destination except the last gets a non-freeing send */
+    CmiSyncSend(pes[i], len, msg);
+  }
+  if (npes>0)
+    CmiSyncSendAndFree(pes[npes-1], len, msg);  /* the last destination's send also frees msg */
+  else 
+    CmiFree(msg);  /* no destinations at all: just free msg */
+#endif
+}
+
 static void    PumpDatagramConnection();
+static      int         event_SetupConnect = 111;          /* Default user-event ids; registerUserTraceEvents() overwrites them when its tracing #if is enabled. */
+static      int         event_PumpSmsg = 222 ;             /* Each default must be distinct so two events never share an id when registration is compiled out. */
+static      int         event_PumpTransaction = 333;
+static      int         event_PumpRdmaTransaction = 444;
+static      int         event_SendBufferSmsg = 484;        /* was 444, which collided with event_PumpRdmaTransaction */
+static      int         event_SendFmaRdmaMsg = 555;
+static      int         event_AdvanceCommunication = 666;
+
 static void registerUserTraceEvents() {
 #if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
-    traceRegisterUserEvent("setting up connections", 10);
-    traceRegisterUserEvent("Receiving small msgs", 20);
-    traceRegisterUserEvent("Release local transaction", 30);
-    traceRegisterUserEvent("Sending buffered small msgs", 40);
-    traceRegisterUserEvent("Sending buffered rdma msgs", 50);
+    event_SetupConnect = traceRegisterUserEvent("setting up connections", -1 );  /* the returned id is now stored in the file-level event_* variable; the -1 presumably lets the tracer choose the id -- TODO confirm */
+    event_PumpSmsg = traceRegisterUserEvent("Pump network small msgs", -1);
+    event_PumpTransaction = traceRegisterUserEvent("Pump FMA local transaction" , -1);
+    event_PumpRdmaTransaction = traceRegisterUserEvent("Pump RDMA local transaction" , -1);
+    event_SendBufferSmsg = traceRegisterUserEvent("Sending buffered small msgs", -1);
+    event_SendFmaRdmaMsg = traceRegisterUserEvent("Sending buffered fma/rdma transactions", -1);
+    event_AdvanceCommunication = traceRegisterUserEvent("Worker thread in sending/receiving", -1);  /* id later passed to traceUserBracketEvent in LrtsPostNonLocal */
 #endif
 }
 
@@ -1555,7 +1954,12 @@ static void set_limit()
         MAX_REG_MEM  = _totalmem / numprocesses;
         MAX_BUFF_SEND = MAX_REG_MEM / 2;
         if (CmiMyPe() == 0)
-           printf("mem_max = %lld, send_max =%lld\n", MAX_REG_MEM, MAX_BUFF_SEND);
+           printf("mem_max = %.2fM, send_max =%.2fM\n", MAX_REG_MEM/1024.0/1024, MAX_BUFF_SEND/1024./1024);
+        if(CmiMyPe() == 0 && (smsg_memlen*mysize + _expand_mem > MAX_BUFF_SEND ||  smsg_memlen*mysize + _mempool_size > MAX_BUFF_SEND))
+        {
+             printf("Charm++> FATAL ERROR your program has risk of hanging \n please try large page or use Dynamic smsg +useDynamicSmsg or contact Charm++ developers\n");
+             CmiAbort("memory registration\n");
+        }
     }
 }
 
@@ -1567,7 +1971,8 @@ void LrtsPostCommonInit(int everReturn)
 #if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
     CpvInitialize(double, projTraceStart);
     /* only PE 0 needs to care about registration (to generate sts file). */
-    if (CmiMyPe() == 0) {
+    //if (CmiMyPe() == 0) 
+    {
         registerMachineUserEventsFunction(&registerUserTraceEvents);
     }
 #endif
@@ -1582,11 +1987,14 @@ void LrtsPostCommonInit(int everReturn)
     CcdCallOnConditionKeep(CcdPERIODIC_10ms, (CcdVoidFn) PumpDatagramConnection, NULL);
 #endif
 
+#if ! LARGEPAGE
     if (_checkProgress)
 #if CMK_SMP
     if (CmiMyRank() == 0)
 #endif
     CcdCallOnConditionKeep(CcdPERIODIC_2minute, (CcdVoidFn) CheckProgress, NULL);
+#endif
 #if !LARGEPAGE
     CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdVoidFn)set_limit, NULL);
 #endif
@@ -1594,35 +2002,63 @@ void LrtsPostCommonInit(int everReturn)
 
 /* this is called by worker thread */
 void LrtsPostNonLocal(){
+#if CMK_SMP_TRACE_COMMTHREAD
+    double startT, endT;  /* wall-clock bounds of the work below, reported through traceUserBracketEvent at the end */
+#endif
 #if MULTI_THREAD_SEND
     if(mysize == 1) return;
-//#if CMK_SMP_TRACE_COMMTHREAD
-//    traceEndIdle();
-//#endif
-    //printf("[%d,%d] worker call communication\n", CmiMyNode(), CmiMyRank());
+#if CMK_SMP_TRACE_COMMTHREAD
+    traceEndIdle();
+#endif
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 0)  /* with CMK_WORKER_SINGLE_TASK a rank runs only the stage whose number equals CmiMyRank() % 6 */
+#endif
     PumpNetworkSmsg();
-    PumpLocalRdmaTransactions();
-    
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 1)
+#endif
+    PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);  /* same pump routine called twice: first with default_tx_cqh and its lock ... */
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 2)
+#endif
+    PumpLocalTransactions(rdma_tx_cqh, rdma_tx_cq_lock);  /* ... then with rdma_tx_cqh and its lock */
+
+#if REMOTE_EVENT
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 3)
+#endif
+    PumpRemoteTransactions();
+#endif
+
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 4)
+#endif
 #if CMK_USE_OOB
     if (SendBufferMsg(&smsg_oob_queue) == 1)
 #endif
     {
-#if PIGGYBACK_ACK
-    //if (count%10 == 0) SendBufferMsg(&smsg_ack_queue);
-    if (SendBufferMsg(&smsg_queue) == 1) {
-        //if (count++ % 10 == 0) 
-        SendBufferMsg(&smsg_ack_queue);
-    }
-#else
-    SendBufferMsg(&smsg_queue);
-#endif
+        SendBufferMsg(&smsg_queue);  /* the PIGGYBACK_ACK variant that also flushed smsg_ack_queue is gone; only smsg_queue is sent here */
     }
 
+#if CMK_WORKER_SINGLE_TASK
+    if (CmiMyRank() % 6 == 5)
+#endif
     SendRdmaMsg();
-    //LrtsAdvanceCommunication(1);
-//#if CMK_SMP_TRACE_COMMTHREAD
-//    traceBeginIdle();
-//#endif
+
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    traceUserBracketEvent(event_AdvanceCommunication, startT, endT);  /* one bracket event spanning everything since startT was taken */
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+    traceBeginIdle();
+#endif
 #endif
 }
 
@@ -1640,16 +2076,16 @@ static void    PumpDatagramConnection()
    {
        if (datagram_id >= mysize) {           /* bound endpoint */
            int pe = datagram_id - mysize;
-           CMI_GNI_LOCK
+           CMI_GNI_LOCK(global_gni_lock)
            status = GNI_EpPostDataTestById( ep_hndl_array[pe], datagram_id, &post_state, &remote_address, &remote_id);
-           CMI_GNI_UNLOCK
+           CMI_GNI_UNLOCK(global_gni_lock)
            if(status == GNI_RC_SUCCESS && post_state == GNI_POST_COMPLETED)
            {
                CmiAssert(remote_id == pe);
                status = GNI_SmsgInit(ep_hndl_array[pe], smsg_attr_vector_local[pe], smsg_attr_vector_remote[pe]);
                GNI_RC_CHECK("Dynamic SMSG Init", status);
 #if PRINT_SYH
-               printf("++ Dynamic SMSG setup [%d===>%d] done\n", myrank, pe);
+               printf("[%d] ++ Dynamic SMSG setup [%d===>%d] done\n", myrank, myrank, pe);
 #endif
               CmiAssert(smsg_connected_flag[pe] == 1);
                smsg_connected_flag[pe] = 2;
@@ -1664,7 +2100,7 @@ static void    PumpDatagramConnection()
                status = GNI_SmsgInit(ep_hndl_array[remote_id], &send_smsg_attr, &recv_smsg_attr);
                GNI_RC_CHECK("Dynamic SMSG Init", status);
 #if PRINT_SYH
-               printf("++ Dynamic SMSG setup2 [%d===>%d] done\n", myrank, remote_id);
+               printf("[%d] ++ Dynamic SMSG setup2 [%d===>%d] done\n", myrank, myrank, remote_id);
 #endif
                smsg_connected_flag[remote_id] = 2;
 
@@ -1685,12 +2121,15 @@ static void PumpNetworkRdmaMsgs()
 }
 
 inline 
-static void bufferRdmaMsg(int inst_id, gni_post_descriptor_t *pd)
+static void bufferRdmaMsg(int inst_id, gni_post_descriptor_t *pd, int ack_index)
 {
     RDMA_REQUEST        *rdma_request_msg;
     MallocRdmaRequest(rdma_request_msg);
     rdma_request_msg->destNode = inst_id;
     rdma_request_msg->pd = pd;
+#if REMOTE_EVENT
+    rdma_request_msg->ack_index = ack_index;
+#endif
 #if CMK_SMP
     PCQueuePush(sendRdmaBuf, (char*)rdma_request_msg);
 #else
@@ -1705,25 +2144,6 @@ static void bufferRdmaMsg(int inst_id, gni_post_descriptor_t *pd)
 
 }
 
-#if PIGGYBACK_ACK
-int processPiggybackAckHeader(void *header)
-{
-    int i;
-    uint64_t *buf = (uint64_t*)header;
-    int piggycount = buf[0];
-//printf("[%d] got piggyback msg: %d\n", myrank, piggycount);
-    for (i=0; i<piggycount; i++) {
-        void *msg = (void*)(buf[i+1]);
-        CmiAssert(msg != NULL);
-        DecreaseMsgInSend(msg);
-        if(NoMsgInSend(msg))
-            buffered_send_msg -= GetMempoolsize(msg);
-        CmiFree(msg);
-    }
-    return piggycount;
-}
-#endif
-
 static void getLargeMsgRequest(void* header, uint64_t inst_id);
 
 static void PumpNetworkSmsg()
@@ -1743,17 +2163,23 @@ static void PumpNetworkSmsg()
     CONTROL_MSG         *control_msg_tmp, *header_tmp;
     uint64_t            source_addr;
     SMSG_QUEUE         *queue = &smsg_queue;
-#if     CMK_DIRECT
+#if   CMK_DIRECT
     cmidirectMsg        *direct_msg;
+#endif
+#if CMI_EXERT_RECV_CAP
+    int                  recv_cnt = 0;
 #endif
     while(1)
     {
-        CMI_GNI_LOCK
+        CMI_GNI_LOCK(smsg_rx_cq_lock)
         status =GNI_CqGetEvent(smsg_rx_cqh, &event_data);
-        CMI_GNI_UNLOCK
+        CMI_GNI_UNLOCK(smsg_rx_cq_lock)
         if(status != GNI_RC_SUCCESS)
             break;
         inst_id = GNI_CQ_GET_INST_ID(event_data);
+#if REMOTE_EVENT
+        inst_id = GET_RANK(inst_id);      /* important */
+#endif
         // GetEvent returns success but GetNext return not_done. caused by Smsg out-of-order transfer
 #if PRINT_SYH
         printf("[%d] %d PumpNetworkMsgs is received from PE: %d,  status=%s\n", myrank, CmiMyRank(), inst_id,  gni_err_str[status]);
@@ -1765,25 +2191,18 @@ static void PumpNetworkSmsg()
         }
         msg_tag = GNI_SMSG_ANY_TAG;
         while(1) {
-            CMI_GNI_LOCK
+            CMI_GNI_LOCK(smsg_mailbox_lock)
             status = GNI_SmsgGetNextWTag(ep_hndl_array[inst_id], &header, &msg_tag);
             if (status != GNI_RC_SUCCESS)
             {
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 break;
             }
 #if PRINT_SYH
-            printf("[%d] from %d request for Large msg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
+            printf("[%d] from %d smsg msg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
 #endif
             /* copy msg out and then put into queue (small message) */
             switch (msg_tag) {
-#if PIGGYBACK_ACK
-            case SMALL_DATA_ACK_TAG:
-            {
-                int piggycount = processPiggybackAckHeader(header);
-                header = (uint64_t*)header + piggycount + 1;
-            }
-#endif
             case SMALL_DATA_TAG:
             {
                 START_EVENT();
@@ -1791,41 +2210,35 @@ static void PumpNetworkSmsg()
                 msg_data    = CmiAlloc(msg_nbytes);
                 memcpy(msg_data, (char*)header, msg_nbytes);
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
-#if CMK_SMP_TRACE_COMMTHREAD
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 TRACE_COMM_CREATION(CpvAccess(projTraceStart), msg_data);
-#endif
+                CMI_CHECK_CHECKSUM(msg_data, msg_nbytes);
                 handleOneRecvedMsg(msg_nbytes, msg_data);
                 break;
             }
-#if PIGGYBACK_ACK
-            case LMSG_INIT_ACK_TAG:
-            {
-                int piggycount = processPiggybackAckHeader(header);
-                header = (uint64_t*)header + piggycount + 1;
-            }
-#endif
-            case LMSG_INIT_TAG:
+            case LMSG_INIT_TAG:
             {
 #if MULTI_THREAD_SEND
                 MallocControlMsg(control_msg_tmp);
                 memcpy(control_msg_tmp, header, CONTROL_MSG_SIZE);
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 getLargeMsgRequest(control_msg_tmp, inst_id);
                 FreeControlMsg(control_msg_tmp);
 #else
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 getLargeMsgRequest(header, inst_id);
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
 #endif
                 break;
             }
+#if !REMOTE_EVENT && !CQWRITE
             case ACK_TAG:   //msg fit into mempool
             {
                 /* Get is done, release message . Now put is not used yet*/
                 void *msg = (void*)(((ACK_MSG *)header)->source_addr);
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
 #if ! USE_LRTS_MEMPOOL
                 MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
 #else
@@ -1835,18 +2248,24 @@ static void PumpNetworkSmsg()
                     buffered_send_msg -= GetMempoolsize(msg);
                 MACHSTATE5(8, "GO send done to %d (%d,%d, %d) tag=%d\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size, msg_tag); 
                 CmiFree(msg);
+#if CMI_EXERT_SEND_CAP
+                SEND_large_pending--;
+#endif
                 break;
             }
+#endif
             case BIG_MSG_TAG:  //big msg, de-register, transfer next seg
             {
 #if MULTI_THREAD_SEND
                 MallocControlMsg(header_tmp);
                 memcpy(header_tmp, header, CONTROL_MSG_SIZE);
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
-                   /* FIXME: leak */
 #else
                 header_tmp = (CONTROL_MSG *) header;
+#endif
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
+#if CMI_EXERT_SEND_CAP
+                    SEND_large_pending--;
 #endif
                 void *msg = (void*)(header_tmp->source_addr);
                 int cur_seq = CmiGetMsgSeq(msg);
@@ -1865,7 +2284,7 @@ static void PumpNetworkSmsg()
                     control_msg_tmp = construct_control_msg(header_tmp->total_length, msg, cur_seq+1+1);
                     control_msg_tmp->dest_addr = header_tmp->dest_addr;
                     //send next seg
-                    send_large_messages(queue, inst_id, control_msg_tmp, 0);
+                    send_large_messages( queue, inst_id, control_msg_tmp, 0, NULL);
                          // pipelining
                     if (header_tmp->seq_id == 1) {
                       int i;
@@ -1874,7 +2293,7 @@ static void PumpNetworkSmsg()
                         CmiSetMsgSeq(msg, seq-1);
                         control_msg_tmp =  construct_control_msg(header_tmp->total_length, (char *)msg, seq);
                         control_msg_tmp->dest_addr = header_tmp->dest_addr;
-                        send_large_messages(queue, inst_id, control_msg_tmp, 0);
+                        send_large_messages( queue, inst_id, control_msg_tmp, 0, NULL);
                         if (header_tmp->total_length <= ONE_SEG*seq) break;
                       }
                     }
@@ -1886,15 +2305,20 @@ static void PumpNetworkSmsg()
 #endif
                 break;
             }
-#if CMK_PERSISTENT_COMM
-            case PUT_DONE_TAG: //persistent message
-                void *msg = (void *)((CONTROL_MSG *) header)->source_addr;
+#if CMK_PERSISTENT_COMM && !REMOTE_EVENT && !CQWRITE
+            case PUT_DONE_TAG:  {   //persistent message
+                void *msg = (void *)(((CONTROL_MSG *) header)->source_addr);
                 int size = ((CONTROL_MSG *) header)->length;
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 CmiReference(msg);
+                CMI_CHECK_CHECKSUM(msg, size);
                 handleOneRecvedMsg(size, msg); 
+#if PRINT_SYH
+                printf("[%d] PUT_DONE_TAG hand over one message, size: %d. \n", myrank, size);
+#endif
                 break;
+            }
 #endif
 #if CMK_DIRECT
             case DIRECT_PUT_DONE_TAG:  //cmi direct 
@@ -1902,25 +2326,27 @@ static void PumpNetworkSmsg()
                 direct_msg = (cmidirectMsg*)CmiAlloc(sizeof(cmidirectMsg));
                 direct_msg->handler = ((CMK_DIRECT_HEADER*)header)->handler_addr;
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 CmiSetHandler(direct_msg, CpvAccess(CmiHandleDirectIdx));
                 CmiPushPE(((CmiDirectUserHandle*)direct_msg->handler)->remoteRank, direct_msg);
                 //(*(((CMK_DIRECT_HEADER*) header)->callbackFnPtr))(((CMK_DIRECT_HEADER*) header)->callbackData);
                 break;
 #endif
-            default: {
+            default:
                 GNI_SmsgRelease(ep_hndl_array[inst_id]);
-                CMI_GNI_UNLOCK
+                CMI_GNI_UNLOCK(smsg_mailbox_lock)
                 printf("weird tag problem\n");
                 CmiAbort("Unknown tag\n");
-                     }
             }               // end switch
 #if PRINT_SYH
-            printf("[%d] from %d after switch request for Large msg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
+            printf("[%d] from %d after switch request for smsg is received, messageid: tag=%d\n", myrank, inst_id, msg_tag);
 #endif
             smsg_recv_count ++;
             msg_tag = GNI_SMSG_ANY_TAG;
-        } //endwhile getNext
+#if CMI_EXERT_RECV_CAP
+            if (status == GNI_RC_SUCCESS && ++recv_cnt == RECV_CAP) return;
+#endif
+        } //endwhile GNI_SmsgGetNextWTag
     }   //end while GetEvent
     if(status == GNI_RC_ERROR_RESOURCE)
     {
@@ -1934,6 +2360,64 @@ static void printDesc(gni_post_descriptor_t *pd)
     printf(" Descriptor (%p===>%p)(%d)\n", pd->local_addr, pd->remote_addr, pd->length); 
 }
 
+#if CQWRITE
+static void sendCqWrite(int destNode, uint64_t data, gni_mem_handle_t mem_hndl)
+{   // Post a CQ-write carrying `data` to destNode's endpoint, addressed by the remote handle mem_hndl.
+    gni_post_descriptor_t *pd;
+    gni_return_t        status = GNI_RC_SUCCESS;
+    
+    MallocPostDesc(pd);                     // NOTE(review): pd is not released anywhere in this function
+    pd->type = GNI_POST_CQWRITE;
+    pd->cq_mode = GNI_CQMODE_SILENT;        // NOTE(review): confirm a completion still reaches the GNI_POST_CQWRITE case that calls FreePostDesc
+    //pd->cq_mode = GNI_CQMODE_GLOBAL_EVENT | GNI_CQMODE_REMOTE_EVENT ;
+    pd->dlvr_mode = GNI_DLVMODE_PERFORMANCE;
+    pd->cqwrite_value = data;               // payload of the CQ write
+    pd->remote_mem_hndl = mem_hndl;
+    status = GNI_PostCqWrite(ep_hndl_array[destNode], pd);
+    GNI_RC_CHECK("GNI_PostCqWrite", status);    // the status is only checked here; there is no retry or buffering path
+}
+#endif
+
+// Make sure *memh holds a memory handle for msg (size bytes, sequence number seqno), registering memory if needed.
+// Returns GNI_RC_SUCCESS when a handle was already set or was obtained; otherwise the status from registerMemory.
+static gni_return_t  registerMessage(void *msg, int size, int seqno, gni_mem_handle_t *memh)
+{
+    gni_return_t status = GNI_RC_SUCCESS;
+
+    if (!IsMemHndlZero(*memh)) return GNI_RC_SUCCESS;   // caller already has a non-zero handle: nothing to do
+
+#if CMK_PERSISTENT_COMM
+      // persistent memory is always registered already, so just reuse its handle
+      // BIG_MSG small pieces do not have malloc chunk header
+    if (IS_PERSISTENT_MEMORY(msg)) {
+        *memh = GetMemHndl(msg);
+        return GNI_RC_SUCCESS;
+    }
+#endif
+    if(seqno == 0    // seqno 0 (or PERSIST_SEQ when persistent comm is enabled): use the mempool block macros below
+#if CMK_PERSISTENT_COMM
+         || seqno == PERSIST_SEQ
+#endif
+      )
+    {
+        if(IsMemHndlZero((GetMemHndl(msg))))
+        {
+            msg = (void*)(msg);     // no-op self-assignment
+            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)), rdma_rx_cqh);   // registers the range given by the mempool macros; handle is written through GetMemHndl(msg)
+            if(status == GNI_RC_SUCCESS)
+                *memh = GetMemHndl(msg);    // on failure *memh is left untouched (still zero)
+        }
+        else {
+            *memh = GetMemHndl(msg);        // handle already stored for this message: reuse it
+        }
+    }
+    else {
+        // big msg that cannot fit into the memory pool, or a CmiDirect msg (which is not from the mempool)
+        status = registerMemory(msg, size, memh, NULL);   // registers exactly size bytes at msg; last argument is NULL here, rdma_rx_cqh in the branch above
+    }
+    return status;
+}
+
 // for BIG_MSG called on receiver side for receiving control message
 // LMSG_INIT_TAG
 static void getLargeMsgRequest(void* header, uint64_t inst_id )
@@ -1944,14 +2428,21 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
     void                *msg_data;
     gni_post_descriptor_t *pd;
     gni_mem_handle_t    msg_mem_hndl;
-    int source, size, transaction_size, offset = 0;
-    size_t     register_size = 0;
+    int                 size, transaction_size, offset = 0;
+    size_t              register_size = 0;
 
     // initial a get to transfer data from the sender side */
     request_msg = (CONTROL_MSG *) header;
     size = request_msg->total_length;
     MACHSTATE4(8, "GO Get request from %d (%d,%d, %d) \n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size); 
+    MallocPostDesc(pd);
+#if CMK_WITH_STATS 
+    pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
     if(request_msg->seq_id < 2)   {
+#if CMK_SMP_TRACE_COMMTHREAD 
+        pd->sync_flag_addr = 1000000 * CmiWallTimer(); //microsecond
+#endif
         msg_data = CmiAlloc(size);
         CmiSetMsgSeq(msg_data, 0);
         _MEMCHECK(msg_data);
@@ -1961,69 +2452,66 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
         msg_data = (char*)request_msg->dest_addr + offset;
     }
    
-    MallocPostDesc(pd);
     pd->cqwrite_value = request_msg->seq_id;
-    if( request_msg->seq_id == 0)
-    {
-        pd->local_mem_hndl= GetMemHndl(msg_data);
-        transaction_size = ALIGN64(size);
-        if(IsMemHndlZero(pd->local_mem_hndl))
-        {   
-            status = registerMemory( GetMempoolBlockPtr(msg_data), GetMempoolsize(msg_data), &(GetMemHndl(msg_data)));
-            if(status == GNI_RC_SUCCESS)
-            {
-                pd->local_mem_hndl = GetMemHndl(msg_data);
-            }
-            else
-            {
-                SetMemHndlZero(pd->local_mem_hndl);
-            }
-        }
+
+    transaction_size = request_msg->seq_id == 0? ALIGN64(size) : ALIGN64(request_msg->length);
+    SetMemHndlZero(pd->local_mem_hndl);
+    status = registerMessage(msg_data, transaction_size, request_msg->seq_id, &pd->local_mem_hndl);
+    if (status == GNI_RC_SUCCESS && request_msg->seq_id == 0) {
         if(NoMsgInRecv( (void*)(msg_data)))
             register_size = GetMempoolsize((void*)(msg_data));
-        else
-            register_size = 0;
-    }
-    else{
-        transaction_size = ALIGN64(request_msg->length);
-        status = registerMemory(msg_data, transaction_size, &(pd->local_mem_hndl)); 
-        if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR) 
-        {
-            GNI_RC_CHECK("Invalid/permission Mem Register in post", status);
-        }
     }
+
     pd->first_operand = ALIGN64(size);                   //  total length
 
     if(request_msg->total_length <= LRTS_GNI_RDMA_THRESHOLD)
         pd->type            = GNI_POST_FMA_GET;
     else
         pd->type            = GNI_POST_RDMA_GET;
-#if REMOTE_EVENT
-    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT |  GNI_CQMODE_REMOTE_EVENT;
-#else
     pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
-#endif
     pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
     pd->length          = transaction_size;
     pd->local_addr      = (uint64_t) msg_data;
     pd->remote_addr     = request_msg->source_addr + offset;
     pd->remote_mem_hndl = request_msg->source_mem_hndl;
-    pd->src_cq_hndl     = 0;//post_tx_cqh;     /* smsg_tx_cqh;  */
+    pd->src_cq_hndl     = rdma_tx_cqh;
     pd->rdma_mode       = 0;
     pd->amo_cmd         = 0;
-
+#if CMI_EXERT_RDMA_CAP
+    if(status == GNI_RC_SUCCESS && RDMA_pending >= RDMA_cap ) status = GNI_RC_ERROR_RESOURCE; 
+#endif
     //memory registration success
-    if(status == GNI_RC_SUCCESS)
+    if(status == GNI_RC_SUCCESS )
     {
-        CMI_GNI_LOCK
+        CmiNodeLock lock = pd->type == GNI_POST_RDMA_GET?rdma_tx_cq_lock:default_tx_cq_lock;
+        CMI_GNI_LOCK(lock)
+#if REMOTE_EVENT
+        if( request_msg->seq_id == 0)
+        {
+            pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+            int sts = GNI_EpSetEventData(ep_hndl_array[inst_id], inst_id, ACK_EVENT(request_msg->ack_index));
+            GNI_RC_CHECK("GNI_EpSetEventData", sts);
+        }
+#endif
+
+#if CMK_WITH_STATS
+        RDMA_TRY_SEND(pd->type)
+#endif
         if(pd->type == GNI_POST_RDMA_GET) 
+        {
             status = GNI_PostRdma(ep_hndl_array[inst_id], pd);
+        }
         else
+        {
             status = GNI_PostFma(ep_hndl_array[inst_id],  pd);
-        CMI_GNI_UNLOCK
+        }
+        CMI_GNI_UNLOCK(lock)
 
         if(status == GNI_RC_SUCCESS )
         {
+#if CMI_EXERT_RDMA_CAP
+            RDMA_pending++;
+#endif
             if(pd->cqwrite_value == 0)
             {
 #if MACHINE_DEBUG_LOG
@@ -2031,7 +2519,14 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
                 MACHSTATE4(8, "GO request from %d (%d,%d, %d)\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size); 
 #endif
                 IncreaseMsgInRecv(msg_data);
+#if CMK_SMP_TRACE_COMMTHREAD 
+                pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+#endif
             }
+#if  CMK_WITH_STATS
+            pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+            RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
         }
     }else
     {
@@ -2039,9 +2534,13 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
     }
     if(status == GNI_RC_ERROR_RESOURCE|| status == GNI_RC_ERROR_NOMEM )
     {
-        bufferRdmaMsg(inst_id, pd); 
-    }else {
-         //printf("source: %d pd:(%p,%p)(%p,%p)\n", source, (pd->local_mem_hndl).qword1, (pd->local_mem_hndl).qword2, (pd->remote_mem_hndl).qword1, (pd->remote_mem_hndl).qword2);
+#if REMOTE_EVENT
+        bufferRdmaMsg(inst_id, pd, request_msg->ack_index); 
+#else
+        bufferRdmaMsg(inst_id, pd, -1); 
+#endif
+    }else if (status != GNI_RC_SUCCESS) {
+        // printf("source: %d pd:(%p,%p)(%p,%p) len:%d local:%x remote:%x\n", (int)inst_id, (pd->local_mem_hndl).qword1, (pd->local_mem_hndl).qword2, (pd->remote_mem_hndl).qword1, (pd->remote_mem_hndl).qword2, pd->length, pd->local_addr, pd->remote_addr);
         GNI_RC_CHECK("GetLargeAFter posting", status);
     }
 #else
@@ -2057,7 +2556,7 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
     msg_data = CmiAlloc(request_msg->length);
     _MEMCHECK(msg_data);
 
-    MEMORY_REGISTER(onesided_hnd, nic_hndl, msg_data, request_msg->length, &msg_mem_hndl, &omdh, status)
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, msg_data, request_msg->length, &msg_mem_hndl, &omdh, NULL,  status)
 
     if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR) 
     {
@@ -2069,17 +2568,13 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
         pd->type            = GNI_POST_FMA_GET;
     else
         pd->type            = GNI_POST_RDMA_GET;
-#if REMOTE_EVENT
-    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT |  GNI_CQMODE_REMOTE_EVENT;
-#else
-    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
-#endif
+    pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;// |  GNI_CQMODE_REMOTE_EVENT;
     pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
     pd->length          = ALIGN64(request_msg->length);
     pd->local_addr      = (uint64_t) msg_data;
     pd->remote_addr     = request_msg->source_addr;
     pd->remote_mem_hndl = request_msg->source_mem_hndl;
-    pd->src_cq_hndl     = 0;//post_tx_cqh;     /* smsg_tx_cqh;  */
+    pd->src_cq_hndl     = rdma_tx_cqh;
     pd->rdma_mode       = 0;
     pd->amo_cmd         = 0;
 
@@ -2087,12 +2582,20 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
     if(status == GNI_RC_SUCCESS)
     {
         pd->local_mem_hndl  = msg_mem_hndl;
-        CMI_GNI_LOCK
+       
         if(pd->type == GNI_POST_RDMA_GET) 
+        {
+            CMI_GNI_LOCK(rdma_tx_cq_lock)
             status = GNI_PostRdma(ep_hndl_array[inst_id], pd);
+            CMI_GNI_UNLOCK(rdma_tx_cq_lock)
+        }
         else
+        {
+            CMI_GNI_LOCK(default_tx_cq_lock)
             status = GNI_PostFma(ep_hndl_array[inst_id],  pd);
-        CMI_GNI_UNLOCK
+            CMI_GNI_UNLOCK(default_tx_cq_lock)
+        }
+
     }else
     {
         SetMemHndlZero(pd->local_mem_hndl);
@@ -2110,7 +2613,123 @@ static void getLargeMsgRequest(void* header, uint64_t inst_id )
 #endif
 }
 
-static void PumpLocalRdmaTransactions()
+#if CQWRITE
+static void PumpCqWriteTransactions()
+{   // Drain rdma_rx_cqh: each event's data is used as a message pointer that is either delivered or released.
+
+    gni_cq_entry_t          ev;
+    gni_return_t            status;
+    void                    *msg;  
+    int                     msg_size;
+    while(1) {
+        //CMI_GNI_LOCK(my_cq_lock) 
+        status = GNI_CqGetEvent(rdma_rx_cqh, &ev);
+        //CMI_GNI_UNLOCK(my_cq_lock)
+        if(status != GNI_RC_SUCCESS) break;     // stop at the first non-success status; it is inspected after the loop
+        msg = (void*) ( GNI_CQ_GET_DATA(ev) & 0xFFFFFFFFFFFFL);   // keep only the low 48 bits of the event data as the pointer
+#if CMK_PERSISTENT_COMM
+#if PRINT_SYH
+        printf(" %d CQ write event %p\n", myrank, msg);
+#endif
+        if (!IsMemHndlZero(MEMHFIELD(msg))) {   // non-zero handle field: treated as a persistent message (per the debug text below)
+#if PRINT_SYH
+            printf(" %d Persistent CQ write event %p\n", myrank, msg);
+#endif
+            CmiReference(msg);
+            msg_size = CmiGetMsgSize(msg);
+            CMI_CHECK_CHECKSUM(msg, msg_size);
+            handleOneRecvedMsg(msg_size, msg);  // hand the message over as a received message
+            continue;                           // skip the release path below
+        }
+#endif
+#if ! USE_LRTS_MEMPOOL
+       // MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
+#else
+        DecreaseMsgInSend(msg);
+#endif
+        if(NoMsgInSend(msg))
+            buffered_send_msg -= GetMempoolsize(msg);
+        CmiFree(msg);       // presumably the sender-side buffer whose transfer this event acknowledges - TODO confirm
+    };
+    if(status == GNI_RC_ERROR_RESOURCE)     // only this status is reported; any other non-success status just ends the loop
+    {
+        GNI_RC_CHECK("rdma_rx_cq full too many ack", status);
+    }
+}
+#endif
+
+#if REMOTE_EVENT
+static void PumpRemoteTransactions()
+{   // Drain rdma_rx_cqh and dispatch each event on the type decoded from its instance id.
+    gni_cq_entry_t          ev;
+    gni_return_t            status;
+    void                    *msg;   
+    int                     inst_id, index, type, size;
+
+    while(1) {
+        CMI_GNI_LOCK(rdma_tx_cq_lock)       // NOTE(review): a tx lock guards a read of rdma_rx_cqh - confirm this is the intended lock
+//        CMI_GNI_LOCK(global_gni_lock)
+        status = GNI_CqGetEvent(rdma_rx_cqh, &ev);
+//        CMI_GNI_UNLOCK(global_gni_lock)
+        CMI_GNI_UNLOCK(rdma_tx_cq_lock)
+
+        if(status != GNI_RC_SUCCESS) break;     // stop at the first non-success status; it is inspected after the loop
+
+        inst_id = GNI_CQ_GET_INST_ID(ev);
+        index = GET_INDEX(inst_id);     // pool slot encoded in the instance id
+        type = GET_TYPE(inst_id);       // event kind encoded in the instance id
+        switch (type) {
+        case 0:    // ACK: look up the message in ackPool and release it
+            CmiAssert(index>=0 && index<ackPool.size);
+            CMI_GNI_LOCK(ackPool.lock);
+            CmiAssert(GetIndexType(ackPool, index) == 1);
+            msg = GetIndexAddress(ackPool, index);
+            CMI_GNI_UNLOCK(ackPool.lock);
+#if PRINT_SYH
+            printf("[%d] PumpRemoteTransactions: ack: %p index: %d type: %d.\n", myrank, GetMempoolBlockPtr(msg), index, type);
+#endif
+#if ! USE_LRTS_MEMPOOL
+           // MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(((ACK_MSG *)header)->source_mem_hndl), &omdh, ((ACK_MSG *)header)->length);
+#else
+            DecreaseMsgInSend(msg);
+#endif
+            if(NoMsgInSend(msg))
+                buffered_send_msg -= GetMempoolsize(msg);
+            CmiFree(msg);
+            IndexPool_freeslot(&ackPool, index);    // give the slot back to ackPool
+#if CMI_EXERT_SEND_CAP
+            SEND_large_pending--;
+#endif
+            break;
+#if CMK_PERSISTENT_COMM
+        case 1:  {    // PERSISTENT: deliver the buffer of the receive-table slot; without CMK_PERSISTENT_COMM type 1 falls to default
+            CmiLock(persistPool.lock);      // NOTE(review): unlike the ACK case, index is not range-checked before use here
+            CmiAssert(GetIndexType(persistPool, index) == 2);
+            PersistentReceivesTable *slot = GetIndexAddress(persistPool, index);
+            CmiUnlock(persistPool.lock);
+            START_EVENT();
+            msg = slot->destBuf[0].destAddress;     // only destBuf[0] is ever used here
+            size = CmiGetMsgSize(msg);
+            CmiReference(msg);
+            CMI_CHECK_CHECKSUM(msg, size);
+            TRACE_COMM_CREATION(CpvAccess(projTraceStart), msg);
+            handleOneRecvedMsg(size, msg);  // hand the message over as a received message
+            break;
+            }
+#endif
+        default:
+            fprintf(stderr, "[%d] PumpRemoteTransactions: unknown type: %d\n", myrank, type);
+            CmiAbort("PumpRemoteTransactions: unknown type");
+        }
+    }
+    if(status == GNI_RC_ERROR_RESOURCE)     // only this status is reported; any other non-success status just ends the loop
+    {
+        GNI_RC_CHECK("rdma_rx_cq full too many ack", status);
+    }
+}
+#endif
+
+static void PumpLocalTransactions(gni_cq_handle_t my_tx_cqh, CmiNodeLock my_cq_lock)
 {
     gni_cq_entry_t          ev;
     gni_return_t            status;
@@ -2126,21 +2745,26 @@ static void PumpLocalRdmaTransactions()
     SMSG_QUEUE         *queue = &smsg_queue;
 
     while(1) {
-        CMI_GNI_LOCK 
-        status = GNI_CqGetEvent(smsg_tx_cqh, &ev);
-        CMI_GNI_UNLOCK
+        CMI_GNI_LOCK(my_cq_lock) 
+        status = GNI_CqGetEvent(my_tx_cqh, &ev);
+        CMI_GNI_UNLOCK(my_cq_lock)
         if(status != GNI_RC_SUCCESS) break;
         
         type = GNI_CQ_GET_TYPE(ev);
         if (type == GNI_CQ_EVENT_TYPE_POST)
         {
+
+#if CMI_EXERT_RDMA_CAP
+            if(RDMA_pending <=0) CmiAbort(" pending error\n");
+            RDMA_pending--;
+#endif
             inst_id     = GNI_CQ_GET_INST_ID(ev);
 #if PRINT_SYH
             printf("[%d] LocalTransactions localdone=%d\n", myrank,  lrts_local_done_msg);
 #endif
-            CMI_GNI_LOCK
-            status = GNI_GetCompleted(smsg_tx_cqh, ev, &tmp_pd);
-            CMI_GNI_UNLOCK
+            CMI_GNI_LOCK(my_cq_lock)
+            status = GNI_GetCompleted(my_tx_cqh, ev, &tmp_pd);
+            CMI_GNI_UNLOCK(my_cq_lock)
 
             switch (tmp_pd->type) {
 #if CMK_PERSISTENT_COMM || CMK_DIRECT
@@ -2150,17 +2774,29 @@ static void PumpLocalRdmaTransactions()
 #endif
             case GNI_POST_FMA_PUT:
                 if(tmp_pd->amo_cmd == 1) {
+#if CMK_DIRECT
                     //sender ACK to receiver to trigger it is done
                     cmk_direct_done_msg = (CMK_DIRECT_HEADER*) malloc(sizeof(CMK_DIRECT_HEADER));
                     cmk_direct_done_msg->handler_addr = tmp_pd->first_operand;
                     msg_tag = DIRECT_PUT_DONE_TAG;
+#endif
                 }
                 else {
                     CmiFree((void *)tmp_pd->local_addr);
+#if REMOTE_EVENT
+                    FreePostDesc(tmp_pd);
+                    continue;
+#elif CQWRITE
+                    sendCqWrite(inst_id, tmp_pd->remote_addr, tmp_pd->remote_mem_hndl);
+                    FreePostDesc(tmp_pd);
+                    continue;
+#else
                     MallocControlMsg(ack_msg_tmp);
                     ack_msg_tmp->source_addr = tmp_pd->remote_addr;
                     ack_msg_tmp->source_mem_hndl    = tmp_pd->remote_mem_hndl;
+                    ack_msg_tmp->length  = tmp_pd->length;
                     msg_tag = PUT_DONE_TAG;
+#endif
                 }
                 break;
 #endif
@@ -2173,6 +2809,9 @@ static void PumpLocalRdmaTransactions()
                 MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &tmp_pd->local_mem_hndl, &omdh, tmp_pd->length)
                 msg_tag = ACK_TAG;  
 #else
+#if CMK_WITH_STATS
+                RDMA_TRANS_DONE(tmp_pd->sync_flag_value/1000000.0)
+#endif
                 int seq_id = tmp_pd->cqwrite_value;
                 if(seq_id > 0)      // BIG_MSG
                 {
@@ -2189,36 +2828,42 @@ static void PumpLocalRdmaTransactions()
                 } 
                 else
                 {
+                    msg_tag = ACK_TAG; 
+#if  !REMOTE_EVENT && !CQWRITE
                     MallocAckMsg(ack_msg);
                     ack_msg->source_addr = tmp_pd->remote_addr;
-                    msg_tag = ACK_TAG;  
-                    // ack_msg_tmp->dest_addr = tmp_pd->local_addr; ???
+#endif
                 }
 #endif
                 break;
             }
+            case  GNI_POST_CQWRITE:
+                   FreePostDesc(tmp_pd);
+                   continue;
             default:
                 CmiPrintf("type=%d\n", tmp_pd->type);
-                CmiAbort("PumpLocalRdmaTransactions: unknown type!");
+                CmiAbort("PumpLocalTransactions: unknown type!");
             }      /* end of switch */
 
 #if CMK_DIRECT
             if (tmp_pd->amo_cmd == 1) {
-                status = send_smsg_message(queue, inst_id, cmk_direct_done_msg, sizeof(CMK_DIRECT_HEADER), msg_tag, 0); 
+                status = send_smsg_message(queue, inst_id, cmk_direct_done_msg, sizeof(CMK_DIRECT_HEADER), msg_tag, 0, NULL); 
                 if (status == GNI_RC_SUCCESS) free(cmk_direct_done_msg); 
             }
             else
 #endif
             if (msg_tag == ACK_TAG) {
-#if ! PIGGYBACK_ACK
-                status = send_smsg_message(queue, inst_id, ack_msg, ACK_MSG_SIZE, msg_tag, 0); 
+#if !REMOTE_EVENT
+#if   !CQWRITE
+                status = send_smsg_message(queue, inst_id, ack_msg, ACK_MSG_SIZE, msg_tag, 0, NULL); 
                 if (status == GNI_RC_SUCCESS) FreeAckMsg(ack_msg);
 #else
-                buffer_small_msgs(&smsg_ack_queue, ack_msg, ACK_MSG_SIZE, inst_id, msg_tag);
+                sendCqWrite(inst_id, tmp_pd->remote_addr, tmp_pd->remote_mem_hndl); 
+#endif
 #endif
             }
             else {
-                status = send_smsg_message(queue, inst_id, ack_msg_tmp, CONTROL_MSG_SIZE, msg_tag, 0); 
+                status = send_smsg_message(queue, inst_id, ack_msg_tmp, CONTROL_MSG_SIZE, msg_tag, 0, NULL); 
                 if (status == GNI_RC_SUCCESS) FreeControlMsg(ack_msg_tmp);
             }
 #if CMK_PERSISTENT_COMM
@@ -2227,8 +2872,11 @@ static void PumpLocalRdmaTransactions()
             {
                 if( msg_tag == ACK_TAG){    //msg fit in mempool 
 #if PRINT_SYH
-                    printf("Normal msg transaction PE:%d==>%d\n", myrank, inst_id);
+                    printf("PumpLocalTransactions: Normal msg transaction PE:%d==>%d\n", myrank, inst_id);
 #endif
+                    TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_addr/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (void*)tmp_pd->local_addr); 
+                    TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_value/1000000.0), (double)((tmp_pd->sync_flag_value+1)/1000000.0), (double)((tmp_pd->sync_flag_value+1)/1000000.0), (void*)tmp_pd->local_addr); 
+
                     START_EVENT();
                     CmiAssert(SIZEFIELD((void*)(tmp_pd->local_addr)) <= tmp_pd->length);
                     DecreaseMsgInRecv((void*)tmp_pd->local_addr);
@@ -2237,9 +2885,8 @@ static void PumpLocalRdmaTransactions()
                         buffered_recv_msg -= GetMempoolsize((void*)(tmp_pd->local_addr));
                     MACHSTATE5(8, "GO Recv done ack send from %d (%d,%d, %d) tag=%d\n", inst_id, buffered_send_msg, buffered_recv_msg, register_memory_size, msg_tag); 
 #endif
-#if CMK_SMP_TRACE_COMMTHREAD
                     TRACE_COMM_CREATION(CpvAccess(projTraceStart), (void*)tmp_pd->local_addr);
-#endif
+                    CMI_CHECK_CHECKSUM((void*)tmp_pd->local_addr, tmp_pd->length);
                     handleOneRecvedMsg(tmp_pd->length, (void*)tmp_pd->local_addr); 
                 }else if(msg_tag == BIG_MSG_TAG){
                     void *msg = (char*)tmp_pd->local_addr-(tmp_pd->cqwrite_value-1)*ONE_SEG;
@@ -2249,9 +2896,12 @@ static void PumpLocalRdmaTransactions()
 #if PRINT_SYH
                         printf("Pipeline msg done [%d]\n", myrank);
 #endif
-#if CMK_SMP_TRACE_COMMTHREAD
-                        TRACE_COMM_CREATION(CpvAccess(projTraceStart), msg);
+#if                 CMK_SMP_TRACE_COMMTHREAD
+                        if( tmp_pd->cqwrite_value == 1)
+                            TRACE_COMM_CONTROL_CREATION((double)(tmp_pd->sync_flag_addr/1000000.0), (double)((tmp_pd->sync_flag_addr+1)/1000000.0), (double)((tmp_pd->sync_flag_addr+2)/1000000.0), (void*)tmp_pd->local_addr); 
 #endif
+                        TRACE_COMM_CREATION(CpvAccess(projTraceStart), msg);
+                        CMI_CHECK_CHECKSUM(msg, tmp_pd->first_operand);
                         handleOneRecvedMsg(tmp_pd->first_operand, msg); 
                     }
                 }
@@ -2279,51 +2929,90 @@ static void  SendRdmaMsg()
     int len = PCQueueLength(sendRdmaBuf);
     for (i=0; i<len; i++)
     {
+#if CMI_EXERT_RDMA_CAP
+        if( RDMA_pending >= RDMA_cap) break;
+#endif
         CMI_PCQUEUEPOP_LOCK(sendRdmaBuf)
         ptr = (RDMA_REQUEST*)PCQueuePop(sendRdmaBuf);
         CMI_PCQUEUEPOP_UNLOCK(sendRdmaBuf)
         if (ptr == NULL) break;
 #else
     ptr = sendRdmaBuf;
-    while (ptr!=0)
+    while (ptr!=0 )
     {
+#if CMI_EXERT_RDMA_CAP
+         if( RDMA_pending >= RDMA_cap) break;
+#endif
 #endif 
         MACHSTATE4(8, "noempty-rdma  %d (%lld,%lld,%d) \n", ptr->destNode, buffered_send_msg, buffered_recv_msg, register_memory_size); 
         gni_post_descriptor_t *pd = ptr->pd;
-        status = GNI_RC_SUCCESS;
         
-        if(pd->cqwrite_value == 0)
-        {
-            if(IsMemHndlZero((GetMemHndl(pd->local_addr))))
-            {
-                msg = (void*)(pd->local_addr);
-                status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)));
-                if(status == GNI_RC_SUCCESS)
-                {
-                    pd->local_mem_hndl = GetMemHndl((void*)(pd->local_addr));
-                }
-            }else
-            {
-                pd->local_mem_hndl = GetMemHndl((void*)(pd->local_addr));
-            }
-            if(NoMsgInRecv( (void*)(pd->local_addr)))
-                register_size = GetMempoolsize((void*)(pd->local_addr));
-            else
-                register_size = 0;
-        }else if( IsMemHndlZero(pd->local_mem_hndl)) //big msg, can not fit into memory pool, or CmiDirect Msg (which is not from mempool)
-        {
-            status = registerMemory((void*)(pd->local_addr), pd->length, &(pd->local_mem_hndl)); 
+        msg = (void*)(pd->local_addr);
+        status = registerMessage(msg, pd->length, pd->cqwrite_value, &pd->local_mem_hndl);
+        register_size = 0;
+        if(pd->cqwrite_value == 0) {
+            if(NoMsgInRecv(msg))
+                register_size = GetMempoolsize(msg);
         }
+
         if(status == GNI_RC_SUCCESS)        //mem register good
         {
-            CMI_GNI_LOCK
+            int destNode = ptr->destNode;
+            CmiNodeLock lock = (pd->type == GNI_POST_RDMA_GET || pd->type == GNI_POST_RDMA_PUT) ? rdma_tx_cq_lock:default_tx_cq_lock;
+            CMI_GNI_LOCK(lock);
+#if REMOTE_EVENT
+            if( pd->cqwrite_value == 0) {
+                pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+                int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, ACK_EVENT(ptr->ack_index));
+                GNI_RC_CHECK("GNI_EpSetEventData", sts);
+            }
+#if CMK_PERSISTENT_COMM
+            else if (pd->cqwrite_value == PERSIST_SEQ) {
+                pd->cq_mode |= GNI_CQMODE_REMOTE_EVENT;
+                int sts = GNI_EpSetEventData(ep_hndl_array[destNode], destNode, PERSIST_EVENT(ptr->ack_index));
+                GNI_RC_CHECK("GNI_EpSetEventData", sts);
+            }
+#endif
+#endif
+#if CMK_WITH_STATS
+            RDMA_TRY_SEND(pd->type)
+#endif
+#if CMK_SMP_TRACE_COMMTHREAD
+//            int oldpe = -1;
+//            int oldeventid = -1;
+//            if(pd->type == GNI_POST_RDMA_PUT || pd->type == GNI_POST_FMA_PUT)
+//            { 
+//                TRACE_COMM_GET_MSGID((void*)pd->local_addr, &oldpe, &oldeventid);
+//                TRACE_COMM_SET_COMM_MSGID((void*)pd->local_addr);
+//            }
+              if(IS_PUT(pd->type) )
+              { 
+                  START_EVENT();
+                  TRACE_COMM_CREATION(CpvAccess(projTraceStart), (void*)pd->local_addr);
+              }
+#endif
+
             if(pd->type == GNI_POST_RDMA_GET || pd->type == GNI_POST_RDMA_PUT) 
-                status = GNI_PostRdma(ep_hndl_array[ptr->destNode], pd);
+            {
+                status = GNI_PostRdma(ep_hndl_array[destNode], pd);
+            }
             else
-                status = GNI_PostFma(ep_hndl_array[ptr->destNode],  pd);
-            CMI_GNI_UNLOCK
+            {
+                status = GNI_PostFma(ep_hndl_array[destNode],  pd);
+            }
+            CMI_GNI_UNLOCK(lock);
+            
+#if CMK_SMP_TRACE_COMMTHREAD
+//            if(pd->type == GNI_POST_RDMA_PUT || pd->type == GNI_POST_FMA_PUT)
+//            { 
+//                if (oldpe != -1)  TRACE_COMM_SET_MSGID((void*)pd->local_addr, oldpe, oldeventid);
+//            }
+#endif
             if(status == GNI_RC_SUCCESS)    //post good
             {
+#if CMI_EXERT_RDMA_CAP
+                RDMA_pending ++;
+#endif
 #if !CMK_SMP
                 tmp_ptr = ptr;
                 if(pre != 0) {
@@ -2337,11 +3026,21 @@ static void  SendRdmaMsg()
 #endif
                 if(pd->cqwrite_value == 0)
                 {
+#if CMK_SMP_TRACE_COMMTHREAD 
+                    pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+#endif
                     IncreaseMsgInRecv(((void*)(pd->local_addr)));
                 }
+#if  CMK_WITH_STATS
+                pd->sync_flag_value = 1000000 * CmiWallTimer(); //microsecond
+                RDMA_TRANS_INIT(pd->type, pd->sync_flag_addr/1000000.0)
+#endif
 #if MACHINE_DEBUG_LOG
                 buffered_recv_msg += register_size;
                 MACHSTATE(8, "GO request from buffered\n"); 
+#endif
+#if PRINT_SYH
+                printf("[%d] SendRdmaMsg: post succeed. seqno: %x\n", myrank, pd->cqwrite_value);
 #endif
             }else           // cannot post
             {
@@ -2350,6 +3049,9 @@ static void  SendRdmaMsg()
 #else
                 pre = ptr;
                 ptr = ptr->next;
+#endif
+#if PRINT_SYH
+                printf("[%d] SendRdmaMsg: post failed. seqno: %x dest: %d local mhdl: %lld %lld remote mhdl: %lld %lld connect: %d\n", myrank, pd->cqwrite_value, destNode, pd->local_mem_hndl.qword1, pd->local_mem_hndl.qword2, pd->remote_mem_hndl.qword1, pd->remote_mem_hndl.qword2, smsg_connected_flag[destNode]);
 #endif
                 break;
             }
@@ -2379,9 +3081,6 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
     uint64_t            register_size;
     void                *register_addr;
     int                 index_previous = -1;
-#if CMI_EXERT_SEND_CAP
-    int                        sent_cnt = 0;
-#endif
 
 #if CMK_SMP
     int          index = 0;
@@ -2417,6 +3116,7 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
 #else
     for(index =0; index<mysize; index++)
     {
+        //if (index == myrank) continue;
         PCQueue current_queue = queue->smsg_msglist_index[index].sendSmsgBuf;
         int i, len = PCQueueLength(current_queue);
 #endif
@@ -2440,12 +3140,17 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
             status = GNI_RC_ERROR_RESOURCE;
             if (useDynamicSMSG && smsg_connected_flag[index] != 2) {   
                 /* connection not exists yet */
+#if CMK_SMP
+                  /* non-smp case, connect is issued in send_smsg_message */
+                if (smsg_connected_flag[index] == 0)
+                    connect_to(ptr->destNode); 
+#endif
             }
             else
             switch(ptr->tag)
             {
             case SMALL_DATA_TAG:
-                status = send_smsg_message(queue, ptr->destNode,  ptr->msg, ptr->size, ptr->tag, 1);  
+                status = send_smsg_message(queue, ptr->destNode,  ptr->msg, ptr->size, ptr->tag, 1, ptr);  
                 if(status == GNI_RC_SUCCESS)
                 {
                     CmiFree(ptr->msg);
@@ -2453,22 +3158,33 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
                 break;
             case LMSG_INIT_TAG:
                 control_msg_tmp = (CONTROL_MSG*)ptr->msg;
-                status = send_large_messages(queue, ptr->destNode, control_msg_tmp, 1);
+                status = send_large_messages( queue, ptr->destNode, control_msg_tmp, 1, ptr);
                 break;
+#if !REMOTE_EVENT && !CQWRITE
             case ACK_TAG:
-                status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1);  
+                status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
                 if(status == GNI_RC_SUCCESS) FreeAckMsg((ACK_MSG*)ptr->msg);
                 break;
+#endif
             case BIG_MSG_TAG:
-                status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1);  
+                status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
+                if(status == GNI_RC_SUCCESS)
+                {
+                    FreeControlMsg((CONTROL_MSG*)ptr->msg);
+                }
+                break;
+#if CMK_PERSISTENT_COMM && !REMOTE_EVENT && !CQWRITE 
+            case PUT_DONE_TAG:
+                status = send_smsg_message(queue, ptr->destNode, ptr->msg, ptr->size, ptr->tag, 1, ptr);  
                 if(status == GNI_RC_SUCCESS)
                 {
                     FreeControlMsg((CONTROL_MSG*)ptr->msg);
                 }
                 break;
+#endif
 #if CMK_DIRECT
             case DIRECT_PUT_DONE_TAG:
-                status = send_smsg_message(queue, ptr->destNode, ptr->msg, sizeof(CMK_DIRECT_HEADER), ptr->tag, 1);  
+                status = send_smsg_message(queue, ptr->destNode, ptr->msg, sizeof(CMK_DIRECT_HEADER), ptr->tag, 1, ptr);  
                 if(status == GNI_RC_SUCCESS)
                 {
                     free((CMK_DIRECT_HEADER*)ptr->msg);
@@ -2482,6 +3198,10 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
             }       // end switch
             if(status == GNI_RC_SUCCESS)
             {
+#if PRINT_SYH
+                buffered_smsg_counter--;
+                printf("[%d==>%d] buffered smsg sending done\n", myrank, ptr->destNode);
+#endif
 #if !CMK_SMP
                 tmp_ptr = ptr;
                 if(pre)
@@ -2494,15 +3214,6 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
                 FreeMsgList(tmp_ptr);
 #else
                 FreeMsgList(ptr);
-#endif
-#if PRINT_SYH
-                buffered_smsg_counter--;
-                printf("[%d==>%d] buffered smsg sending done\n", myrank, ptr->destNode);
-#endif
-#if CMI_EXERT_SEND_CAP
-                sent_cnt++;
-                if(sent_cnt == SEND_CAP)
-                    break;
 #endif
             }else {
 #if CMK_SMP
@@ -2546,16 +3257,12 @@ static int SendBufferMsg(SMSG_QUEUE *queue)
         if(!PCQueueEmpty(current_queue) && current_list->pushed == 0)
         {
             current_list->pushed = 1;
-            PCQueuePush(nonEmptyQueues, current_list);
+            PCQueuePush(nonEmptyQueues, (char*)current_list);
         }
         CmiUnlock(current_list->lock); 
 #endif
 #endif
 
-#if CMI_EXERT_SEND_CAP
-       if(sent_cnt == SEND_CAP)
-               break;
-#endif
     }   // end pooling for all cores
     return done;
 }
@@ -2573,70 +3280,105 @@ void LrtsAdvanceCommunication(int whileidle)
 #if CMK_SMP_TRACE_COMMTHREAD
         startT = CmiWallTimer();
 #endif
-        PumpDatagramConnection();
+        STATS_PUMPDATAGRAMCONNECTION_TIME(PumpDatagramConnection());
 #if CMK_SMP_TRACE_COMMTHREAD
         endT = CmiWallTimer();
-        if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(10, startT, endT);
+        if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SetupConnect, startT, endT);
 #endif
     }
 
 #if CMK_SMP_TRACE_COMMTHREAD
     startT = CmiWallTimer();
 #endif
-    PumpNetworkSmsg();
+    STATS_PUMPNETWORK_TIME(PumpNetworkSmsg());
     //MACHSTATE(8, "after PumpNetworkSmsg \n") ; 
 #if CMK_SMP_TRACE_COMMTHREAD
     endT = CmiWallTimer();
-    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(20, startT, endT);
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpSmsg, startT, endT);
 #endif
 
 #if CMK_SMP_TRACE_COMMTHREAD
     startT = CmiWallTimer();
 #endif
-    PumpLocalRdmaTransactions();
-    //MACHSTATE(8, "after PumpLocalRdmaTransactions\n") ; 
+    PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);
+    //MACHSTATE(8, "after PumpLocalTransactions\n") ; 
 #if CMK_SMP_TRACE_COMMTHREAD
     endT = CmiWallTimer();
-    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(30, startT, endT);
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpTransaction, startT, endT);
 #endif
-    /* Send buffered Message */
+
 #if CMK_SMP_TRACE_COMMTHREAD
     startT = CmiWallTimer();
 #endif
-#if CMK_USE_OOB
-    if (SendBufferMsg(&smsg_oob_queue) == 1)
+    STATS_PUMPLOCALTRANSACTIONS_RDMA_TIME(PumpLocalTransactions(rdma_tx_cqh,  rdma_tx_cq_lock));
+
+#if CQWRITE
+    PumpCqWriteTransactions();
 #endif
-    {
-#if PIGGYBACK_ACK
-    //if (count%10 == 0) SendBufferMsg(&smsg_ack_queue);
-    if (SendBufferMsg(&smsg_queue) == 1 || count++ % 10 == 0) {
-        STATS_ACK_TIME(SendBufferMsg(&smsg_ack_queue));
-    }
-#else
-    SendBufferMsg(&smsg_queue);
+
+#if REMOTE_EVENT
+    STATS_PUMPREMOTETRANSACTIONS_TIME(PumpRemoteTransactions());
 #endif
-    }
-    //MACHSTATE(8, "after SendBufferMsg\n") ; 
+
+    //MACHSTATE(8, "after PumpLocalTransactions\n") ; 
 #if CMK_SMP_TRACE_COMMTHREAD
     endT = CmiWallTimer();
-    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(40, startT, endT);
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_PumpRdmaTransaction, startT, endT);
 #endif
-
 #if CMK_SMP_TRACE_COMMTHREAD
     startT = CmiWallTimer();
 #endif
-    SendRdmaMsg();
+    STATS_SENDRDMAMSG_TIME(SendRdmaMsg());
     //MACHSTATE(8, "after SendRdmaMsg\n") ; 
 #if CMK_SMP_TRACE_COMMTHREAD
     endT = CmiWallTimer();
-    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(50, startT, endT);
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SendFmaRdmaMsg, startT, endT);
 #endif
 
-#if CMK_SMP
+    /* Send buffered Message */
+#if CMK_SMP_TRACE_COMMTHREAD
+    startT = CmiWallTimer();
+#endif
+#if CMK_USE_OOB
+    if (SendBufferMsg(&smsg_oob_queue) == 1)
+#endif
+    {
+        STATS_SEND_SMSGS_TIME(SendBufferMsg(&smsg_queue));
+    }
+    //MACHSTATE(8, "after SendBufferMsg\n") ; 
+#if CMK_SMP_TRACE_COMMTHREAD
+    endT = CmiWallTimer();
+    if (endT-startT>=TRACE_THRESHOLD) traceUserBracketEvent(event_SendBufferSmsg, startT, endT);
+#endif
+
+#if CMK_SMP && ! LARGEPAGE
     if (_detected_hang)  ProcessDeadlock();
 #endif
 }
 
+static void set_smsg_max()   /* set SMSG_MAX_MSG: default tier chosen by mysize, then optional environment override */
+{
+    char *env;   /* value of CHARM_UGNI_SMSG_MAX_SIZE, or NULL when it is not set */
+
+    if(mysize <=512)   /* tiers: the larger mysize is, the smaller the default SMSG_MAX_MSG */
+    {
+        SMSG_MAX_MSG = 1024;
+    }else if (mysize <= 4096)
+    {
+        SMSG_MAX_MSG = 1024;   /* same value as the <=512 tier */
+    }else if (mysize <= 16384)
+    {
+        SMSG_MAX_MSG = 512;
+    }else {
+        SMSG_MAX_MSG = 256;
+    }
+
+    env = getenv("CHARM_UGNI_SMSG_MAX_SIZE");   /* user override of the tiered default */
+    if (env) SMSG_MAX_MSG = atoi(env);   /* atoi gives 0 for non-numeric text; that value does not satisfy the check below */
+    CmiAssert(SMSG_MAX_MSG > 0);   /* the final value, default or overridden, must be positive */
+}    
+
 /* useDynamicSMSG */
 static void _init_dynamic_smsg()
 {
@@ -2652,18 +3394,8 @@ static void _init_dynamic_smsg()
         smsg_attr_vector_local[i] = NULL;
         smsg_attr_vector_remote[i] = NULL;
     }
-    if(mysize <=512)
-    {
-        SMSG_MAX_MSG = 4096;
-    }else if (mysize <= 4096)
-    {
-        SMSG_MAX_MSG = 4096/mysize * 1024;
-    }else if (mysize <= 16384)
-    {
-        SMSG_MAX_MSG = 512;
-    }else {
-        SMSG_MAX_MSG = 256;
-    }
+
+    set_smsg_max();
 
     send_smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
     send_smsg_attr.mbox_maxcredit = SMSG_MAX_CREDIT;
@@ -2685,7 +3417,7 @@ static void _init_dynamic_smsg()
         &(mailbox_list->mem_hndl));
     GNI_RC_CHECK("MEMORY registration for smsg", status);
 
-    status = GNI_EpCreate(nic_hndl, smsg_tx_cqh, &ep_hndl_unbound);
+    status = GNI_EpCreate(nic_hndl, default_tx_cqh, &ep_hndl_unbound);
     GNI_RC_CHECK("Unbound EP", status);
     
     alloc_smsg_attr(&send_smsg_attr);
@@ -2695,6 +3427,9 @@ static void _init_dynamic_smsg()
 
       /* always pre-connect to proc 0 */
     //if (myrank != 0) connect_to(0);
+
+    status = GNI_SmsgSetMaxRetrans(nic_hndl, 4096);
+    GNI_RC_CHECK("SmsgSetMaxRetrans Init", status);
 }
 
 static void _init_static_smsg()
@@ -2708,25 +3443,9 @@ static void _init_static_smsg()
     uint32_t              vmdh_index = -1;
     mdh_addr_t            base_infor;
     mdh_addr_t            *base_addr_vec;
-    char *env;
 
-    if(mysize <=512)
-    {
-        SMSG_MAX_MSG = 1024;
-    }else if (mysize <= 4096)
-    {
-        SMSG_MAX_MSG = 1024;
-    }else if (mysize <= 16384)
-    {
-        SMSG_MAX_MSG = 512;
-    }else {
-        SMSG_MAX_MSG = 256;
-    }
+    set_smsg_max();
     
-    env = getenv("CHARM_UGNI_SMSG_MAX_SIZE");
-    if (env) SMSG_MAX_MSG = atoi(env);
-    CmiAssert(SMSG_MAX_MSG > 0);
-
     smsg_attr = malloc(mysize * sizeof(gni_smsg_attr_t));
     
     smsg_attr[0].msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
@@ -2833,9 +3552,6 @@ static void _init_smsg()
     }
 
     _init_send_queue(&smsg_queue);
-#if PIGGYBACK_ACK
-    _init_send_queue(&smsg_ack_queue);
-#endif
 #if CMK_USE_OOB
     _init_send_queue(&smsg_oob_queue);
 #endif
@@ -2859,130 +3575,6 @@ static void _init_static_msgq()
 
 }
 
-#if CMK_SMP && STEAL_MEMPOOL
-void *steal_mempool_block(size_t *size, gni_mem_handle_t *mem_hndl)
-{
-    void *pool = NULL;
-    int i, k;
-    // check other ranks
-    for (k=0; k<CmiMyNodeSize()+1; k++) {
-        i = (CmiMyRank()+k)%CmiMyNodeSize();
-        if (i==CmiMyRank()) continue;
-        mempool_type *mptr = CpvAccessOther(mempool, i);
-        CmiLock(mptr->mempoolLock);
-        mempool_block *tail =  (mempool_block *)((char*)mptr + mptr->memblock_tail);
-        if ((char*)tail == (char*)mptr) {     /* this is the only memblock */
-            CmiUnlock(mptr->mempoolLock);
-            continue;
-        }
-        mempool_header *header = (mempool_header*)((char*)tail + sizeof(mempool_block));
-        if (header->size >= *size && header->size == tail->size - sizeof(mempool_block)) {
-            /* search in the free list */
-          mempool_header *free_header = mptr->freelist_head?(mempool_header*)((char*)mptr+mptr->freelist_head):NULL;
-          mempool_header *current = free_header;
-          while (current) {
-            if (current->next_free == (char*)header-(char*)mptr) break;
-            current = current->next_free?(mempool_header*)((char*)mptr + current->next_free):NULL;
-          }
-          if (current == NULL) {         /* not found in free list */
-            CmiUnlock(mptr->mempoolLock);
-            continue;
-          }
-printf("[%d:%d:%d] steal from %d tail: %p size: %d %d %d\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), i, tail, header->size, tail->size, sizeof(mempool_block));
-            /* search the previous memblock, and remove the tail */
-          mempool_block *ptr = (mempool_block *)mptr;
-          while (ptr) {
-            if (ptr->memblock_next == mptr->memblock_tail) break;
-            ptr = ptr->memblock_next?(mempool_block *)((char*)mptr + ptr->memblock_next):NULL;
-          }
-          CmiAssert(ptr!=NULL);
-          ptr->memblock_next = 0;
-          mptr->memblock_tail = (char*)ptr - (char*)mptr;
-
-            /* remove memblock from the free list */
-          current->next_free = header->next_free;
-          if (header == free_header) mptr->freelist_head = header->next_free;
-
-          CmiUnlock(mptr->mempoolLock);
-
-          pool = (void*)tail;
-          *mem_hndl = tail->mem_hndl;
-          *size = tail->size;
-          return pool;
-        }
-        CmiUnlock(mptr->mempoolLock);
-    }
-
-      /* steal failed, deregister and free memblock now */
-    int freed = 0;
-    for (k=0; k<CmiMyNodeSize()+1; k++) {
-        i = (CmiMyRank()+k)%CmiMyNodeSize();
-        mempool_type *mptr = CpvAccessOther(mempool, i);
-        if (i!=CmiMyRank()) CmiLock(mptr->mempoolLock);
-
-        mempool_block *mempools_head = &(mptr->mempools_head);
-        mempool_block *current = mempools_head;
-        mempool_block *prev = NULL;
-
-        while (current) {
-          int isfree = 0;
-          mempool_header *free_header = mptr->freelist_head?(mempool_header*)((char*)mptr+mptr->freelist_head):NULL;
-printf("[%d:%d:%d] checking rank: %d ptr: %p size: %d wanted: %d\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), i, current, current->size, *size);
-          mempool_header *cur = free_header;
-          mempool_header *header;
-          if (current != mempools_head) {
-            header = (mempool_header*)((char*)current + sizeof(mempool_block));
-             /* search in free list */
-            if (header->size == current->size - sizeof(mempool_block)) {
-              cur = free_header;
-              while (cur) {
-                if (cur->next_free == (char*)header-(char*)mptr) break;
-                cur = cur->next_free?(mempool_header*)((char*)mptr + cur->next_free):NULL;
-              }
-              if (cur != NULL) isfree = 1;
-            }
-          }
-          if (isfree) {
-              /* remove from free list */
-            cur->next_free = header->next_free;
-            if (header == free_header) mptr->freelist_head = header->next_free;
-             // deregister
-            gni_return_t status = MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &current->mem_hndl, &omdh,0)
-            GNI_RC_CHECK("steal Mempool de-register", status);
-            mempool_block *ptr = current;
-            current = current->memblock_next?(mempool_block *)((char*)mptr+current->memblock_next):NULL;
-            prev->memblock_next = current?(char*)current - (char*)mptr:0;
-printf("[%d:%d:%d] free rank: %d ptr: %p size: %d wanted: %d\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), i, ptr, ptr->size, *size);
-            freed += ptr->size;
-            free(ptr);
-             // try now
-            if (freed > *size) {
-              if (pool == NULL) {
-                int ret = posix_memalign(&pool, ALIGNBUF, *size);
-                CmiAssert(ret == 0);
-              }
-              MEMORY_REGISTER(onesided_hnd, nic_hndl, pool, *size,  mem_hndl, &omdh, status)
-              if (status == GNI_RC_SUCCESS) {
-                if (i!=CmiMyRank()) CmiUnlock(mptr->mempoolLock);
-printf("[%d:%d:%d] GOT IT rank: %d wanted: %d\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), i, *size);
-                return pool;
-              }
-printf("[%d:%d:%d] TRIED but fails: %d wanted: %d %d\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), i, *size, status);
-            }
-          }
-          else {
-             prev = current;
-             current = current->memblock_next?(mempool_block *)((char*)mptr+current->memblock_next):NULL;
-          }
-        }
-
-        if (i!=CmiMyRank()) CmiUnlock(mptr->mempoolLock);
-    }
-      /* still no luck registering pool */
-    if (pool) free(pool);
-    return NULL;
-}
-#endif
 
 static CmiUInt8 total_mempool_size = 0;
 static CmiUInt8 total_mempool_calls = 0;
@@ -3030,7 +3622,7 @@ void *alloc_mempool_block(size_t *size, gni_mem_handle_t *mem_hndl, int expand_f
 #if LARGEPAGE
     CmiMemLock();
     register_count++;
-    MEMORY_REGISTER(onesided_hnd, nic_hndl, pool, *size, mem_hndl, &omdh, status);
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, pool, *size, mem_hndl, &omdh, rdma_rx_cqh, status);
     CmiMemUnlock();
     if(status != GNI_RC_SUCCESS) {
         printf("[%d, %d] memory reigstration %f G (%lld) ask for %lld\n", myrank, CmiMyRank(), register_memory_size/(1024*1024.0*1024),register_count, *size);
@@ -3106,6 +3698,12 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     Cmi_smp_mode_setting = COMM_WORK_THREADS_SEND_RECV;
 #endif
 
+#if CMI_EXERT_SEND_CAP
+    CmiGetArgInt(*argv,"+useSendLargeCap", &SEND_large_cap);
+#endif
+
+    /* "+useRecvQueue" is parsed once, below, after the environment default, so the command line takes precedence */
+    
     env = getenv("CHARM_UGNI_REMOTE_QUEUE_SIZE");
     if (env) REMOTE_QUEUE_ENTRIES = atoi(env);
     CmiGetArgInt(*argv,"+useRecvQueue", &REMOTE_QUEUE_ENTRIES);
@@ -3155,18 +3753,18 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     allgather(&local_addr, MPID_UGNI_AllAddr, sizeof(unsigned int));
     /* create the local completion queue */
     /* the third parameter : The number of events the NIC allows before generating an interrupt. Setting this parameter to zero results in interrupt delivery with every event. When using this parameter, the mode parameter must be set to GNI_CQ_BLOCKING*/
-    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &smsg_tx_cqh);
+    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &default_tx_cqh);
     GNI_RC_CHECK("GNI_CqCreate (tx)", status);
     
-    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &post_tx_cqh);
-    GNI_RC_CHECK("GNI_CqCreate post (tx)", status);
+    status = GNI_CqCreate(nic_hndl, LOCAL_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_tx_cqh);
+    GNI_RC_CHECK("GNI_CqCreate RDMA (tx)", status);
     /* create the destination completion queue for receiving micro-messages, make this queue considerably larger than the number of transfers */
 
     status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &smsg_rx_cqh);
     GNI_RC_CHECK("Create CQ (rx)", status);
     
-    //status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &post_rx_cqh);
-    //GNI_RC_CHECK("Create Post CQ (rx)", status);
+    status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_rx_cqh);
+    GNI_RC_CHECK("Create Post CQ (rx)", status);
     
     //status = GNI_CqCreate(nic_hndl, REMOTE_QUEUE_ENTRIES, 0, GNI_CQ_NOBLOCK, NULL, NULL, &rdma_cqh);
     //GNI_RC_CHECK("Create BTE CQ", status);
@@ -3175,12 +3773,16 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     ep_hndl_array = (gni_ep_handle_t*)malloc(mysize * sizeof(gni_ep_handle_t));
     _MEMCHECK(ep_hndl_array);
 #if MULTI_THREAD_SEND 
-    tx_cq_lock  = CmiCreateLock();
-    rx_cq_lock  = CmiCreateLock();
+    rx_cq_lock = global_gni_lock = default_tx_cq_lock = smsg_mailbox_lock = CmiCreateLock();
+    //default_tx_cq_lock = CmiCreateLock();
+    rdma_tx_cq_lock = CmiCreateLock();
+    smsg_rx_cq_lock = CmiCreateLock();
+    //global_gni_lock  = CmiCreateLock();
+    //rx_cq_lock  = CmiCreateLock();
 #endif
     for (i=0; i<mysize; i++) {
         if(i == myrank) continue;
-        status = GNI_EpCreate(nic_hndl, smsg_tx_cqh, &ep_hndl_array[i]);
+        status = GNI_EpCreate(nic_hndl, default_tx_cqh, &ep_hndl_array[i]);
         GNI_RC_CHECK("GNI_EpCreate ", status);   
         remote_addr = MPID_UGNI_AllAddr[i];
         status = GNI_EpBind(ep_hndl_array[i], remote_addr, i);
@@ -3265,6 +3867,14 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     if (env) _checkProgress = 0;
     if (mysize == 1) _checkProgress = 0;
 
+#if CMI_EXERT_RDMA_CAP
+    env = getenv("CHARM_UGNI_RDMA_MAX");
+    if (env)  {
+        RDMA_pending = atoi(env);
+        if (myrank == 0)
+            printf("Charm++> Max pending RDMA set to: %d\n", RDMA_pending);
+    }
+#endif
     
     /*
     env = getenv("HUGETLB_DEFAULT_PAGE_SIZE");
@@ -3283,18 +3893,29 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     }
 #endif
 
+      /* stats related arguments */
+#if CMK_WITH_STATS
+    CmiGetArgStringDesc(*argv,"+gni_stats_root",&counters_dirname,"counter directory name, default counters");
+
     print_stats = CmiGetArgFlag(*argv, "+print_stats");
+    
+    stats_off = CmiGetArgFlag(*argv, "+stats_off");
+
+    init_comm_stats();
+#endif
 
     /* init DMA buffer for medium message */
 
     //_init_DMA_buffer();
     
     free(MPID_UGNI_AllAddr);
+
 #if CMK_SMP
     sendRdmaBuf = PCQueueCreate();
 #else
     sendRdmaBuf = 0;
 #endif
+
 #if MACHINE_DEBUG_LOG
     char ln[200];
     sprintf(ln,"debugLog.%d",myrank);
@@ -3304,8 +3925,14 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
 //    NTK_Init();
 //    ntk_return_t sts = NTK_System_GetSmpdCount(&_smpd_count);
 
-#if CMK_WITH_STATS
-    init_comm_stats();
+#if  REMOTE_EVENT
+    SHIFT = 1;
+    while (1<<SHIFT < mysize) SHIFT++;
+    CmiAssert(SHIFT < 31);
+    IndexPool_init(&ackPool);
+#if CMK_PERSISTENT_COMM
+    IndexPool_init(&persistPool);
+#endif
 #endif
 }
 
@@ -3350,7 +3977,10 @@ void* LrtsAlloc(int n_bytes, int header)
 
 void  LrtsFree(void *msg)
 {
-    int size = SIZEFIELD((char*)msg+sizeof(CmiChunkHeader));
+    CmiUInt4 size = SIZEFIELD((char*)msg+sizeof(CmiChunkHeader));
+#if CMK_PERSISTENT_COMM
+    if (IS_PERSISTENT_MEMORY(msg)) return;
+#endif
     if (size <= SMSG_MAX_MSG)
         free(msg);
     else {
@@ -3381,6 +4011,9 @@ void  LrtsFree(void *msg)
 void LrtsExit()
 {
 #if CMK_WITH_STATS
+#if CMK_SMP
+    if(CmiMyRank() == CmiMyNodeSize())
+#endif
     if (print_stats) print_comm_stats();
 #endif
     /* free memory ? */
@@ -3400,22 +4033,23 @@ void LrtsDrainResources()
            !SendBufferMsg(&smsg_oob_queue) ||
 #endif
            !SendBufferMsg(&smsg_queue) 
-#if PIGGYBACK_ACK
-        || !SendBufferMsg(&smsg_ack_queue)
-#endif
           )
     {
         if (useDynamicSMSG)
             PumpDatagramConnection();
         PumpNetworkSmsg();
-        PumpLocalRdmaTransactions();
+        PumpLocalTransactions(default_tx_cqh, default_tx_cq_lock);
+        PumpLocalTransactions(rdma_tx_cqh, rdma_tx_cq_lock);
+#if REMOTE_EVENT
+        PumpRemoteTransactions();
+#endif
         SendRdmaMsg();
     }
     PMI_Barrier();
 }
 
 void LrtsAbort(const char *message) {
-    printf("CmiAbort is calling on PE:%d\n", myrank);
+    fprintf(stderr, "[%d] CmiAbort: %s\n", myrank, message);
     CmiPrintStackTrace(0);
     PMI_Abort(-1, message);
 }
index dea2c659e9320861029e536f6c12338775eb16a6..a2b1c8fdda43b2db3b8ec43571262ad0ecafe9aa 100644 (file)
@@ -1,4 +1,5 @@
-XLC_TYPICAL_PRE=/opt/ibmcmp/vacpp/bg/11.1/
+XLC_TYPICAL_PRE=/soft/compilers/ibmcmp-feb2012/vacpp/bg/12.1
+XLF_TYPICAL_PRE=/soft/compilers/ibmcmp-feb2012/xlf/bg/14.1
 XLC_PRE=$XLC_TYPICAL_PRE
 
 XLC_TYPICAL_POST=bin/bg
@@ -10,14 +11,14 @@ then
   XLC_PRE=$BGQ_XLC_PRE
 fi
 
-XLC_F=$XLC_PRE/xlf/bg/11.1/
+XLC_F=$XLF_TYPICAL_PRE
 CMK_CC="$XLC_PRE/${XLC_POST}xlc -qcpluscmt -qhalt=e $BGQ_INC"
 CMK_CXX="$XLC_PRE/${XLC_POST}xlC -qhalt=e $BGQ_INC"
 CMK_LD="$CMK_CC"
 CMK_LDXX="$CMK_CXX"
 CMK_CF77="$XLC_F/${XLC_POST}xlf "
 CMK_CF90="$XLC_F/${XLC_POST}xlf90  -qsuffix=f=f90" 
-CMK_CF90_FIXED="$XLC_PRE/xlf/8.1/${XLC_POST}xlf90 " 
+CMK_CF90_FIXED="$XLC_F/${XLC_POST}xlf90 " 
 CMK_C_OPTIMIZE='-O3 -Q'
 CMK_CXX_OPTIMIZE='-O3 -Q'
 CMK_AR='ar cq'
index 7a484c143685896d39ef092b21afba13c0424678..f45eef114770be579577eb84b0397e18c7b765d4 100644 (file)
@@ -15,7 +15,7 @@ fi
 BGQ_BIN=$BGQ_FLOOR/gnu-linux/bin
 BGQ_INC="-I$BGQ_INSTALL/comm/gcc/include -I$BGQ_INSTALL/spi/include -I$BGQ_INSTALL -I$BGQ_INSTALL/spi/include/kernel/cnk/"
 
-BGQ_LIB="-L$BGQ_INSTALL/comm/xl.fast/lib -lmpich -lopa -lmpl -ldl -L$BGQ_INSTALL/comm/sys-fast/lib -lpami -L$BGQ_INSTALL/spi/lib -lSPI -lSPI_cnk -lpthread -lrt"
+BGQ_LIB="-L$BGQ_INSTALL/comm/xl/lib -lmpich -lopa -lmpl -ldl -L$BGQ_INSTALL/comm/sys-fast/lib -lpami -L$BGQ_INSTALL/spi/lib -lSPI -lSPI_cnk -lpthread -lrt"
 
 # test if compiler binary present
 if test ! -x $BGQ_BIN/powerpc64-bgq-linux-g++
diff --git a/src/arch/mpi/conv-mach-causalft.h b/src/arch/mpi/conv-mach-causalft.h
new file mode 100644 (file)
index 0000000..ca28aeb
--- /dev/null
@@ -0,0 +1,3 @@
+#define __FAULT__       1
+#define _FAULT_CAUSAL_      1
+#define CMK_CHARE_USE_PTR   1
index eaa3022bc2e08ab5cb480d2a447525b4c776aa60..6d3f524b353617c3371a7f4005c2815c65099c70 100644 (file)
@@ -1176,11 +1176,13 @@ void MachineExitForMPI() {
 #endif
 #endif
 
+   if(!CharmLibInterOperate) {
 #if ! CMK_AUTOBUILD
-    signal(SIGINT, signal_int);
-    MPI_Finalize();
+      signal(SIGINT, signal_int);
+      MPI_Finalize();
 #endif
-    exit(0);
+      exit(0);
+    }
 }
 
 static int machine_exit_idx;
@@ -1293,33 +1295,40 @@ static void MachineInitForMPI(int *argc, char ***argv, int *numNodes, int *myNod
 #endif
     }
 
+    if(!CharmLibInterOperate) {
 #if CMK_MPI_INIT_THREAD
 #if CMK_SMP
     if (Cmi_smp_mode_setting == COMM_THREAD_SEND_RECV)
-      thread_level = MPI_THREAD_FUNNELED;
-    else
-      thread_level = MPI_THREAD_MULTIPLE;
+        thread_level = MPI_THREAD_FUNNELED;
+      else
+        thread_level = MPI_THREAD_MULTIPLE;
 #else
-    thread_level = MPI_THREAD_SINGLE;
+      thread_level = MPI_THREAD_SINGLE;
 #endif
-    MPI_Init_thread(argc, argv, thread_level, &provided);
-    _thread_provided = provided;
+      MPI_Init_thread(argc, argv, thread_level, &provided);
+      _thread_provided = provided;
 #else
-    MPI_Init(argc, argv);
-    thread_level = 0;
-    _thread_provided = -1;
+      MPI_Init(argc, argv);
+      thread_level = 0;
+      _thread_provided = -1;
 #endif
+    }
+
     largc = *argc;
     largv = *argv;
-    MPI_Comm_size(MPI_COMM_WORLD, numNodes);
-    MPI_Comm_rank(MPI_COMM_WORLD, myNodeID);
+    if(!CharmLibInterOperate) {
+      MPI_Comm_size(MPI_COMM_WORLD, numNodes);
+      MPI_Comm_rank(MPI_COMM_WORLD, myNodeID);
+    }
 
     myNID = *myNodeID;
 
     MPI_Get_version(&ver, &subver);
-    if (myNID == 0) {
+    if(!CharmLibInterOperate) {
+      if (myNID == 0) {
         printf("Charm++> Running on MPI version: %d.%d\n", ver, subver);
         printf("Charm++> level of thread support used: %s (desired: %s)\n", thread_level_tostring(_thread_provided), thread_level_tostring(thread_level));
+      }
     }
 
 #if CMK_SMP
index 448dafbfc4ae78402c13a013df025fc8719607fb..fc3b576ae6d200752a4a8447aa97d7b3fed59474 100644 (file)
@@ -15,6 +15,7 @@ CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
 CMK_LIBS="-lckqt"
 CMK_RANLIB="ranlib"
 CMK_CC64=true
+CMK_WARNINGS_ARE_ERRORS="-Werror"
 
 # native compiler for compiling charmxi, etc
 CMK_NATIVE_CC="$CMK_CC $CMK_AMD64 -fPIC $CMK_DEFS "
index 66d3cc73b9f9f97686b8e489a12d532c23fac3c1..e493f46c9f5b09be0b7a97dde223af2bdad6a65d 100644 (file)
@@ -1,4 +1,4 @@
-CMK_DEFS=' -I. '
+CMK_DEFS=' -I.  -DCMK_FIND_FIRST_OF_PREDICATE=1 '
 CMK_CPP_CHARM="/usr/ccs/lib/cpp $CMK_DEFS"
 CMK_CPP_C="gcc -E $CMK_DEFS"
 CMK_CC="cc -m64 $CMK_DEFS"
index 0b04fc1eb62964a7e434fcbb25fb3097cc5672ff..28636fa05dd3479989b2530374ec9b7690149e9b 100644 (file)
@@ -4674,7 +4674,7 @@ void restart_node(int crashed_node){
        while(arg_argv[i]!= NULL){
                i++;
        }
-       restart_argv = (char **)malloc(sizeof(char *)*(i+3));
+       restart_argv = (char **)malloc(sizeof(char *)*(i+4));
        i=0;
        while(arg_argv[i]!= NULL){
                restart_argv[i] = arg_argv[i];
@@ -4683,7 +4683,8 @@ void restart_node(int crashed_node){
        restart_argv[i] = "+restartaftercrash";
         sprintf(phase_str,"%d", ++current_restart_phase);
        restart_argv[i+1]=phase_str;
-       restart_argv[i+2]=NULL;
+       restart_argv[i+2] = "+restartisomalloc";
+       restart_argv[i+3]=NULL;
 
        rsh_script(f,pe,crashed_node,restart_argv,1);
        fclose(f);
diff --git a/src/arch/pami-bluegeneq/cc-xlc.sh b/src/arch/pami-bluegeneq/cc-xlc.sh
new file mode 100644 (file)
index 0000000..c99c596
--- /dev/null
@@ -0,0 +1,32 @@
+XLC_TYPICAL_PRE=/soft/compilers/ibmcmp-feb2012/
+XLC_PRE=$XLC_TYPICAL_PRE
+
+XLC_TYPICAL_POST_BG=vacpp/bg/12.1/bin/bg
+XLC_TYPICAL_POST=vacpp/bg/12.1/bin
+XLC_POST=$XLC_TYPICAL_POST_BG
+
+# if no floor set, use typical floor path
+if test -n "$BGQ_XLC_PRE"
+then
+  XLC_PRE=$BGQ_XLC_PRE
+fi
+
+XLC_F=$XLC_PRE/xlf/bg/14.1/bin
+CMK_CC="$XLC_PRE/${XLC_POST}xlc_r -qcpluscmt -qhalt=e $BGQ_INC"
+CMK_CXX="$XLC_PRE/${XLC_POST}xlC_r -qhalt=e $BGQ_INC"
+CMK_LD="$CMK_CC"
+CMK_LDXX="$CMK_CXX"
+CMK_CF77="$XLC_F/bgxlf "
+CMK_CF90="$XLC_F/bgxlf90  -qsuffix=f=f90" 
+CMK_CF90_FIXED="$XLC_F/bgxlf90 " 
+CMK_C_OPTIMIZE='-O3 -Q'
+CMK_CXX_OPTIMIZE='-O3 -Q'
+CMK_AR='ar cq'
+CMK_NM='nm '
+CMK_QT="aix"
+#CMK_NATIVE_CC="/opt/ibmcmp/vacpp/bg/9.0/bin/xlc"
+#CMK_NATIVE_CXX="/opt/ibmcmp/vacpp/bg/9.0/bin/xlC"
+CMK_NATIVE_LD="$CMK_NATIVE_CC"
+CMK_NATIVE_LDXX="$CMK_NATIVE_CXX"
+CMK_RANLIB="ranlib"
+CMK_F90LIBS="-L$XLC_F/lib -lxlf90 -lxlopt -lxl -lxlfmath"
diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.h b/src/arch/pami-bluegeneq/conv-mach-smp.h
new file mode 100644 (file)
index 0000000..2649894
--- /dev/null
@@ -0,0 +1,25 @@
+
+#define CMK_SMP                                                   1
+
+#undef CMK_NODE_QUEUE_AVAILABLE
+#define CMK_NODE_QUEUE_AVAILABLE                           1
+
+#undef CMK_SHARED_VARS_UNAVAILABLE
+#undef CMK_SHARED_VARS_POSIX_THREADS_SMP
+#define CMK_SHARED_VARS_UNAVAILABLE                        0
+#define CMK_SHARED_VARS_POSIX_THREADS_SMP                  1
+
+/* Right now only the comm thread (no multicore) and the TLS thread version with gcc work on Blue Gene */
+#define CMK_MULTICORE                                      0
+
+#ifdef __GNUC__
+#define CMK_NOT_USE_TLS_THREAD                             0
+#else
+#define CMK_NOT_USE_TLS_THREAD                             1
+#endif
+
+#define CMK_PCQUEUE_LOCK                                   1
+
+#define CMK_SMP_NO_COMMTHD                                 1
+
+#define CMK_FAKE_SCHED_YIELD                               1
diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.sh b/src/arch/pami-bluegeneq/conv-mach-smp.sh
new file mode 100644 (file)
index 0000000..4a1f468
--- /dev/null
@@ -0,0 +1,3 @@
+
+#conv-mach-smp.sh:CMK_LD=/opt/ibmcmp/vacpp/bg/12.1/bin/bgxlc_r
+#conv-mach-smp.sh:CMK_LDXX=/opt/ibmcmp/vacpp/bg/12.1/bin/bgxlC_r 
diff --git a/src/arch/pami-bluegeneq/conv-mach.h b/src/arch/pami-bluegeneq/conv-mach.h
new file mode 100644 (file)
index 0000000..074a956
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef _CONV_MACH_H
+#define _CONV_MACH_H
+
+#define CMK_NO_OUTSTANDING_SENDS                          0
+
+#define CMK_64BIT                                          1
+
+//#define CMK_MEMORY_PREALLOCATE_HACK                        1
+
+//#define CMK_CONVERSE_MPI                                   1
+
+#define CMK_NO_SOCKETS                                    1
+
+#define CMK_DEFAULT_MAIN_USES_COMMON_CODE                  1
+
+#define CMK_GETPAGESIZE_AVAILABLE                          1
+
+#define CMK_IS_HETERO                                      0
+
+#define CMK_MALLOC_USE_GNU_MALLOC                          0
+#define CMK_MALLOC_USE_OS_BUILTIN                          1
+
+#define CMK_MEMORY_PAGESIZE                                8192
+#define CMK_MEMORY_PROTECTABLE                             1
+
+#define CMK_NODE_QUEUE_AVAILABLE                           0
+
+#define CMK_SHARED_VARS_UNAVAILABLE                        1
+#define CMK_SHARED_VARS_UNIPROCESSOR                       0
+
+#define CMK_SIGNAL_NOT_NEEDED                              0
+#define CMK_SIGNAL_USE_SIGACTION                           0
+#define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              1
+
+#define CMK_SYNCHRONIZE_ON_TCP_CLOSE                       0
+
+#define CMK_THREADS_USE_CONTEXT                            0
+#define CMK_THREADS_USE_JCONTEXT                           1
+#define CMK_THREADS_USE_PTHREADS                           0
+#define CMK_THREADS_ARE_WIN32_FIBERS                       0
+
+#define CMK_THREADS_REQUIRE_NO_CPV                         0
+
+#define CMK_TIMER_USE_GETRUSAGE                            0
+#define CMK_TIMER_USE_SPECIAL                              0
+#define CMK_TIMER_USE_TIMES                                0
+// This needs to be compiled with gcc only
+#define CMK_TIMER_USE_BLUEGENEQ                                   1
+
+#define CMK_TYPEDEF_INT2 short
+#define CMK_TYPEDEF_INT4 int
+#define CMK_TYPEDEF_INT8 long long
+#define CMK_TYPEDEF_UINT2 unsigned short
+#define CMK_TYPEDEF_UINT4 unsigned int
+#define CMK_TYPEDEF_UINT8 unsigned long long
+#define CMK_TYPEDEF_FLOAT4 float
+#define CMK_TYPEDEF_FLOAT8 double
+
+#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
+
+
+#define CMK_WEB_MODE                                       1
+#define CMK_DEBUG_MODE                                     0
+
+#define CMK_LBDB_ON                                       1
+
+#undef CMK_CCS_AVAILABLE
+#define CMK_CCS_AVAILABLE                                 0
+
+#define CMK_BLUEGENEQ                                      1
+
+#define CMK_NO_ISO_MALLOC                                  1
+
+#endif
+
diff --git a/src/arch/pami-bluegeneq/conv-mach.sh b/src/arch/pami-bluegeneq/conv-mach.sh
new file mode 100644 (file)
index 0000000..ba10ce6
--- /dev/null
@@ -0,0 +1,61 @@
+BGQ_TYPICAL_FLOOR=/bgsys/drivers/ppcfloor
+
+# if no floor set, use typical floor path
+if test -z "$BGQ_FLOOR"
+then
+  BGQ_FLOOR=$BGQ_TYPICAL_FLOOR
+fi
+
+# if no install path (for experimental) set, use floor
+if test -z "$BGQ_INSTALL"
+then
+  BGQ_INSTALL=$BGQ_TYPICAL_FLOOR
+fi
+
+BGQ_BIN=$BGQ_FLOOR/gnu-linux/bin
+BGQ_INC="-I$BGQ_INSTALL/comm/sys/include -I$BGQ_INSTALL/spi/include -I$BGQ_INSTALL -I$BGQ_INSTALL/spi/include/kernel/cnk/"
+
+BGQ_LIB="-L$BGQ_INSTALL/comm/sys/lib -lpami -L$BGQ_INSTALL/spi/lib -lSPI -lSPI_cnk -lpthread -lrt" 
+#"-pg -L/bghome/boger/sandbox/src-110606/bgq/work/gnu-linux/powerpc64-bgq-linux/lib -lc"
+
+# test if compiler binary present
+if test ! -x $BGQ_BIN/powerpc64-bgq-linux-g++
+then
+ echo "ERROR: Invalid BGQ_INSTALL or BGQ_FLOOR, C/C++ compiler missing"
+ exit 1
+fi
+
+OPTS_CPP="$OPTS_CPP"   # self-assignment: keeps whatever the caller already set
+GCC_OPTS="-Wno-deprecated $BGQ_INC"   # options shared by the C and C++ cross compilers below
+OPTS_LD="$OPTS_LD"   # self-assignment: keeps whatever the caller already set
+
+CMK_CPP_CHARM="$BGQ_BIN/powerpc64-bgq-linux-cpp -P"
+CMK_CPP_C="$BGQ_BIN/powerpc64-bgq-linux-cpp -E "
+CMK_CXX="$BGQ_BIN/powerpc64-bgq-linux-g++ $GCC_OPTS "
+CMK_GCXX="$BGQ_BIN/powerpc64-bgq-linux-g++ $GCC_OPTS "
+CMK_CC="$BGQ_BIN/powerpc64-bgq-linux-gcc $GCC_OPTS "
+CMK_CXXPP="$BGQ_BIN/powerpc64-bgq-linux-g++ -E "
+CMK_CF77="$BGQ_BIN/powerpc64-bgq-linux-gfortran "
+CMK_CF90='f90'   # NOTE(review): unlike CMK_CF77 this is not taken from $BGQ_BIN -- confirm it is intended
+CMK_RANLIB="$BGQ_BIN/powerpc64-bgq-linux-ranlib "
+CMK_AR="$BGQ_BIN/powerpc64-bgq-linux-ar q "
+CMK_SYSLIBS="$BGQ_LIB"   # PAMI, SPI, pthread and rt libraries assembled in BGQ_LIB
+CMK_LIBS='-lckqt'
+CMK_LD="$CMK_CC"   # link through the cross C compiler driver
+CMK_LDXX="$CMK_CXX"   # link through the cross C++ compiler driver
+CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
+#CMK_SEQ_LIBS=''
+#CMK_SEQ_CC="$BGQ_BIN/powerpc64-bgq-linux-gcc -Wno-deprecated "
+#CMK_SEQ_LD="$CMK_SEQ_CC"
+#CMK_SEQ_CXX="$BGQ_BIN/powerpc64-bgq-linux-g++ -Wno-deprecated "
+#CMK_SEQ_LDXX="$CMK_SEQ_CXX"
+CMK_NATIVE_CC='gcc '   # CMK_NATIVE_* name the plain gcc/g++ found on PATH, not the cross toolchain
+CMK_NATIVE_LD='gcc '
+CMK_NATIVE_CXX='g++ -Wno-deprecated '
+CMK_NATIVE_LDXX='g++'
+CMK_F90LIBS='-lf90math -lfio -lU77 -lf77math '
+CMK_MOD_NAME_ALLCAPS=1
+CMK_MOD_EXT="mod"
+CMK_F90_USE_MODDIR=1
+CMK_F90_MODINC="-p"
+CMK_QT="generic64"
diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami/Makefile.machine
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/arch/pami/conv-common.h b/src/arch/pami/conv-common.h
new file mode 100644 (file)
index 0000000..a3686b4
--- /dev/null
@@ -0,0 +1,36 @@
+
+#define CMK_CMIDELIVERS_USE_COMMON_CODE                    1
+
+#define CMK_CMIPRINTF_IS_A_BUILTIN                         0
+
+#define CMK_HANDLE_SIGUSR                                  1
+
+#define CMK_MSG_HEADER_EXT_    CmiUInt2 rank, hdl,xhdl,info, stratid; unsigned char cksum, magic; int root, size; CmiUInt2 redID, padding; 
+#define CMK_MSG_HEADER_BASIC  CMK_MSG_HEADER_EXT
+#define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
+#define CMK_MSG_HEADER_BIGSIM_    { CMK_MSG_HEADER_EXT_ CMK_BIGSIM_FIELDS }
+
+#define CMK_MULTICAST_GROUP_TYPE                struct { unsigned pe, id; }
+#define CMK_MULTICAST_DEF_USE_COMMON_CODE                  1
+#define CMK_MULTICAST_LIST_USE_COMMON_CODE                 0
+#define CMK_MULTICAST_GROUP_USE_COMMON_CODE                1
+
+#define CMK_RSH_IS_A_COMMAND                               0
+#define CMK_RSH_NOT_NEEDED                                 1
+#define CMK_RSH_USE_REMSH                                  0
+
+#define CMK_SPANTREE_MAXSPAN                               4
+#define CMK_SPANTREE_USE_COMMON_CODE                       1
+
+#define CMK_VECTOR_SEND_USES_COMMON_CODE                   1
+
+#define CMK_CCS_AVAILABLE                                  1
+
+#define NODE_0_IS_CONVHOST                                 1
+
+//#define CMK_IMMEDIATE_MSG                               1
+#define CMK_MACHINE_PROGRESS_DEFINED                       0
+
+//#define CMI_DIRECT_MANY_TO_MANY_DEFINED                    0
+
+#define CMK_PERSISTENT_COMM                                0
diff --git a/src/arch/pami/conv-common.sh b/src/arch/pami/conv-common.sh
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/arch/pami/machine.c b/src/arch/pami/machine.c
new file mode 100644 (file)
index 0000000..23c64aa
--- /dev/null
@@ -0,0 +1,1465 @@
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <string.h>
+#include "machine.h"
+#include "converse.h"
+#include "pcqueue.h"
+#include "assert.h"
+#include "malloc.h"
+
+#include <hwi/include/bqc/A2_inlines.h>
+#include "pami.h"
+#include "pami_sys.h"
+
+char *ALIGN_32(char *p) {
+  return (char *)(((unsigned long)p + 31UL) & ~31UL);   // round the address up to the next multiple of 32
+}
+
+CpvDeclare(PCQueue, broadcast_q);                 //queue to send broadcast messages
+#if CMK_NODE_QUEUE_AVAILABLE
+CsvDeclare(PCQueue, node_bcastq);
+CsvDeclare(CmiNodeLock, node_bcastLock);
+#endif
+
+//#define ENABLE_BROADCAST_THROTTLE 1
+
+/*To reduce the buffer usage of broadcasts and to move load away from the
+  broadcasting node, define CMK_BROADCAST_SPANNING_TREE to enforce the use of
+  the spanning-tree broadcast algorithm.
+  The `root` field of the message header (see CMI_BROADCAST_ROOT) is then used
+  as the indicator of the spanning-tree root.
+*/
+#if CMK_SMP
+#define CMK_BROADCAST_SPANNING_TREE    1
+#else
+#define CMK_BROADCAST_SPANNING_TREE    1
+#endif /* CMK_SMP */
+
+#define BROADCAST_SPANNING_FACTOR     2
+
+//The root of the message infers the type of the message
+// 1. root is 0, then it is a normal point-to-point message
+// 2. root is larger than 0 (>=1), then it is a broadcast message across all processors (cores)
+// 3. root is less than 0 (<=-1), then it is a broadcast message across all nodes
+#define CMI_BROADCAST_ROOT(msg)          ((CmiMsgHeaderBasic *)msg)->root
+#define CMI_IS_BCAST_ON_CORES(msg) (CMI_BROADCAST_ROOT(msg) > 0)
+#define CMI_IS_BCAST_ON_NODES(msg) (CMI_BROADCAST_ROOT(msg) < 0)
+#define CMI_GET_CYCLE(msg)               ((CmiMsgHeaderBasic *)msg)->root
+
+#define CMI_DEST_RANK(msg)               ((CmiMsgHeaderBasic *)msg)->rank
+#define CMI_MAGIC(msg)                   ((CmiMsgHeaderBasic *)msg)->magic
+
+/* FIXME: need a random number that everyone agrees on! */
+#define CHARM_MAGIC_NUMBER               126
+
+#if !CMK_OPTIMIZE
+static int checksum_flag = 0;
+extern unsigned char computeCheckSum(unsigned char *data, int len);
+
+#define CMI_SET_CHECKSUM(msg, len)      \
+        if (checksum_flag)  {   \
+          ((CmiMsgHeaderBasic *)msg)->cksum = 0;        \
+          ((CmiMsgHeaderBasic *)msg)->cksum = computeCheckSum((unsigned char*)msg, len);        \
+        }
+
+#define CMI_CHECK_CHECKSUM(msg, len)    \
+        if (checksum_flag)      \
+          if (computeCheckSum((unsigned char*)msg, len) != 0)  { \
+            printf("\n\n------------------------------\n\nReceiver %d size %d:", CmiMyPe(), len); \
+            for(count = 0; count < len; count++) { \
+                printf("%2x", msg[count]);                 \
+            }                                             \
+            printf("------------------------------\n\n"); \
+            CmiAbort("Fatal error: checksum doesn't agree!\n"); \
+          }
+#else
+#define CMI_SET_CHECKSUM(msg, len)
+#define CMI_CHECK_CHECKSUM(msg, len)
+#endif
+
+#define CMI_SET_BROADCAST_ROOT(msg, root)  CMI_BROADCAST_ROOT(msg) = (root);
+
+#  define CMI_SET_CYCLE(msg, cycle)
+
+int               _Cmi_numpes;
+int               _Cmi_mynode;    /* Which address space am I */
+int               _Cmi_mynodesize;/* Number of processors in my address space */
+int               _Cmi_numnodes;  /* Total number of address spaces */
+int                Cmi_nodestart; /* First processor in this address space */
+CpvDeclare(void*, CmiLocalQueue);
+
+
+#if CMK_NODE_QUEUE_AVAILABLE
+#define SMP_NODEMESSAGE   (0xFB) // rank of the node message when node queue
+// is available
+#define NODE_BROADCAST_OTHERS (-1)
+#define NODE_BROADCAST_ALL    (-2)
+#endif
+
+
+typedef struct ProcState {   // per-rank locks; one entry per rank is allocated in ConverseInit
+    /* PCQueue      sendMsgBuf; */      /* per processor message sending queue */
+    CmiNodeLock  recvLock;              /* for cs->recv */
+    CmiNodeLock bcastLock;              /* taken around pushes/pops on this rank's broadcast_q */
+} ProcState;
+
+static ProcState  *procState;
+
+#if CMK_SMP && !CMK_MULTICORE
+//static volatile int commThdExit = 0;
+//static CmiNodeLock commThdExitLock = 0;
+#endif
+
+void ConverseRunPE(int everReturn);
+static void CommunicationServer(int sleepTime);
+static void CommunicationServerThread(int sleepTime);
+
+static void CmiNetworkBarrier();
+
+//So far we dont define any comm threads
+int Cmi_commthread = 0;
+
+#include "machine-smp.c"
+CsvDeclare(CmiNodeState, NodeState);
+#include "immediate.c"
+
+void AdvanceCommunications();
+
+
+#if !CMK_SMP
+/************ non SMP **************/
+static struct CmiStateStruct Cmi_state;
+int _Cmi_mype;
+int _Cmi_myrank;
+
+void CmiMemLock(void) {}
+void CmiMemUnlock(void) {}
+
+#define CmiGetState() (&Cmi_state)
+#define CmiGetStateN(n) (&Cmi_state)
+
+//void CmiYield(void) { sleep(0); }
+
+static void CmiStartThreads(char **argv) {   // non-SMP variant: initialises the single Cmi_state; argv is not used
+    CmiStateInit(Cmi_nodestart, 0, &Cmi_state);
+    _Cmi_mype = Cmi_nodestart;   // the only PE in this address space is the node's first PE
+    _Cmi_myrank = 0;
+}
+#endif  /* !CMK_SMP */
+
+//int received_immediate;
+//int received_broadcast;
+
+/* Add msg to the receive queue of rank `pe` on this node (pe is a rank, not a global PE number) and notify that rank's idle lock */
+void CmiPushPE(int pe,void *msg) {
+    CmiState cs = CmiGetStateN(pe);
+    MACHSTATE2(3,"Pushing message into rank %d's queue %p{",pe, cs->recv);
+#if CMK_IMMEDIATE_MSG
+    if (CmiIsImmediate(msg)) {   // immediate messages bypass cs->recv entirely
+        /**(CmiUInt2 *)msg = pe;*/
+        //received_immediate = 1;
+        //printf("PushPE: N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
+        //CMI_DEST_RANK(msg) = pe;
+        CmiPushImmediateMsg(msg);
+        return;
+    }
+#endif
+#if CMK_SMP
+    //CmiLock(procState[pe].recvLock);
+#endif
+
+    PCQueuePush(cs->recv,(char *)msg);   // note: the recvLock calls around this push are commented out
+    //printf("%d: PCQueue length = %d, msg = %x\n", CmiMyPe(), PCQueueLength(cs->recv), msg);
+
+#if CMK_SMP
+    //CmiUnlock(procState[pe].recvLock);
+#endif
+    CmiIdleLock_addMessage(&cs->idle);
+    MACHSTATE1(3,"} Pushing message into rank %d's queue done",pe);
+}
+
+#if CMK_NODE_QUEUE_AVAILABLE
+/* Add msg to the node-level receive queue (NodeState.NodeRecv) and notify rank 0's idle lock */
+static void CmiPushNode(void *msg) {
+    MACHSTATE(3,"Pushing message into NodeRecv queue");
+#if CMK_IMMEDIATE_MSG
+    if (CmiIsImmediate(msg)) {   // immediate messages bypass the node queue
+        //printf("PushNode: N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
+        //CMI_DEST_RANK(msg) = 0;
+        CmiPushImmediateMsg(msg);
+        return;
+    }
+#endif
+    CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);   // the push is serialised with CmiNodeRecvLock
+    PCQueuePush(CsvAccess(NodeState).NodeRecv,msg);
+    CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+    {
+        CmiState cs=CmiGetStateN(0);   // only rank 0 is notified about the new node message
+        CmiIdleLock_addMessage(&cs->idle);
+    }
+}
+#endif /* CMK_NODE_QUEUE_AVAILABLE */
+
+#define MAX_NUM_CONTEXTS  16
+
+#if CMK_SMP 
+#define CMK_PAMI_MULTI_CONTEXT  0
+#else
+#define CMK_PAMI_MULTI_CONTEXT  0
+#endif
+
+#if CMK_PAMI_MULTI_CONTEXT
+volatile int msgQueueLen [MAX_NUM_CONTEXTS];
+volatile int outstanding_recvs [MAX_NUM_CONTEXTS];
+#define  MY_CONTEXT_ID() (CmiMyRank() >> 2)
+#define  MY_CONTEXT()    (cmi_pami_contexts[CmiMyRank() >> 2])
+
+#define  INCR_MSGQLEN()  (msgQueueLen[CmiMyRank() >> 2] ++)
+#define  DECR_MSGQLEN()  (msgQueueLen[CmiMyRank() >> 2] --)
+#define  MSGQLEN()       (msgQueueLen[CmiMyRank() >> 2])
+#define  INCR_ORECVS()   (outstanding_recvs[CmiMyRank() >> 2] ++)
+#define  DECR_ORECVS()   (outstanding_recvs[CmiMyRank() >> 2] --)
+#define  ORECVS()        (outstanding_recvs[CmiMyRank() >> 2])
+#else
+volatile int msgQueueLen;
+volatile int outstanding_recvs;
+#define  MY_CONTEXT_ID() (0)
+#define  MY_CONTEXT()    (cmi_pami_contexts[0])
+
+#define  INCR_MSGQLEN()  (msgQueueLen ++)
+#define  DECR_MSGQLEN()  (msgQueueLen --)
+#define  MSGQLEN()       (msgQueueLen)
+#define  INCR_ORECVS()   (outstanding_recvs ++)
+#define  DECR_ORECVS()   (outstanding_recvs --)
+#define  ORECVS()        (outstanding_recvs)
+#endif
+
+static char     **Cmi_argv;
+static char     **Cmi_argvcopy;
+static CmiStartFn Cmi_startfn;   /* The start function */
+static int        Cmi_usrsched;  /* Continue after start function finishes? */
+
+extern void ConverseCommonInit(char **argv);
+extern void ConverseCommonExit(void);
+extern void CthInit(char **argv);
+
+static void SendMsgsUntil(int);
+
+
+void SendSpanningChildren(int size, char *msg);
+#if CMK_NODE_QUEUE_AVAILABLE
+void SendSpanningChildrenNode(int size, char *msg);
+#endif
+
+typedef struct {   // idle-tracking record created by CmiNotifyGetState
+    int sleepMs; /*Milliseconds to sleep while idle*/
+    int nIdles; /*Number of times we've been idle in a row*/
+    CmiState cs; /*Machine state*/
+} CmiIdleState;
+
+static CmiIdleState *CmiNotifyGetState(void) {   // returns a freshly CmiAlloc'ed, zeroed idle record for the calling PE
+    CmiIdleState *s=(CmiIdleState *)CmiAlloc(sizeof(CmiIdleState));   // nothing in this function frees it
+    s->sleepMs=0;
+    s->nIdles=0;
+    s->cs=CmiGetState();   // machine state of the caller
+    return s;
+}
+
+
+static void send_done(pami_context_t ctxt, void *data, pami_result_t result)   // installed as events.local_fn in CmiGeneralFreeSendN
+{
+  CmiFree(data);   // data is the message pointer that was passed as events.cookie
+  DECR_MSGQLEN();   // balances the INCR_MSGQLEN() done before PAMI_Send
+}
+
+
+static void recv_done(pami_context_t ctxt, void *clientdata, pami_result_t result) 
+/* Receive-completion callback: clientdata is the message buffer set up by pkt_dispatch. Checks the message, queues a copy of broadcasts for forwarding, then pushes it to a receive queue */
+{
+    char *msg = (char *) clientdata;
+    int sndlen = ((CmiMsgHeaderBasic *) msg)->size;   // length is read from the message header, not passed in
+
+    //fprintf (stderr, "%d Recv message done \n", CmiMyPe());
+    /* then we do what PumpMsgs used to do:
+     * push msg to recv queue */
+    int count=0;   // loop index referenced by name inside the CMI_CHECK_CHECKSUM macro
+    CMI_CHECK_CHECKSUM(msg, sndlen);
+    if (CMI_MAGIC(msg) != CHARM_MAGIC_NUMBER) { /* received a non-charm msg */
+        CmiAbort("Charm++ Warning: Non Charm++ Message Received. \n");   // NOTE(review): worded as a warning, but CmiAbort is called
+        return;
+    }
+
+#if CMK_BROADCAST_SPANNING_TREE 
+    if (CMI_IS_BCAST_ON_CORES(msg) ) {   // root > 0: broadcast across processors
+      int pe = CmiMyRank(); //CMI_DEST_RANK(msg);
+        //printf ("%d: Receiving bcast message from %d with %d bytes for %d\n", CmiMyPe(), CMI_BROADCAST_ROOT(msg), sndlen, pe);
+        char *copymsg;
+        copymsg = (char *)CmiAlloc(sndlen);   // the copy is queued for forwarding; the original is delivered below
+        CmiMemcpy(copymsg,msg,sndlen);
+
+        //received_broadcast = 1;
+#if CMK_SMP
+        CmiLock(procState[pe].bcastLock);
+        PCQueuePush(CpvAccessOther(broadcast_q, pe), copymsg);
+        CmiUnlock(procState[pe].bcastLock);
+#else
+        PCQueuePush(CpvAccess(broadcast_q), copymsg);
+#endif
+    }
+#endif
+
+#if CMK_NODE_QUEUE_AVAILABLE
+#if CMK_BROADCAST_SPANNING_TREE
+    if (CMI_IS_BCAST_ON_NODES(msg)) {   // root < 0: broadcast across nodes
+        //printf ("%d: Receiving node bcast message from %d with %d bytes for %d\n", CmiMyPe(), CMI_BROADCAST_ROOT(msg), sndlen, CMI_DEST_RANK(msg));
+        char *copymsg = (char *)CmiAlloc(sndlen);
+        CmiMemcpy(copymsg,msg,sndlen);
+        //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
+        CmiLock(CsvAccess(node_bcastLock));
+        PCQueuePush(CsvAccess(node_bcastq), copymsg);
+        CmiUnlock(CsvAccess(node_bcastLock));
+        //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+    }
+#endif
+    if (CMI_DEST_RANK(msg) == SMP_NODEMESSAGE)   // rank value SMP_NODEMESSAGE marks a node-level message
+      CmiPushNode(msg);
+    else
+#endif
+      CmiPushPE(CMI_DEST_RANK(msg), (void *)msg);   // deliver to the rank named in the header
+
+    DECR_ORECVS();   // balances INCR_ORECVS() in pkt_dispatch
+}
+
+static void pkt_dispatch (pami_context_t       context,      /**< IN: PAMI context */
+                         void               * clientdata,   /**< IN: dispatch cookie */
+                         const void         * header_addr,  /**< IN: header address */
+                         size_t               header_size,  /**< IN: header size */
+                         const void         * pipe_addr,    /**< IN: address of PAMI pipe buffer */
+                         size_t               pipe_size,    /**< IN: size of PAMI pipe buffer */
+                         pami_endpoint_t      origin,
+                         pami_recv_t         * recv)        /**< OUT: receive message structure */
+{   // registered for CMI_PAMI_DISPATCH in ConverseInit; allocates a buffer of pipe_size bytes for the incoming message
+    //fprintf (stderr, "Received Message of size %d %p\n", pipe_size, recv);
+    INCR_ORECVS();    
+    int alloc_size = pipe_size;   // size_t value stored in an int
+    char * buffer  = (char *)CmiAlloc(alloc_size);
+
+    if (recv) {   // receive descriptor supplied: fill it in so recv_done runs with buffer as its cookie
+      recv->local_fn = recv_done;
+      recv->cookie   = buffer;
+      recv->type     = PAMI_TYPE_BYTE;
+      recv->addr     = buffer;
+      recv->offset   = 0;
+      recv->data_fn  = PAMI_DATA_COPY;
+    }
+    else {   // no descriptor: copy the payload from pipe_addr and complete right away
+      memcpy (buffer, pipe_addr, pipe_size);
+      recv_done (NULL, buffer, PAMI_SUCCESS);
+    }
+}
+
+
+#if CMK_NODE_QUEUE_AVAILABLE
+void sendBroadcastMessagesNode() {   // drains node_bcastq, forwarding each queued copy with SendSpanningChildrenNode
+    if (PCQueueLength(CsvAccess(node_bcastq))==0) return;   // length is checked without taking node_bcastLock
+    //node broadcast message could be always handled by any cores (including
+    //comm thd) on this node
+    //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
+    CmiLock(CsvAccess(node_bcastLock));
+    char *msg = PCQueuePop(CsvAccess(node_bcastq));
+    CmiUnlock(CsvAccess(node_bcastLock));
+    //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+    while (msg) {   // the lock is held only around each pop, not while forwarding
+        //printf("sendBroadcastMessagesNode: node %d rank %d with msg root %d\n", CmiMyNode(), CmiMyRank(), CMI_BROADCAST_ROOT(msg));
+        SendSpanningChildrenNode(((CmiMsgHeaderBasic *) msg)->size, msg);
+        CmiFree(msg);   // the queued copy is released after forwarding
+        //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
+        CmiLock(CsvAccess(node_bcastLock));
+        msg = PCQueuePop(CsvAccess(node_bcastq));
+        CmiUnlock(CsvAccess(node_bcastLock));
+        //CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+    }
+}
+#endif
+
+void sendBroadcastMessages() {   // drains the calling PE's broadcast_q, forwarding each queued copy with SendSpanningChildren
+  PCQueue toPullQ;
+  toPullQ = CpvAccess(broadcast_q);
+
+  if (PCQueueLength(toPullQ)==0) return;   // length is checked before bcastLock is taken
+#if CMK_SMP
+  CmiLock(procState[CmiMyRank()].bcastLock);
+#endif
+
+    char *msg = (char *) PCQueuePop(toPullQ);
+
+#if CMK_SMP
+    CmiUnlock(procState[CmiMyRank()].bcastLock);
+#endif
+
+    while (msg) {   // in SMP builds the lock is held only around each pop
+
+#if CMK_BROADCAST_SPANNING_TREE
+        SendSpanningChildren(((CmiMsgHeaderBasic *) msg)->size, msg);
+#endif
+
+        CmiFree (msg);   // the queued copy is released after forwarding
+
+#if CMK_SMP
+        CmiLock(procState[CmiMyRank()].bcastLock);
+#endif
+
+        msg = (char *) PCQueuePop(toPullQ);
+
+#if CMK_SMP
+        CmiUnlock(procState[CmiMyRank()].bcastLock);
+#endif
+    }
+}
+
+
+// Approximate sleep: spin until GetTimeBase() has advanced by about `cycles`
+size_t mysleep_iter = 0;
+void mysleep (unsigned long cycles) {
+    unsigned long now = GetTimeBase();
+    const unsigned long deadline = now + cycles;
+
+    for (; now < deadline; now = GetTimeBase()) {
+      // each pass only bumps the global spin counter
+      ++mysleep_iter;
+    }
+
+    return;
+}
+
+static void * test_buf;
+
+volatile int pami_barrier_flag = 0;
+
+void pami_barrier_done (void *ctxt, void * clientdata, pami_result_t err)
+{
+  // clientdata points at the caller's int counter; one completion decrements it by one
+  --*(int *) clientdata;
+}
+
+pami_client_t      cmi_pami_client;
+pami_context_t   * cmi_pami_contexts;
+size_t             cmi_pami_numcontexts;
+pami_geometry_t    world_geometry;
+pami_xfer_t        pami_barrier;
+char clientname[] = "Converse";
+
+#define CMI_PAMI_DISPATCH   10
+
+#include "malloc.h"
+
+void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) {   // creates the PAMI client/contexts, sets up the barrier, then starts threads and runs the PE
+    int n, i, count;   // NOTE(review): n and count are not used in this function
+
+    /* processors per node: defaults to 1, overridden by the +ppn argument */
+    _Cmi_mynodesize = 1;
+    CmiGetArgInt(argv,"+ppn", &_Cmi_mynodesize);
+#if ! CMK_SMP
+    if (_Cmi_mynodesize > 1 && _Cmi_mynode == 0)   // NOTE(review): _Cmi_mynode is only assigned from PAMI_CLIENT_TASK_ID further down
+      CmiAbort("+ppn cannot be used in non SMP version!\n");
+#endif
+    
+    PAMI_Client_create (clientname, &cmi_pami_client, NULL, 0);   // return value is not checked
+    size_t _n = 1;   // number of PAMI contexts to create
+#if CMK_PAMI_MULTI_CONTEXT
+    if ((_Cmi_mynodesize % 4) == 0)
+      _n = _Cmi_mynodesize / 4;  //have a context for each four threads
+    else
+      _n = 1 + (_Cmi_mynodesize / 4);  //have a context for each four threads
+#endif
+
+    cmi_pami_contexts = (pami_context_t *) malloc (sizeof(pami_context_t) * _n);
+    PAMI_Context_createv (cmi_pami_client, NULL, 0, cmi_pami_contexts, _n);
+    cmi_pami_numcontexts = _n;
+
+    pami_configuration_t configuration;
+    pami_result_t result;
+    
+    configuration.name = PAMI_CLIENT_TASK_ID;
+    result = PAMI_Client_query(cmi_pami_client, &configuration, 1);
+    _Cmi_mynode = configuration.value.intval;   // this task's id becomes the node number
+
+    configuration.name = PAMI_CLIENT_NUM_TASKS;
+    result = PAMI_Client_query(cmi_pami_client, &configuration, 1);
+    _Cmi_numnodes = configuration.value.intval;   // number of tasks becomes the node count
+
+    pami_dispatch_hint_t options = (pami_dispatch_hint_t) {0};
+    pami_dispatch_callback_function pfn;
+    pfn.p2p = pkt_dispatch;
+    for (i = 0; i < _n; ++i)   // register pkt_dispatch under CMI_PAMI_DISPATCH on every context
+      PAMI_Dispatch_set (cmi_pami_contexts[i],
+                        CMI_PAMI_DISPATCH,
+                        pfn,
+                        NULL,
+                        options);
+    
+    //fprintf(stderr, "%d Initializing Converse PAMI machine Layer on %d tasks\n", _Cmi_mynode, _Cmi_numnodes);
+
+    ///////////---------------------------------/////////////////////
+    //////////----------- Initialize Barrier -------////////////////
+    size_t               num_algorithm[2];   // [0] sizes the always_works lists, [1] the must_query lists
+    pami_algorithm_t    *always_works_algo = NULL;
+    pami_metadata_t     *always_works_md = NULL;
+    pami_algorithm_t    *must_query_algo = NULL;
+    pami_metadata_t     *must_query_md = NULL;
+    pami_xfer_type_t     xfer_type = PAMI_XFER_BARRIER;
+
+    /* Docs01:  Get the World Geometry */
+    result = PAMI_Geometry_world (cmi_pami_client,&world_geometry);
+    if (result != PAMI_SUCCESS)
+      {
+       fprintf (stderr, "Error. Unable to get world geometry: result = %d\n", result);
+       return;   // NOTE(review): returns to the caller without finishing initialisation
+      }
+
+    result = PAMI_Geometry_algorithms_num(world_geometry,
+                                         xfer_type,
+                                         (size_t*)num_algorithm);
+
+    if (result != PAMI_SUCCESS || num_algorithm[0]==0)
+      {
+       fprintf (stderr,
+                "Error. Unable to query algorithm, or no algorithms available result = %d\n",
+                result);
+       return;
+      }
+
+    always_works_algo = (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*num_algorithm[0]);   // NOTE(review): these four arrays are not freed in this function
+    always_works_md  = (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*num_algorithm[0]);
+    must_query_algo   = (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*num_algorithm[1]);
+    must_query_md    = (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*num_algorithm[1]);
+
+    /* Docs05:  Query the algorithm lists */
+    result = PAMI_Geometry_algorithms_query(world_geometry,
+                                           xfer_type,
+                                           always_works_algo,
+                                           always_works_md,
+                                           num_algorithm[0],
+                                           must_query_algo,
+                                           must_query_md,
+                                           num_algorithm[1]);
+    pami_barrier.cb_done   = pami_barrier_done;
+    pami_barrier.cookie    = (void*) & pami_barrier_flag;
+    pami_barrier.algorithm = always_works_algo[0];   // first always_works algorithm; read before result is checked below
+
+    /* Docs06:  Check the result of the algorithm-list query */
+    if (result != PAMI_SUCCESS)
+      {
+       fprintf (stderr, "Error. Unable to get query algorithm. result = %d\n", result);
+       return;
+      }
+
+    CmiNetworkBarrier();   // the barrier is executed three times in a row
+    CmiNetworkBarrier();
+    CmiNetworkBarrier();
+
+    _Cmi_numpes = _Cmi_numnodes * _Cmi_mynodesize;
+    Cmi_nodestart = _Cmi_mynode * _Cmi_mynodesize;   // global PE number of this node's first rank
+    Cmi_argvcopy = CmiCopyArgs(argv);
+    Cmi_argv = argv;
+    Cmi_startfn = fn;
+    Cmi_usrsched = usched;
+
+    /* checksum flag */
+    if (CmiGetArgFlag(argv,"+checksum")) {
+#if !CMK_OPTIMIZE
+        checksum_flag = 1;
+        if (_Cmi_mynode == 0) CmiPrintf("Charm++: CheckSum checking enabled! \n");
+#else
+        if (_Cmi_mynode == 0) CmiPrintf("Charm++: +checksum ignored in optimized version! \n");
+#endif
+    }
+
+    CsvInitialize(CmiNodeState, NodeState);
+    CmiNodeStateInit(&CsvAccess(NodeState));
+
+#if CMK_NODE_QUEUE_AVAILABLE
+    CsvInitialize(PCQueue, node_bcastq);
+    CsvAccess(node_bcastq) = PCQueueCreate();
+    CsvInitialize(CmiNodeLock, node_bcastLock);
+    CsvAccess(node_bcastLock) = CmiCreateLock();
+#endif
+
+    int actualNodeSize = _Cmi_mynodesize;
+#if !CMK_MULTICORE
+    actualNodeSize++; //considering the extra comm thread
+#endif
+
+    procState = (ProcState *)CmiAlloc((actualNodeSize) * sizeof(ProcState));
+    for (i=0; i<actualNodeSize; i++) {   // one pair of locks per rank
+        /*    procState[i].sendMsgBuf = PCQueueCreate();   */
+        procState[i].recvLock = CmiCreateLock();
+        procState[i].bcastLock = CmiCreateLock();
+    }
+
+#if CMK_SMP && !CMK_MULTICORE
+    //commThdExitLock = CmiCreateLock();
+#endif
+
+    //printf ("Starting Threads\n");
+    CmiStartThreads(argv);
+    ConverseRunPE(initret);   // initret is passed on as ConverseRunPE's everReturn flag
+}
+
+
+int PerrorExit (char *err) {   /* Print the message `err` to stderr and terminate the process with status -1 */
+  fprintf (stderr, "%s\n\n", err);   /* print the caller's message rather than the literal word "err" */
+    exit (-1);
+    return -1;   /* not reached; keeps the int return type satisfied */
+}
+
+
+void ConverseRunPE(int everReturn) {   // per-PE start-up: initialises Converse state and, unless everReturn is set, runs the start function and scheduler
+    //printf ("ConverseRunPE on rank %d\n", CmiMyPe());
+
+    CmiIdleState *s=CmiNotifyGetState();   // NOTE(review): s is not used or freed anywhere in this function
+    CmiState cs;
+    char** CmiMyArgv;
+    CmiNodeAllBarrier();
+
+    cs = CmiGetState();
+    CpvInitialize(void *,CmiLocalQueue);
+    CpvAccess(CmiLocalQueue) = cs->localqueue;
+
+    if (CmiMyRank())   // ranks other than 0 work on their own copy of the argument vector
+        CmiMyArgv=CmiCopyArgs(Cmi_argvcopy);
+    else
+        CmiMyArgv=Cmi_argv;
+
+    CthInit(CmiMyArgv);
+
+    CpvInitialize(PCQueue, broadcast_q);
+    CpvAccess(broadcast_q) = PCQueueCreate();   // per-PE queue of broadcast copies awaiting forwarding
+
+    //printf ("Before Converse Common Init\n");
+    ConverseCommonInit(CmiMyArgv);
+
+    CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyIdle,NULL);
+
+    CmiBarrier();
+
+    /* Converse initialization finishes, immediate messages can be processed.
+       node barrier previously should take care of the node synchronization */
+    _immediateReady = 1;
+
+    if (!everReturn) {
+      Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
+      if (Cmi_usrsched==0) CsdScheduler(-1);   // run the scheduler here unless the user asked to schedule themselves
+      ConverseExit();
+    }
+}
+
+#if CMK_SMP
+static int inexit = 0;
+
+/* Return 1 when the receive queue of every rank on this node is empty, 0 otherwise */
+static int RecvQueueEmpty() {
+    int rank = 0;
+    while (rank < _Cmi_mynodesize) {
+        if (!PCQueueEmpty(CmiGetStateN(rank)->recv)) return 0;
+        rank++;
+    }
+    return 1;
+}
+
+#endif
+
+
+void ConverseExit(void) {
+
+    while (MSGQLEN() > 0 || ORECVS() > 0) {
+      AdvanceCommunications();
+    }
+    
+    CmiNodeBarrier();
+    ConverseCommonExit();
+
+    if (CmiMyPe() == 0) {
+        printf("End of program\n");
+    }
+
+    CmiNodeBarrier();
+//  CmiNodeAllBarrier ();
+
+    int rank0 = 0;
+    if (CmiMyRank() == 0) {
+        rank0 = 1;
+        //CmiFree(procState);
+       PAMI_Context_destroyv(cmi_pami_contexts, cmi_pami_numcontexts);
+       PAMI_Client_destroy(&cmi_pami_client);
+    }
+
+    CmiNodeBarrier();
+    //  CmiNodeAllBarrier ();
+    //fprintf(stderr, "Before Exit\n");
+#if CMK_SMP
+    if (rank0)
+      exit(1);
+    else
+      pthread_exit(0);
+#else
+    exit(0);
+#endif
+}
+
+/* exit() called on any node would abort the whole program */
+void CmiAbort(const char * message) {
+    CmiError("------------- Processor %d Exiting: Called CmiAbort ------------\n"
+             "{snd:%d,rcv:%d} Reason: %s\n",CmiMyPe(),
+             MSGQLEN(), ORECVS(), message);
+
+    //CmiPrintStackTrace(0);
+    //while (msgQueueLen > 0 || outstanding_recvs > 0) {
+    //  AdvanceCommunications();
+    //}    
+    //CmiBarrier();
+    assert (0);
+}
+
+#if CMK_NODE_QUEUE_AVAILABLE
+char *CmiGetNonLocalNodeQ(void) {   // returns one message from the node queue, or 0 if it is empty or the lock is busy
+    CmiState cs = CmiGetState();
+    char *result = 0;
+    CmiIdleLock_checkMessage(&cs->idle);
+    if (!PCQueueEmpty(CsvAccess(NodeState).NodeRecv)) {   // emptiness is tested before the lock is tried
+        MACHSTATE1(3,"CmiGetNonLocalNodeQ begin %d {", CmiMyPe());
+
+        if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) {   // non-blocking: pop only when CmiTryLock returns 0
+            //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
+            result = (char *) PCQueuePop(CsvAccess(NodeState).NodeRecv);
+            CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+        }
+
+        MACHSTATE1(3,"} CmiGetNonLocalNodeQ end %d ", CmiMyPe());
+    }
+    return result;
+}
+#endif
+
+
+void *CmiGetNonLocal() {   // returns the next message from the calling PE's receive queue, or NULL if none is available
+
+    CmiState cs = CmiGetState();
+
+    void *msg = NULL;
+    CmiIdleLock_checkMessage(&cs->idle);
+    /* although it seems that lock is not needed, I found it crashes very often
+       on mpi-smp without lock */
+
+    /*if(CmiMyRank()==0) printf("Got stuck here on proc[%d] node[%d]\n", CmiMyPe(), CmiMyNode());*/
+
+    if (PCQueueLength(cs->recv)==0)   // nothing queued: give the network one chance to deliver
+      AdvanceCommunications();
+
+    if (PCQueueLength(cs->recv)==0) return NULL;
+
+#if CMK_SMP
+    //CmiLock(procState[cs->rank].recvLock);
+#endif
+
+    msg =  PCQueuePop(cs->recv);   // note: the recvLock calls around this pop are commented out
+
+#if CMK_SMP
+    //CmiUnlock(procState[cs->rank].recvLock);
+#endif
+
+    return msg;
+}
+
+static void CmiSendSelf(char *msg) {   // delivers msg to the calling PE itself, without using the network
+#if CMK_IMMEDIATE_MSG
+    if (CmiIsImmediate(msg)) {   // immediate messages go to the immediate queue instead of CmiLocalQueue
+        /* CmiBecomeNonImmediate(msg); */
+        //printf("In SendSelf, N[%d]P[%d]R[%d] received an imm msg with hdl: %p\n", CmiMyNode(), CmiMyPe(), CmiMyRank(), CmiGetHandler(msg));
+        CmiPushImmediateMsg(msg);
+#if CMK_MULTICORE
+        CmiHandleImmediate();
+#endif
+        return;
+    }
+#endif
+    
+    CdsFifo_Enqueue(CpvAccess(CmiLocalQueue),msg);   // ordinary messages are appended to this PE's local queue
+}
+
+#if CMK_SMP
+static void CmiSendPeer (int rank, int size, char *msg) {   // SMP only: hands msg to another rank on the same node
+#if CMK_BROADCAST_SPANNING_TREE
+    if (CMI_BROADCAST_ROOT(msg) != 0) {   // non-zero root marks a broadcast message
+        char *copymsg;
+        copymsg = (char *)CmiAlloc(size);   // a copy is queued on the target rank's broadcast_q
+        CmiMemcpy(copymsg,msg,size);
+
+        CmiLock(procState[rank].bcastLock);
+        PCQueuePush(CpvAccessOther(broadcast_q, rank), copymsg);
+        CmiUnlock(procState[rank].bcastLock);
+    }
+#endif
+    
+    CmiPushPE (rank, msg);   // the original message goes to the target rank's receive queue
+}
+#endif
+
+
+void CmiGeneralFreeSendN (int node, int rank, int size, char * msg);
+
+
+/* The general free send function: passes msg on to the local queue or to CmiGeneralFreeSendN.
+ * destPE must satisfy 0 <= destPE < CmiNumPes(); a message to the calling PE is queued locally.
+ */
+void  CmiGeneralFreeSend(int destPE, int size, char* msg) {
+
+  if (destPE < 0 || destPE >= CmiNumPes ())   /* was '>': destPE == CmiNumPes() is out of range too, matching the assert below */
+    printf ("Sending to %d\n", destPE);
+
+  CmiAssert (destPE >= 0 && destPE < CmiNumPes());
+
+    CmiState cs = CmiGetState();
+
+    if (destPE==cs->pe) {   /* destination is the calling PE: no network traffic needed */
+        CmiSendSelf(msg);
+        return;
+    }
+
+    CmiGeneralFreeSendN (CmiNodeOf (destPE), CmiRankOf (destPE), size, msg);
+}
+
+void CmiGeneralFreeSendN (int node, int rank, int size, char * msg) {
+    /* Send msg (size bytes) to `rank` on `node`: same-node SMP delivery goes through CmiSendPeer, anything else through PAMI. */
+    //printf ("%d, %d: Sending Message to node %d rank %d \n", CmiMyPe(),
+    //  CmiMyNode(), node, rank);
+
+#if CMK_SMP
+    CMI_DEST_RANK(msg) = rank;
+    //CMI_SET_CHECKSUM(msg, size);
+
+    if (node == CmiMyNode()) {
+        CmiSendPeer (rank, size, msg);
+        return;
+    }
+#endif
+
+    pami_endpoint_t target;
+#if CMK_PAMI_MULTI_CONTEXT
+    size_t dst_context = (rank != SMP_NODEMESSAGE) ? (rank>>2) : 0;
+#else
+    size_t dst_context = 0;
+#endif
+    PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)node, dst_context, &target);
+
+    //fprintf (stderr, "Calling PAMI Send to %d magic %d size %d\n", node, CMI_MAGIC(msg), size);
+    if (size < 128) {
+      /* messages under 128 bytes use PAMI_Send_immediate, so msg is freed right after the call */
+      pami_send_immediate_t parameters;
+      parameters.dispatch        = CMI_PAMI_DISPATCH;
+      parameters.header.iov_base = NULL;
+      parameters.header.iov_len  = 0;
+      parameters.data.iov_base   = msg;
+      parameters.data.iov_len    = size;
+      parameters.dest = target;
+      memset(&parameters.hints, 0, sizeof(parameters.hints));   /* hints were left uninitialised here; zero them as the PAMI_Send path does */
+      pami_context_t my_context = MY_CONTEXT();
+      CmiAssert (my_context != NULL);
+
+#if CMK_SMP
+      PAMI_Context_lock(my_context);
+#endif
+      PAMI_Send_immediate (my_context, &parameters);
+#if CMK_SMP
+      PAMI_Context_unlock(my_context);
+#endif
+      CmiFree(msg);
+    }
+    else {
+      /* larger messages use PAMI_Send; send_done frees msg and decrements the queue length */
+      pami_send_t parameters;
+      parameters.send.dispatch        = CMI_PAMI_DISPATCH;
+      parameters.send.header.iov_base = NULL;
+      parameters.send.header.iov_len  = 0;
+      parameters.send.data.iov_base   = msg;
+      parameters.send.data.iov_len    = size;
+      parameters.events.cookie        = msg;
+      parameters.events.local_fn      = send_done;
+      parameters.events.remote_fn     = NULL;
+      memset(&parameters.send.hints, 0, sizeof(parameters.send.hints));
+      parameters.send.dest = target;
+      pami_context_t my_context = MY_CONTEXT();
+      CmiAssert (my_context != NULL);
+      
+#if CMK_SMP
+      PAMI_Context_lock(my_context);
+#endif
+      INCR_MSGQLEN();
+      PAMI_Send (my_context, &parameters);
+#if CMK_SMP
+      PAMI_Context_unlock(my_context);
+#endif
+    }
+}
+
+/* Send a private copy of msg to destPE; the caller's buffer is only read. */
+void CmiSyncSendFn(int destPE, int size, char *msg) {
+    char *dup = (char *)CmiAlloc(size);
+
+    CmiMemcpy(dup, msg, size);
+    CmiFreeSendFn(destPE, size, dup);
+}
+
+/* Stamp the message header (broadcast root 0, magic, size, checksum) and pass
+   msg on to CmiGeneralFreeSend(), after registering one created message with
+   the cQdState counter. */
+void CmiFreeSendFn(int destPE, int size, char *msg) {
+    CmiMsgHeaderBasic *hdr = (CmiMsgHeaderBasic *)msg;
+
+    CQdCreate(CpvAccess(cQdState), 1);
+
+    /* Header fields are written in the original order, checksum last. */
+    CMI_SET_BROADCAST_ROOT(msg, 0);
+    CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
+    hdr->size = size;
+    CMI_SET_CHECKSUM(msg, size);
+
+    CmiGeneralFreeSend(destPE, size, msg);
+}
+
+/* Like CmiSyncSendFn, except that the broadcast-root field of the header is
+   left exactly as it was copied from the caller's message. */
+void CmiSyncSendFn1(int destPE, int size, char *msg) {
+    char *dup = (char *)CmiAlloc(size);
+    CmiMsgHeaderBasic *hdr = (CmiMsgHeaderBasic *)dup;
+
+    CmiMemcpy(dup, msg, size);
+
+    /* Only magic, size and checksum are (re)written on the copy. */
+    CMI_MAGIC(dup) = CHARM_MAGIC_NUMBER;
+    hdr->size = size;
+    CMI_SET_CHECKSUM(dup, size);
+
+    CmiGeneralFreeSend(destPE, size, dup);
+}
+
+/* send msg to its spanning children in broadcast. G. Zheng */
+void SendSpanningChildren(int size, char *msg) {
+    int startnode = CMI_BROADCAST_ROOT(msg)-1;
+    int myrank = CMI_DEST_RANK(msg);
+    int i;
+
+    //printf ("%d [%d]: In Send Spanning Tree\n",  CmiMyPe(), CmiMyNode());
+
+    CmiAssert(startnode>=0 && startnode<_Cmi_numnodes);
+    int dist = CmiMyNode() - startnode;
+    if (dist < 0) dist+=_Cmi_numnodes;
+    for (i=1; i <= BROADCAST_SPANNING_FACTOR; i++) {
+        int p = BROADCAST_SPANNING_FACTOR*dist + i;
+        if (p > _Cmi_numnodes - 1) break;
+        p += startnode;
+        p = p%_Cmi_numnodes;
+        CmiAssert(p>=0 && p<_Cmi_numnodes && p!= CmiMyNode());
+
+       char *copymsg = (char *)CmiAlloc(size);
+       CmiMemcpy(copymsg, msg, size);
+
+       CMI_MAGIC(copymsg) = CHARM_MAGIC_NUMBER;
+       ((CmiMsgHeaderBasic *)copymsg)->size = size;
+       CMI_SET_CHECKSUM(copymsg, size);
+       
+       CmiGeneralFreeSendN(p,0,size,copymsg);  
+    }    
+
+#if CMK_SMP    
+    //Send data within the nodes
+    for (i =0; i < _Cmi_mynodesize; ++i) {
+      if (i != myrank) {
+       char *copymsg = (char *)CmiAlloc(size);
+       CmiMemcpy(copymsg, msg, size);                  
+       CmiPushPE (i, copymsg);
+      }
+    }
+#endif
+}
+
+/* Broadcast a private copy of msg through CmiFreeBroadcastFn(); the caller's
+   buffer is only read. */
+void CmiSyncBroadcastFn(int size, char *msg) {
+    char *dup = (char *)CmiAlloc(size);
+
+    CmiMemcpy(dup, msg, size);
+    CmiFreeBroadcastFn(size, dup);
+}
+
+/* Broadcast msg to the other PEs and then free it.  With
+   CMK_BROADCAST_SPANNING_TREE the header is stamped with this node as root
+   (stored as node+1) and the calling rank, and the fan-out is delegated to
+   SendSpanningChildren(); otherwise a copy is sent to every PE but this one. */
+void CmiFreeBroadcastFn(int size, char *msg) {
+    CmiState cs = CmiGetState();
+#if CMK_BROADCAST_SPANNING_TREE
+    CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
+
+    CMI_SET_BROADCAST_ROOT(msg, CmiMyNode()+1);
+    CMI_DEST_RANK(msg) = CmiMyRank();
+    SendSpanningChildren(size, msg);
+#else
+    int dest;
+
+    /* PEs above this one first, then the ones below it. */
+    for (dest = cs->pe+1; dest < _Cmi_numpes; dest++)
+        CmiSyncSendFn(dest, size, msg);
+
+    for (dest = 0; dest < cs->pe; dest++)
+        CmiSyncSendFn(dest, size, msg);
+#endif
+
+    CmiFree(msg);
+}
+
+/* Broadcast a private copy of msg through CmiFreeBroadcastAllFn(); the
+   caller's buffer is only read. */
+void CmiSyncBroadcastAllFn(int size, char *msg) {
+    char *dup = (char *)CmiAlloc(size);
+
+    CmiMemcpy(dup, msg, size);
+    CmiFreeBroadcastAllFn(size, dup);
+}
+
+/* Broadcast msg to every PE, the calling one included, and then free it. */
+void CmiFreeBroadcastAllFn(int size, char *msg) {
+    CmiState cs = CmiGetState();
+#if CMK_BROADCAST_SPANNING_TREE
+    /* Deliver to this PE first (as a copy), before the header is re-stamped
+       for the spanning-tree fan-out. */
+    CmiSyncSendFn(cs->pe, size, msg);
+
+    CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
+
+    CMI_SET_BROADCAST_ROOT(msg, CmiMyNode()+1);
+    CMI_DEST_RANK(msg) = CmiMyRank();
+    SendSpanningChildren(size, msg);
+#else
+    int pe = 0;
+
+    while (pe < _Cmi_numpes) {
+        CmiSyncSendFn(pe, size, msg);
+        pe++;
+    }
+    //SendMsgsUntil (0);
+#endif
+
+    CmiFree(msg);
+}
+
+/* Advance this thread's PAMI context once, then flush pending broadcast
+   messages (and, where enabled, node-level broadcasts and immediate
+   messages). */
+void AdvanceCommunications() {
+    pami_context_t ctx = MY_CONTEXT();
+
+#if CMK_SMP
+    CmiAssert (ctx != NULL);
+    PAMI_Context_trylock_advancev(&ctx, 1, 1);
+#else
+    PAMI_Context_advance(ctx, 1);
+#endif
+
+    sendBroadcastMessages();
+#if CMK_NODE_QUEUE_AVAILABLE
+    sendBroadcastMessagesNode();
+#endif
+
+#if CMK_IMMEDIATE_MSG && CMK_MULTICORE
+    CmiHandleImmediate();
+#endif
+}
+
+#if 0
+/* Disabled: keep advancing this thread's PAMI context until MSGQLEN() is no
+   larger than targetm. */
+static void SendMsgsUntil(int targetm) {
+    pami_context_t ctx = MY_CONTEXT();
+
+    for (; MSGQLEN() > targetm; ) {
+#if CMK_SMP
+        PAMI_Context_trylock_advancev(&ctx, 1, 1);
+#else
+        PAMI_Context_advance(ctx, 1);
+#endif
+    }
+}
+#endif
+
+/* Idle hook: drive one round of communication progress. */
+void CmiNotifyIdle() {
+    AdvanceCommunications();
+}
+
+
+/*==========================================================*/
+/*==========================================================*/
+/*==========================================================*/
+
+/************ Recommended routines ***********************/
+/************ You don't have to implement these, but they are part of
+ the Converse interface, and the rare program that calls them may crash.
+ Most programs don't need them. *************/
+
+/* Asynchronous point-to-point send: not provided by this layer, aborts. */
+CmiCommHandle CmiAsyncSendFn(int dest, int size, char *msg)
+{
+    CmiAbort("CmiAsyncSendFn not implemented.");
+    return (CmiCommHandle) 0;
+}
+
+/* Asynchronous broadcast to all other PEs: not provided, aborts. */
+CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg)
+{
+    CmiAbort("CmiAsyncBroadcastFn not implemented.");
+    return (CmiCommHandle) 0;
+}
+
+/* Asynchronous broadcast to all PEs: not provided, aborts. */
+CmiCommHandle CmiAsyncBroadcastAllFn(int size, char *msg)
+{
+    CmiAbort("CmiAsyncBroadcastAllFn not implemented.");
+    return (CmiCommHandle) 0;
+}
+
+/* Completion query for an asynchronous send handle: not provided, aborts. */
+int           CmiAsyncMsgSent(CmiCommHandle handle)
+{
+    CmiAbort("CmiAsyncMsgSent not implemented.");
+    return 0;
+}
+
+/* Release of an asynchronous send handle: not provided, aborts. */
+void          CmiReleaseCommHandle(CmiCommHandle handle)
+{
+    CmiAbort("CmiReleaseCommHandle not implemented.");
+}
+
+
+/*==========================================================*/
+/*==========================================================*/
+/*==========================================================*/
+
+/* Optional routines which could use common code which is shared with
+   other machine layer implementations. */
+
+/* MULTICAST/VECTOR SENDING FUNCTIONS
+
+ * Depending on some flags, other delivery functions may be needed.
+ */
+
+#if ! CMK_MULTICAST_LIST_USE_COMMON_CODE
+
+/* Deliver msg to every PE in pes[0..npes-1] without taking ownership of the
+   caller's buffer: a private copy is made and that copy is what gets sent
+   and released. */
+void CmiSyncListSendFn(int npes, int *pes, int size, char *msg) {
+    char *copymsg;
+    copymsg = (char *)CmiAlloc(size);
+    CmiMemcpy(copymsg,msg,size);
+    /* Pass the copy, not msg: the freeing variant releases whatever it is
+       given, so passing msg would free the caller's buffer and leak copymsg. */
+    CmiFreeListSendFn(npes, pes, size, copymsg);
+}
+
+//#define OPTIMIZED_MULTICAST  0
+
+void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
+
+    CMI_SET_BROADCAST_ROOT(msg,0);
+    CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
+    ((CmiMsgHeaderBasic *)msg)->size = size;
+    CMI_SET_CHECKSUM(msg, size);
+
+    //if(CmiMyRank()==CmiMyNodeSize()) printf("CmiFreeListSendFn on comm thd on node %d\n", CmiMyNode());
+
+    //printf("%d: In Free List Send Fn\n", CmiMyPe());
+    int new_npes = 0;
+
+    int i, count = 0, my_loc = -1;
+    for (i=0; i<npes; i++) {
+        if (CmiNodeOf(pes[i]) == CmiMyNode()) 
+            CmiSyncSend(pes[i], size, msg);
+    }
+
+    for (i=0;i<npes;i++) {
+        if (CmiNodeOf(pes[i]) == CmiMyNode());
+        else if (i < npes - 1) {
+#if !CMK_SMP 
+            CmiReference(msg);
+            CmiGeneralFreeSend(pes[i], size, msg);
+#else
+            CmiSyncSend(pes[i], size, msg);
+#endif
+        }
+    }
+
+    if (npes  && CmiNodeOf(pes[npes-1]) != CmiMyNode())
+      CmiSyncSendAndFree(pes[npes-1], size, msg); //Sameto CmiFreeSendFn
+    else
+      CmiFree(msg);    
+}
+
+/* Asynchronous list send: not provided by this layer, aborts. */
+CmiCommHandle CmiAsyncListSendFn(int npes, int *pes, int size, char *msg)
+{
+    CmiAbort("CmiAsyncListSendFn not implemented.");
+    return (CmiCommHandle) 0;
+}
+#endif
+
+/** NODE SENDING FUNCTIONS
+
+ * If there is a node queue, and nodes are also treated as entities (typically
+ * in SMP versions), these functions are needed.
+ */
+
+#if CMK_NODE_QUEUE_AVAILABLE
+
+void          CmiSyncNodeSendFn(int, int, char *);
+CmiCommHandle CmiAsyncNodeSendFn(int, int, char *);
+void          CmiFreeNodeSendFn(int, int, char *);
+
+void          CmiSyncNodeBroadcastFn(int, char *);
+CmiCommHandle CmiAsyncNodeBroadcastFn(int, char *);
+void          CmiFreeNodeBroadcastFn(int, char *);
+
+void          CmiSyncNodeBroadcastAllFn(int, char *);
+CmiCommHandle CmiAsyncNodeBroadcastAllFn(int, char *);
+void          CmiFreeNodeBroadcastAllFn(int, char *);
+
+#endif
+
+
+#if CMK_SHARED_VARS_POSIX_THREADS_SMP
+
+int CmiMyPe();
+int CmiMyRank();
+int CmiNodeFirst(int node);
+int CmiNodeSize(int node);
+int CmiNodeOf(int pe);
+int CmiRankOf(int pe);
+
+/* PE number recorded in the calling thread's CmiState. */
+int CmiMyPe(void) {
+    CmiState st = CmiGetState();
+    return st->pe;
+}
+
+/* Rank recorded in the calling thread's CmiState. */
+int CmiMyRank(void) {
+    CmiState st = CmiGetState();
+    return st->rank;
+}
+
+/* First PE of a node: every node is taken to hold _Cmi_mynodesize PEs. */
+int CmiNodeFirst(int node) {
+    return _Cmi_mynodesize * node;
+}
+
+/* Number of PEs on a node; the same value is returned for every node. */
+int CmiNodeSize(int node) {
+    return _Cmi_mynodesize;
+}
+
+/* Node that holds the given PE. */
+int CmiNodeOf(int pe) {
+    return pe / _Cmi_mynodesize;
+}
+
+/* Rank of the given PE within its node. */
+int CmiRankOf(int pe) {
+    return pe % _Cmi_mynodesize;
+}
+
+
+/* Optional: these functions are implemented in "machine-smp.c", so including
+   that file avoids the need to reimplement them.
+ */
+void CmiNodeBarrier(void);
+void CmiNodeAllBarrier(void);
+CmiNodeLock CmiCreateLock();
+void CmiDestroyLock(CmiNodeLock lock);
+
+#endif
+
+/** IMMEDIATE MESSAGES
+
+ * If immediate messages are supported, the following function is needed. There
+ * is an exception if machine progress is also defined (see later for this).
+
+ * Moreover, the file "immediate.c" should be included; otherwise all of its
+ * functions and variables have to be redefined.
+*/
+
+#if CMK_CCS_AVAILABLE
+
+#include "immediate.c"
+
+#if ! CMK_MACHINE_PROGRESS_DEFINED /* Hack for some machines */
+void CmiProbeImmediateMsg();
+#endif
+
+#endif
+
+
+/* Dummy implementation: node barrier, then a network barrier issued by rank 0
+   only, then a second node barrier.  Always returns 0. */
+extern int CmiBarrier() {
+  CmiNodeBarrier();
+
+  if (!CmiMyRank()) {
+    CmiNetworkBarrier();
+  }
+
+  CmiNodeBarrier();
+  return 0;
+}
+
+/* Issue the pami_barrier collective on context 0 and keep advancing that
+   context until pami_barrier_flag is cleared.  If the collective cannot be
+   issued, an error is printed and the function returns without waiting. */
+static void CmiNetworkBarrier() {
+    //mysleep(1000000000UL);
+
+    pami_result_t rc;
+    pami_barrier_flag = 1;
+    pami_context_t ctx = cmi_pami_contexts[0];
+
+    /* Post the collective (under the context lock in SMP builds). */
+#if CMK_SMP
+    PAMI_Context_lock(ctx);
+#endif
+    rc = PAMI_Collective(ctx, &pami_barrier);
+#if CMK_SMP
+    PAMI_Context_unlock(ctx);
+#endif
+
+    if (rc != PAMI_SUCCESS) {
+      fprintf (stderr, "Error. Unable to issue  collective. result = %d\n", rc);
+      return;
+    }
+
+    /* Poll until the flag set above has been cleared. */
+#if CMK_SMP
+    PAMI_Context_lock(ctx);
+#endif
+    while (pami_barrier_flag) {
+      rc = PAMI_Context_advance (ctx, 100);
+    }
+#if CMK_SMP
+    PAMI_Context_unlock(ctx);
+#endif
+}
+
+#if CMK_NODE_QUEUE_AVAILABLE
+/* Deliver a node-level message to the calling node: immediate messages (when
+   supported) go to the immediate queue, everything else is pushed onto the
+   node receive queue under CmiNodeRecvLock. */
+static void CmiSendNodeSelf(char *msg)
+{
+#if CMK_IMMEDIATE_MSG
+    if (CmiIsImmediate(msg)) {
+        CmiPushImmediateMsg(msg);
+#if CMK_MULTICORE
+        CmiHandleImmediate();
+#endif
+        return;
+    }
+#endif
+
+    CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
+    PCQueuePush(CsvAccess(NodeState).NodeRecv, msg);
+    CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
+}
+
+/* Asynchronous node send: not supported by this layer, aborts. */
+CmiCommHandle CmiAsyncNodeSendFn(int dstNode, int size, char *msg) {
+    CmiAbort ("Async Node Send not supported\n");
+    /* Give this non-void function a return value on every path, as the other
+       unimplemented async stubs in this file do. */
+    return (CmiCommHandle) 0;
+}
+
+/* Stamp the header of msg and deliver it as a node-level message: locally via
+   CmiSendNodeSelf() when the target is this node, otherwise over the network
+   with the SMP_NODEMESSAGE rank. */
+void CmiFreeNodeSendFn(int node, int size, char *msg) {
+    CmiMsgHeaderBasic *hdr = (CmiMsgHeaderBasic *)msg;
+
+    CMI_SET_BROADCAST_ROOT(msg, 0);
+    CMI_MAGIC(msg) = CHARM_MAGIC_NUMBER;
+    hdr->size = size;
+    CMI_SET_CHECKSUM(msg, size);
+
+    CQdCreate(CpvAccess(cQdState), 1);
+
+    if (node != _Cmi_mynode) {
+        CmiGeneralFreeSendN(node, SMP_NODEMESSAGE, size, msg);
+        return;
+    }
+    CmiSendNodeSelf(msg);
+}
+
+/* Send a private copy of m (s bytes) to node p; the caller's buffer is only
+   read. */
+void CmiSyncNodeSendFn(int p, int s, char *m) {
+    char *copy = (char *)CmiAlloc(s);
+
+    CmiMemcpy(copy, m, s);
+    CmiFreeNodeSendFn(p, s, copy);
+}
+
+/* Asynchronous node broadcast placeholder: sends nothing and returns NULL. */
+CmiCommHandle CmiAsyncNodeBroadcastFn(int s, char *m)
+{
+    return NULL;
+}
+
+/* Forward a node-level broadcast to this node's children in the spanning tree
+   whose root node is recorded in the header (stored as -(root+1)).  Each
+   child receives its own copy; msg itself is not freed here. */
+void SendSpanningChildrenNode(int size, char *msg) {
+    int root = -CMI_BROADCAST_ROOT(msg)-1;
+    assert(root>=0 && root<CmiNumNodes());
+
+    /* Offset of this node from the root, wrapped to be non-negative. */
+    int offset = CmiMyNode()-root;
+    if (offset<0) offset += CmiNumNodes();
+
+    int k;
+    for (k = 1; k <= BROADCAST_SPANNING_FACTOR; k++) {
+        int child = BROADCAST_SPANNING_FACTOR*offset + k;
+        if (child >= CmiNumNodes()) break;
+        child = (child + root) % CmiNumNodes();
+        assert(child>=0 && child<CmiNumNodes() && child!=CmiMyNode());
+
+        char *copy = (char *)CmiAlloc(size);
+        CmiMemcpy(copy, msg, size);
+        CmiGeneralFreeSendN(child, SMP_NODEMESSAGE, size, copy);
+    }
+}
+
+/* need */
+/* Node-level broadcast of m (s bytes) to every node but this one; m is freed
+   before returning. */
+void CmiFreeNodeBroadcastFn(int s, char *m) {
+#if CMK_BROADCAST_SPANNING_TREE
+    CQdCreate(CpvAccess(cQdState), CmiNumNodes()-1);
+
+    /* Node broadcasts record the root as -(node+1). */
+    int rootnode = CmiMyNode();
+    CMI_SET_BROADCAST_ROOT(m, -rootnode-1);
+    CMI_MAGIC(m) = CHARM_MAGIC_NUMBER;
+    ((CmiMsgHeaderBasic *)m)->size = s;
+    CMI_SET_CHECKSUM(m, s);
+
+    SendSpanningChildrenNode(s, m);
+#else
+    int node;
+    for (node = 0; node < CmiNumNodes(); node++) {
+        if (node != CmiMyNode()) {
+            char *copy = (char *)CmiAlloc(s);
+            CmiMemcpy(copy, m, s);
+            CmiFreeNodeSendFn(node, s, copy);
+        }
+    }
+#endif
+    CmiFree(m);
+}
+
+/* Node-broadcast a private copy of m through CmiFreeNodeBroadcastFn(); the
+   caller's buffer is only read. */
+void CmiSyncNodeBroadcastFn(int s, char *m) {
+    char *copy = (char *)CmiAlloc(s);
+
+    CmiMemcpy(copy, m, s);
+    CmiFreeNodeBroadcastFn(s, copy);
+}
+
+/* need */
+/* Node-level broadcast to every node, this one included: a stamped copy is
+   queued locally, then m is handed to CmiFreeNodeBroadcastFn(). */
+void CmiFreeNodeBroadcastAllFn(int s, char *m) {
+    char *self = (char *)CmiAlloc(s);
+    CmiMsgHeaderBasic *hdr = (CmiMsgHeaderBasic *)self;
+
+    CmiMemcpy(self, m, s);
+    CMI_MAGIC(self) = CHARM_MAGIC_NUMBER;
+    hdr->size = s;
+    CMI_SET_CHECKSUM(self, s);
+
+    CQdCreate(CpvAccess(cQdState), 1);
+    CmiSendNodeSelf(self);
+
+    CmiFreeNodeBroadcastFn(s, m);
+}
+
+/* Node-broadcast a private copy of m through CmiFreeNodeBroadcastAllFn(); the
+   caller's buffer is only read. */
+void CmiSyncNodeBroadcastAllFn(int s, char *m) {
+    char *copy = (char *)CmiAlloc(s);
+
+    CmiMemcpy(copy, m, s);
+    CmiFreeNodeBroadcastAllFn(s, copy);
+}
+
+
+/* Asynchronous node broadcast-all placeholder: sends nothing, returns NULL. */
+CmiCommHandle CmiAsyncNodeBroadcastAllFn(int s, char *m)
+{
+    return NULL;
+}
+#endif //end of CMK_NODE_QUEUE_AVAILABLE
+
+
+//void bzero (void *__s, size_t __n) {
+//  memset(__s, 0, __n);
+//}
+
index 4db1a71ab7326ad32a80990b5b3b88cebfd726ca..4b8f8defca7c438eda13f4f605b4e2f9d55aeab2 100644 (file)
@@ -4,6 +4,7 @@
 #define CMK_AMD64                                          1
 #define CMK_64BIT                                          1
 
+#undef CMK_CCS_AVAILABLE
 #define CMK_CCS_AVAILABLE                                  0
 
 #define CMK_CMIDELIVERS_USE_COMMON_CODE                    1
index 0f965a7fd0724b833257ff678ede1591323cf6a2..0cb6a4d7c7747f19d63965bb90686060dfa0f1bc 100644 (file)
@@ -1,3 +1,9 @@
+/**
+ *        functions for broadcast
+**/
+
+CmiCommHandle CmiSendNetworkFunc(int destNode, int size, char *msg, int mode);
+
 static void handleOneBcastMsg(int size, char *msg) {
     CmiAssert(CMI_BROADCAST_ROOT(msg)!=0);
 #if CMK_OFFLOAD_BCAST_PROCESS
@@ -106,10 +112,10 @@ static void SendSpanningChildren(int size, char *msg, int rankToAssign, int star
         CmiAssert(nd>=0 && nd!=CmiMyNode());
 #if CMK_BROADCAST_USE_CMIREFERENCE
         CmiReference(msg);
-        LrtsSendNetworkFunc(nd, size, msg, BCAST_SYNC);
+        CmiSendNetworkFunc(nd, size, msg, BCAST_SYNC);
 #else
         newmsg = CopyMsg(msg, size);
-        LrtsSendNetworkFunc(nd, size, newmsg, BCAST_SYNC);
+        CmiSendNetworkFunc(nd, size, newmsg, BCAST_SYNC);
 #endif
     }
     CMI_DEST_RANK(msg) = oldRank;
@@ -154,10 +160,10 @@ static void SendHyperCube(int size,  char *msg, int rankToAssign, int startNode)
         CmiAssert(nd>=0 && nd!=CmiMyNode());
 #if CMK_BROADCAST_USE_CMIREFERENCE
         CmiReference(msg);
-        LrtsSendNetworkFunc(nd, size, msg, BCAST_SYNC);
+        CmiSendNetworkFunc(nd, size, msg, BCAST_SYNC);
 #else
         char *newmsg = CopyMsg(msg, size);
-        LrtsSendNetworkFunc(nd, size, newmsg, BCAST_SYNC);
+        CmiSendNetworkFunc(nd, size, newmsg, BCAST_SYNC);
 #endif
     }
     CMI_DEST_RANK(msg) = oldRank;
@@ -354,3 +360,22 @@ void CmiFreeNodeBroadcastAllFn(int size, char *msg) {
 }
 #endif
 /* ##### End of Functions Related with Message Sending OPs ##### */
+
+#if ! CMK_MULTICAST_LIST_USE_COMMON_CODE
+
+/* Thin forwarder: list send that leaves msg with the caller. */
+void CmiSyncListSendFn(int npes, int *pes, int len, char *msg) {
+    LrtsSyncListSendFn(npes, pes, len, msg);
+}
+
+/* Thin forwarder: asynchronous list send; returns the layer's handle. */
+CmiCommHandle CmiAsyncListSendFn(int npes, int *pes, int len, char *msg) {
+    CmiCommHandle handle = LrtsAsyncListSendFn(npes, pes, len, msg);
+    return handle;
+}
+
+/* Thin forwarder: list send through the machine layer's freeing variant. */
+void CmiFreeListSendFn(int npes, int *pes, int len, char *msg) {
+    LrtsFreeListSendFn(npes, pes, len, msg);
+}
+
+#endif
index d7356279c2d78fa1e195a7059d6ebb9dd8ed2f04..a950e5ea1d01abc5dba0e703c928ceeb66fc0fdf 100644 (file)
@@ -151,7 +151,11 @@ CpvDeclare(void*, CmiLocalQueue);
 
 enum MACHINE_SMP_MODE {
     INVALID_MODE,
+#if CMK_BLUEGENEQ
+    COMM_THREAD_SEND_RECV = 1,
+#else 
     COMM_THREAD_SEND_RECV = 0,
+#endif
     COMM_THREAD_ONLY_RECV, /* work threads will do the send */
     COMM_WORK_THREADS_SEND_RECV, /* work and comm threads do the both send/recv */
     COMM_THREAD_NOT_EXIST /* work threads will do both send and recv */
@@ -374,7 +378,7 @@ void CmiPushPE(int rank,void *msg) {
     }
 #endif
 
-    PCQueuePush(cs->recv,msg);
+    PCQueuePush(cs->recv,(char*)msg);
 
 #if CMK_SHARED_VARS_POSIX_THREADS_SMP
   if (_Cmi_noprocforcommthread)
@@ -473,7 +477,7 @@ void CmiSyncSendFn(int destPE, int size, char *msg) {
 #include "machine-xpmem.c"
 #endif
 
-int refcount = 0;
+static int refcount = 0;
 
 #if CMK_USE_OOB
 CpvExtern(int, _urgentSend);
@@ -483,7 +487,7 @@ CpvExtern(int, _urgentSend);
 #if CMK_C_INLINE
 inline 
 #endif
-CmiCommHandle LrtsSendNetworkFunc(int destNode, int size, char *msg, int mode)
+CmiCommHandle CmiSendNetworkFunc(int destNode, int size, char *msg, int mode)
 {
         int rank;
 #if CMK_USE_PXSHM
@@ -500,6 +504,15 @@ CmiCommHandle LrtsSendNetworkFunc(int destNode, int size, char *msg, int mode)
           return 0;
         }
 #endif
+#if CMK_PERSISTENT_COMM
+        if (CpvAccess(phs)) {
+          if (size > PERSIST_MIN_SIZE) {
+            CmiAssert(CpvAccess(curphs) < CpvAccess(phsSize));
+            LrtsSendPersistentMsg(CpvAccess(phs)[CpvAccess(curphs)], destNode, size, msg);
+            return 0;
+          }
+        }
+#endif
 
 #if CMK_WITH_STATS
 if (MSG_STATISTIC)
@@ -521,29 +534,26 @@ void CmiFreeSendFn(int destPE, int size, char *msg) {
     if (CmiMyPe()==destPE) {
         CmiSendSelf(msg);
 #if CMK_PERSISTENT_COMM
-        if (phs) curphs++;
-#endif
-    } else {
-#if CMK_PERSISTENT_COMM
-        if (phs) {
-          if (size > 8192) {
-            CmiAssert(curphs < phsSize);
-            LrtsSendPersistentMsg(phs[curphs++], destPE, size, msg);
-            return;
-          }
-          else
-            curphs++;
-        }
+        if (CpvAccess(phs)) CpvAccess(curphs)++;
 #endif
+    } 
+    else {
         int destNode = CmiNodeOf(destPE);
+        int destRank = CmiRankOf(destPE);
 #if CMK_SMP
         if (CmiMyNode()==destNode) {
-            CmiPushPE(CmiRankOf(destPE), msg);
+            CmiPushPE(destRank, msg);
+#if CMK_PERSISTENT_COMM
+            if (CpvAccess(phs)) CpvAccess(curphs)++;
+#endif
             return;
         }
 #endif
-        CMI_DEST_RANK(msg) = CmiRankOf(destPE);
-        LrtsSendNetworkFunc(destNode, size, msg, P2P_SYNC);
+        CMI_DEST_RANK(msg) = destRank;
+        CmiSendNetworkFunc(destNode, size, msg, P2P_SYNC);
+#if CMK_PERSISTENT_COMM
+        if (CpvAccess(phs)) CpvAccess(curphs)++;
+#endif
     }
 }
 #endif
@@ -563,7 +573,7 @@ if (  MSG_STATISTIC)
         msg_histogram[ret_log]++;
 }
 #endif
-        return LrtsSendFunc(destPE, size, msg, P2P_ASYNC);
+        return CmiSendNetworkFunc(destPE, size, msg, P2P_ASYNC);
     }
 }
 #endif
@@ -603,8 +613,11 @@ if (  MSG_STATISTIC)
     msg_histogram[ret_log]++;
 }
 #endif
-        LrtsSendFunc(destNode, size, msg, P2P_SYNC);
+        CmiSendNetworkFunc(destNode, size, msg, P2P_SYNC);
     }
+#if CMK_PERSISTENT_COMM
+    if (CpvAccess(phs)) CpvAccess(curphs)++;
+#endif
 }
 #endif
 
@@ -622,7 +635,7 @@ if (  MSG_STATISTIC)
         msg_histogram[ret_log]++;
 }
 #endif
-        return LrtsSendFunc(destNode, size, msg, P2P_ASYNC);
+        return CmiSendNetworkFunc(destNode, size, msg, P2P_ASYNC);
     }
 }
 #endif
@@ -790,16 +803,23 @@ static void ConverseRunPE(int everReturn) {
        node barrier previously should take care of the node synchronization */
     _immediateReady = 1;
 
-    /* communication thread */
-    if (CmiMyRank() == CmiMyNodeSize()) {
+    if(CharmLibInterOperate) {
+       /* !!! Not considering SMP mode now */
+       /* TODO: make interoperability working in SMP!!! */
+       Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
+       CsdScheduler(-1);
+    } else {
+      /* communication thread */
+      if (CmiMyRank() == CmiMyNodeSize()) {
         Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
         while (1) CommunicationServerThread(5);
-    } else { /* worker thread */
+      } else { /* worker thread */
         if (!everReturn) {
-            Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
-            if (Cmi_usrsched==0) CsdScheduler(-1);
-            ConverseExit();
+          Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
+          if (Cmi_usrsched==0) CsdScheduler(-1);
+          ConverseExit();
         }
+      }
     }
 }
 /* ##### End of Functions Related with Machine Startup ##### */
@@ -839,7 +859,7 @@ extern void ConverseCommonExit();
 
 static void CommunicationServer(int sleepTime) {
 #if CMK_SMP
-    AdvanceCommunication(0);
+    AdvanceCommunication(1);
 
     if (commThdExit == CmiMyNodeSize()) {
         MACHSTATE(2, "CommunicationServer exiting {");
index bf628b59bb2db68642ead02ce94f6a4831e99401..d3427f6787640cebdd51e6b04420587720097bf4 100644 (file)
@@ -5,7 +5,10 @@ void LrtsPrepareEnvelope(char *msg, int size);
 
 /* The machine-specific send function */
 CmiCommHandle LrtsSendFunc(int destNode, int size, char *msg, int mode);
-CmiCommHandle LrtsSendNetworkFunc(int destNode, int size, char *msg, int mode);
+
+void LrtsSyncListSendFn(int npes, int *pes, int len, char *msg);
+CmiCommHandle LrtsAsyncListSendFn(int npes, int *pes, int len, char *msg);
+void LrtsFreeListSendFn(int npes, int *pes, int len, char *msg);
 
 #if CMK_PERSISTENT_COMM
 void LrtsSendPersistentMsg(PersistentHandle h, int destPE, int size, void *m);
index e698e6fe57010a525fe69604be51c01ce5d5fbe1..d3d33795c18f7536c94a3f898e16385e0d2e5c88 100644 (file)
@@ -239,7 +239,7 @@ void CmiInitPxshm(char **argv){
         SENDQSTARTSIZE = 32 * pxshmContext->nodesize;
 
         if (_Cmi_mynode == 0)
-            CmiPrintf("Charm++> pxshm enabled: %d cores per node, buffer size: %.1fMB\n", pxshmContext->nodesize, SHMBUFLEN/1024.0/1024.0);
+            printf("Charm++> pxshm enabled: %d cores per node, buffer size: %.1fMB\n", pxshmContext->nodesize, SHMBUFLEN/1024.0/1024.0);
 
 #if CMK_CRAYXE
         srand(getpid());
@@ -561,10 +561,13 @@ void setupSharedBuffers(){
                }
        }
 
-        if (CmiBarrier() == 0) {
-            freeSharedBuffers();
-            pxshm_freed = 1;
-        }
+#if CMK_SMP && CMK_CRAYXE
+        if (PMI_Barrier() != GNI_RC_SUCCESS) return;
+#else
+        if (CmiBarrier() != 0) return;
+#endif
+        freeSharedBuffers();
+        pxshm_freed = 1;
 }
 
 void allocBufNameStrings(char ***bufName){
index 75cb278702b41610b4617fd73c412b8603a013cb..1f94e81716ad6176fefa27263fe381f968301921 100644 (file)
@@ -297,6 +297,21 @@ CmiState CmiGetState() {
 #endif
 
 
+#if CMK_HAS_SPINLOCK && CMK_USE_SPINLOCK
+/* Allocate and initialise a spinlock-backed node lock (pshared argument 0). */
+CmiNodeLock CmiCreateLock()
+{
+  CmiNodeLock spin;
+
+  spin = (CmiNodeLock)malloc(sizeof(pthread_spinlock_t));
+  _MEMCHECK(spin);
+  pthread_spin_init(spin, 0);
+  return spin;
+}
+
+/* Destroy a spinlock-backed node lock and release its storage. */
+void CmiDestroyLock(CmiNodeLock lk)
+{
+  void *storage = (void*)lk;
+
+  pthread_spin_destroy(lk);
+  free(storage);
+}
+#else
 CmiNodeLock CmiCreateLock()
 {
   CmiNodeLock lk = (CmiNodeLock)malloc(sizeof(pthread_mutex_t));
@@ -310,6 +325,7 @@ void CmiDestroyLock(CmiNodeLock lk)
   pthread_mutex_destroy(lk);
   free(lk);
 }
+#endif
 
 void CmiYield(void) { sched_yield(); }
 
@@ -476,8 +492,8 @@ static void CmiStartThreads(char **argv)
 
 static void CmiDestoryLocks()
 {
-  pthread_mutex_destroy(comm_mutex);
-  pthread_mutex_destroy(CmiMemLock_lock);
+  CmiDestroyLock(comm_mutex);
+  CmiDestroyLock(CmiMemLock_lock);
   CmiMemLock_lock = 0;
   pthread_mutex_destroy(&barrier_mutex);
 #ifdef CMK_NO_ASM_AVAILABLE
index 298449812c46c89125dc65ac5987e61e4e26e393..790a572dc6f8e8a57ce671f51ec335bd319c1199 100644 (file)
@@ -33,9 +33,7 @@ Heavily modified by Nikhil Jain 11/28/2011
 int cutOffPoints[] = {64,128,256,512,1024,2048,4096, 8192,16384,32768,
                       65536,131072,262144,524288,1048576,2097152,4194304,
                       8388608,16777216,33554432,67108864,134217728,268435456,
-                      536870912};
-
-
+                      536870912,1073741824};
 
 INLINE_KEYWORD int which_pow2(size_t size)
 {
@@ -72,6 +70,12 @@ INLINE_KEYWORD void fillblock(mempool_type *mptr,block_header *block_head,int po
   if(left < cutOffPoints[power]) {
     power--;
   }
+    
+  if(power == cutOffNum) {
+    CmiAbort("Mempool-requested slot is more than what mempool can provide as\
+    one chunk, increase cutOffNum and cutoffPoints in mempool\n");
+  }
+
 #if MEMPOOL_DEBUG
   CmiPrintf("Left is %d, Max power obtained is %d\n",left,power);
 #endif
@@ -307,7 +311,7 @@ void*  mempool_malloc(mempool_type *mptr, int size, int expand)
         CmiPrintf("Mempool-Did not get memory while expanding\n");
         return NULL;
       }
-
+    
       mptr->size += expand_size;
       current = (block_header*)pool; 
       tail->block_next = ((char*)current-(char*)mptr);
index 4aaa9d059d5b35080ea24fe1fe9b5f491ea3fd3d..ce1dc3eb67938c52f2330736db9933168c2364ee 100644 (file)
@@ -18,8 +18,31 @@ typedef CmiInt8   mem_handle_t;
 typedef void * (* mempool_newblockfn)(size_t *size, mem_handle_t *mem_hndl, int expand_flag);
 typedef void (* mempool_freeblock)(void *ptr, mem_handle_t mem_hndl);
 
-#define cutOffNum 24 
-
+#define cutOffNum 25 
+
+// Accessor macros. Every macro argument is parenthesised, and every expansion
+// is a single parenthesised expression, so the macros expand as intended even
+// with compound arguments (e.g. p+1) or inside larger expressions.
+//given x as mptr get
+#define   MEMPOOL_GetBlockHead(x)   ((block_header*)&((x)->block_head))
+//given x as block header, get ...
+#define   MEMPOOL_GetBlockSize(x)    (((block_header*)(x))->size)
+#define   MEMPOOL_GetBlockMemHndl(x) (((block_header*)(x))->mem_hndl)
+#define   MEMPOOL_GetBlockNext(x)    (((block_header*)(x))->block_next)
+//given x as user pointer, get mempool_header/slot_header
+#define   MEMPOOL_GetMempoolHeader(x,align) \
+                                  ((mempool_header*)((char*)(x)-(align)))
+//given x as mempool_header/slot_header, get ...
+#define   MEMPOOL_GetBlockPtr(x)    ((block_header*)((x)->block_ptr))
+#define   MEMPOOL_GetMempoolPtr(x)  ((mempool_type*)(MEMPOOL_GetBlockPtr(x)->mptr))
+#define   MEMPOOL_GetSize(x)      (MEMPOOL_GetBlockPtr(x)->size)
+#define   MEMPOOL_GetMemHndl(x)   (MEMPOOL_GetBlockPtr(x)->mem_hndl)
+#define   MEMPOOL_GetMsgInRecv(x) (MEMPOOL_GetBlockPtr(x)->msgs_in_recv)
+#define   MEMPOOL_GetMsgInSend(x) (MEMPOOL_GetBlockPtr(x)->msgs_in_send)
+#define   MEMPOOL_IncMsgInRecv(x) ((MEMPOOL_GetBlockPtr(x)->msgs_in_recv)++)
+#define   MEMPOOL_DecMsgInRecv(x) ((MEMPOOL_GetBlockPtr(x)->msgs_in_recv)--)
+#define   MEMPOOL_IncMsgInSend(x) ((MEMPOOL_GetBlockPtr(x)->msgs_in_send)++)
+#define   MEMPOOL_DecMsgInSend(x) ((MEMPOOL_GetBlockPtr(x)->msgs_in_send)--)
+#define   MEMPOOL_GetSlotGNext(x)     ((x)->gnext)
+#define   MEMPOOL_GetSlotStatus(x)    ((x)->status)
+//slot size in bytes: the slot's size field indexes cutOffPoints
+#define   MEMPOOL_GetSlotSize(x)      (cutOffPoints[(x)->size])
 struct block_header;
 struct mempool_type;
 
@@ -30,7 +53,9 @@ typedef struct slot_header_
   int                          size,status;  //status is 1 for free, 0 for used
   size_t               gprev,gnext;  //global slot list within a block
   size_t               prev,next;    //link list for freelists slots
+#if ! CMK_64BIT
   size_t                padding;      // fix for 32 bit machines
+#endif
 } slot_header;
 
 typedef struct used_header_
@@ -38,7 +63,9 @@ typedef struct used_header_
   struct block_header  *block_ptr;     // block_header
   int                  size,status;  //status is 1 for free, 0 for used
   size_t               gprev,gnext;  //global slot list within a block
+#if ! CMK_64BIT
   size_t                padding;      // fix for 32 bit machines
+#endif
 } used_header;
 
 typedef used_header mempool_header;
@@ -51,11 +78,13 @@ typedef struct block_header
   size_t              block_prev,block_next;   // offset to next memblock
   size_t              freelists[cutOffNum];
   struct mempool_type  *mptr;               // mempool_type
-  size_t              padding;              // fix for 32 bit machines
 #if CMK_CONVERSE_GEMINI_UGNI
   int                 msgs_in_send;
   int                 msgs_in_recv;
 #endif
+#if ! CMK_64BIT
+  size_t              padding;              // fix for 32 bit machines
+#endif
 } block_header;
 
 // only at beginning of first block of mempool, representing the mempool
index 2464bba7186b672a4b133d2a6ab6a3a8a30bcd3a..42c7a743fb9d884ddfb36a417c0dc05a278a0959 100644 (file)
 
 #include "machine-persistent.h"
 
-#define TABLESIZE  512
-PersistentSendsTable persistentSendsTable[TABLESIZE];
-int persistentSendsTableCount = 0;
-PersistentReceivesTable *persistentReceivesTableHead;
-PersistentReceivesTable *persistentReceivesTableTail;
-int persistentReceivesTableCount = 0;
+CpvDeclare(PersistentSendsTable *, persistentSendsTableHead);
+CpvDeclare(PersistentSendsTable *, persistentSendsTableTail);
+CpvDeclare(int, persistentSendsTableCount);
+CpvDeclare(PersistentReceivesTable *, persistentReceivesTableHead);
+CpvDeclare(PersistentReceivesTable *, persistentReceivesTableTail);
+CpvDeclare(int, persistentReceivesTableCount);
 
 /* Converse message type */
 typedef struct _PersistentRequestMsg {
   char core[CmiMsgHeaderSizeBytes];
   int requestorPE;
   int maxBytes;
-  PersistentHandle sourceHandlerIndex;
+  PersistentHandle sourceHandler;
 } PersistentRequestMsg;
 
 typedef struct _PersistentReqGrantedMsg {
@@ -36,8 +36,8 @@ typedef struct _PersistentReqGrantedMsg {
   void *slotFlagAddress[PERSIST_BUFFERS_NUM];
 */
   PersistentBuf    buf[PERSIST_BUFFERS_NUM];
-  PersistentHandle sourceHandlerIndex;
-  PersistentHandle destHandlerIndex;
+  PersistentHandle sourceHandler;
+  PersistentHandle destHandler;
 } PersistentReqGrantedMsg;
 
 typedef struct _PersistentDestoryMsg {
@@ -50,31 +50,16 @@ int persistentRequestHandlerIdx;
 int persistentReqGrantedHandlerIdx;
 int persistentDestoryHandlerIdx;
 
-PersistentHandle  *phs = NULL;
-int phsSize;
-int curphs = 0;
+CpvDeclare(PersistentHandle *, phs);
+CpvDeclare(int, phsSize);
+CpvDeclare(int, curphs);
 
 /******************************************************************************
      Utilities
 ******************************************************************************/
 
-void initSendSlot(PersistentSendsTable *slot)
-{
-  int i;
-  slot->used = 0;
-  slot->destPE = -1;
-  slot->sizeMax = 0;
-  slot->destHandle = 0; 
-#if 0
-  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
-    slot->destAddress[i] = NULL;
-    slot->destSizeAddress[i] = NULL;
-  }
-#endif
-  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);
-  slot->messageBuf = 0;
-  slot->messageSize = 0;
-}
+extern void initRecvSlot(PersistentReceivesTable *slot);
+extern void initSendSlot(PersistentSendsTable *slot);
 
 void swapSendSlotBuffers(PersistentSendsTable *slot)
 {
@@ -94,20 +79,6 @@ void swapSendSlotBuffers(PersistentSendsTable *slot)
   }
 }
 
-void initRecvSlot(PersistentReceivesTable *slot)
-{
-  int i;
-#if 0
-  for (i=0; i<PERSIST_BUFFERS_NUM; i++) {
-    slot->messagePtr[i] = NULL;
-    slot->recvSizePtr[i] = NULL;
-  }
-#endif
-  memset(&slot->destBuf, 0, sizeof(PersistentBuf)*PERSIST_BUFFERS_NUM);
-  slot->sizeMax = 0;
-  slot->prev = slot->next = NULL;
-}
-
 void swapRecvSlotBuffers(PersistentReceivesTable *slot)
 {
   if (PERSIST_BUFFERS_NUM == 2) {
@@ -128,27 +99,33 @@ void swapRecvSlotBuffers(PersistentReceivesTable *slot)
 
 PersistentHandle getFreeSendSlot()
 {
-  int i;
-  if (persistentSendsTableCount == TABLESIZE) CmiAbort("persistentSendsTable full.\n");
-  persistentSendsTableCount++;
-  for (i=1; i<TABLESIZE; i++)
-    if (persistentSendsTable[i].used == 0) break;
-  return &persistentSendsTable[i];
+  PersistentSendsTable *slot = (PersistentSendsTable *)malloc(sizeof(PersistentSendsTable)); /* new list node, released with free() in CmiDestoryPersistent / CmiDestoryAllPersistent; NOTE(review): malloc result is not checked */
+  initSendSlot(slot); /* NOTE(review): relies on initSendSlot (extern, not shown here) to clear slot->prev/next -- confirm */
+  if (CpvAccess(persistentSendsTableHead) == NULL) { /* empty list: slot becomes both head and tail */
+    CpvAccess(persistentSendsTableHead) = CpvAccess(persistentSendsTableTail) = slot;
+  }
+  else { /* non-empty list: append slot after the current tail */
+    CpvAccess(persistentSendsTableTail)->next = slot;
+    slot->prev = CpvAccess(persistentSendsTableTail);
+    CpvAccess(persistentSendsTableTail) = slot;
+  }
+  CpvAccess(persistentSendsTableCount)++; /* number of slots currently on the send list */
+  return slot; /* the slot pointer itself is handed out as the PersistentHandle */
 }
 
 PersistentHandle getFreeRecvSlot()
 {
-  PersistentReceivesTable *slot = (PersistentReceivesTable *)CmiAlloc(sizeof(PersistentReceivesTable));
+  PersistentReceivesTable *slot = (PersistentReceivesTable *)malloc(sizeof(PersistentReceivesTable)); /* plain malloc: must be released with free() (persistentDestoryHandler, CmiDestoryAllPersistent), not CmiFree; NOTE(review): result is not checked */
   initRecvSlot(slot);
-  if (persistentReceivesTableHead == NULL) {
-    persistentReceivesTableHead = persistentReceivesTableTail = slot;
+  if (CpvAccess(persistentReceivesTableHead) == NULL) { /* empty list: slot becomes both head and tail */
+    CpvAccess(persistentReceivesTableHead) = CpvAccess(persistentReceivesTableTail) = slot;
   }
   else {
-    persistentReceivesTableTail->next = slot;
-    slot->prev = persistentReceivesTableTail;
-    persistentReceivesTableTail = slot;
+    CpvAccess(persistentReceivesTableTail)->next = slot; /* append slot after the current tail */
+    slot->prev = CpvAccess(persistentReceivesTableTail);
+    CpvAccess(persistentReceivesTableTail) = slot;
   }
-  persistentReceivesTableCount++;
+  CpvAccess(persistentReceivesTableCount)++; /* number of slots currently on the receive list */
   return slot;
 }
 
@@ -171,21 +148,27 @@ PersistentHandle getFreeRecvSlot()
 
 PersistentHandle CmiCreatePersistent(int destPE, int maxBytes)
 {
-  PersistentHandle h = getFreeSendSlot();
+  PersistentHandle h;
+  PersistentSendsTable *slot;
 
-  PersistentSendsTable *slot = (PersistentSendsTable *)h;
+  if (CmiMyNode() == CmiNodeOf(destPE)) return NULL;
 
+/*
   if (CmiMyPe() == destPE) {
-    CmiAbort("CmiCreatePersistent Error: setting up persistent communication to the same processor is not allowed.");
+    CmiPrintf("[%d] CmiCreatePersistent Error>  setting up persistent communication to the same processor is not allowed.\n", CmiMyPe());
+    CmiAbort("CmiCreatePersistent");
   }
+*/
+
+  h = getFreeSendSlot();
+  slot = (PersistentSendsTable *)h;
 
-  slot->used = 1;
   slot->destPE = destPE;
   slot->sizeMax = maxBytes;
 
   PersistentRequestMsg *msg = (PersistentRequestMsg *)CmiAlloc(sizeof(PersistentRequestMsg));
   msg->maxBytes = maxBytes;
-  msg->sourceHandlerIndex = h;
+  msg->sourceHandler = h;
   msg->requestorPE = CmiMyPe();
 
   CmiSetHandler(msg, persistentRequestHandlerIdx);
@@ -194,6 +177,15 @@ PersistentHandle CmiCreatePersistent(int destPE, int maxBytes)
   return h;
 }
 
+/* for SMP: persistent channel to some PE of destNode; returns CmiCreatePersistent's handle (NULL if destNode is this node) */
+PersistentHandle CmiCreateNodePersistent(int destNode, int maxBytes)
+{
+    /* any rank on the destination node is fine for setup (the comm thread handles the actual
+       message); use a modulus, since integer rand()/RAND_MAX is 0, or 1 (a PE past the node) */
+  int pe = CmiNodeFirst(destNode) + rand() % CmiMyNodeSize();
+  return CmiCreatePersistent(pe, maxBytes);
+}
+
 static void persistentRequestHandler(void *env)
 {             
   PersistentRequestMsg *msg = (PersistentRequestMsg *)env;
@@ -218,8 +210,8 @@ static void persistentRequestHandler(void *env)
 #endif
   }
 
-  gmsg->sourceHandlerIndex = msg->sourceHandlerIndex;
-  gmsg->destHandlerIndex = h;
+  gmsg->sourceHandler = msg->sourceHandler;
+  gmsg->destHandler = getPersistentHandle(h, 1);
 
   CmiSetHandler(gmsg, persistentReqGrantedHandlerIdx);
   CmiSyncSendAndFree(msg->requestorPE,sizeof(PersistentReqGrantedMsg),gmsg);
@@ -231,11 +223,9 @@ static void persistentReqGrantedHandler(void *env)
 {
   int i;
 
-
   PersistentReqGrantedMsg *msg = (PersistentReqGrantedMsg *)env;
-  PersistentHandle h = msg->sourceHandlerIndex;
+  PersistentHandle h = msg->sourceHandler;
   PersistentSendsTable *slot = (PersistentSendsTable *)h;
-  CmiAssert(slot->used == 1);
 
   /* CmiPrintf("[%d] Persistent handler granted  h:%p\n", CmiMyPe(), h); */
 
@@ -247,10 +237,10 @@ static void persistentReqGrantedHandler(void *env)
     slot->destBuf[i] = msg->buf[i];
 #endif
   }
-  slot->destHandle = msg->destHandlerIndex;
+  slot->destHandle = msg->destHandler;
 
   if (slot->messageBuf) {
-    LrtsSendPersistentMsg(h, slot->destPE, slot->messageSize, slot->messageBuf);
+    LrtsSendPersistentMsg(h, CmiNodeOf(slot->destPE), slot->messageSize, slot->messageBuf);
     slot->messageBuf = NULL;
   }
   CmiFree(msg);
@@ -293,7 +283,6 @@ PersistentHandle CmiRegisterReceivePersistent(PersistentReq recvHand)
   PersistentHandle h = getFreeSendSlot();
 
   PersistentSendsTable *slot = (PersistentSendsTable *)h;
-  slot->used = 1;
   slot->destPE = recvHand.pe;
   slot->sizeMax = recvHand.maxBytes;
 
@@ -318,37 +307,39 @@ void persistentDestoryHandler(void *env)
 {             
   int i;
   PersistentDestoryMsg *msg = (PersistentDestoryMsg *)env;
-  PersistentHandle h = msg->destHandlerIndex;
+  PersistentHandle h = getPersistentHandle(msg->destHandlerIndex, 0);
   CmiAssert(h!=NULL);
   CmiFree(msg);
   PersistentReceivesTable *slot = (PersistentReceivesTable *)h;
 
-  persistentReceivesTableCount --;
+  CpvAccess(persistentReceivesTableCount) --;
   if (slot->prev) {
     slot->prev->next = slot->next;
   }
   else
-   persistentReceivesTableHead = slot->next;
+    CpvAccess(persistentReceivesTableHead) = slot->next;
   if (slot->next) {
     slot->next->prev = slot->prev;
   }
   else
-    persistentReceivesTableTail = slot->prev;
+    CpvAccess(persistentReceivesTableTail) = slot->prev;
 
   for (i=0; i<PERSIST_BUFFERS_NUM; i++) 
     if (slot->destBuf[i].destAddress) /*elan_CmiStaticFree(slot->messagePtr);*/
       PerFree((char*)slot->destBuf[i].destAddress);
 
-  CmiFree(slot);
+  clearRecvSlot(slot);
+
+  free(slot);
 }
 
 /* FIXME: need to buffer until ReqGranted message come back? */
 void CmiDestoryPersistent(PersistentHandle h)
 {
-  if (h == 0) CmiAbort("CmiDestoryPersistent: not a valid PersistentHandle\n");
+  if (h == NULL) return;
 
   PersistentSendsTable *slot = (PersistentSendsTable *)h;
-  CmiAssert(slot->destHandle != 0);
+  /* CmiAssert(slot->destHandle != 0); */
 
   PersistentDestoryMsg *msg = (PersistentDestoryMsg *)
                               CmiAlloc(sizeof(PersistentDestoryMsg));
@@ -358,23 +349,34 @@ void CmiDestoryPersistent(PersistentHandle h)
   CmiSyncSendAndFree(slot->destPE,sizeof(PersistentDestoryMsg),msg);
 
   /* free this slot */
-  initSendSlot(slot);
+  if (slot->prev) {
+    slot->prev->next = slot->next;
+  }
+  else
+    CpvAccess(persistentSendsTableHead) = slot->next;
+  if (slot->next) {
+    slot->next->prev = slot->prev;
+  }
+  else
+    CpvAccess(persistentSendsTableTail) = slot->prev;
+  free(slot);
 
-  persistentSendsTableCount --;
+  CpvAccess(persistentSendsTableCount) --;
 }
 
 
 void CmiDestoryAllPersistent()
 {
-  int i;
-  for (i=0; i<TABLESIZE; i++) {
-    if (persistentSendsTable[i].messageBuf) 
-      CmiPrintf("Warning: CmiDestoryAllPersistent destoried buffered unsend message.\n");
-    initSendSlot(&persistentSendsTable[i]);
+  PersistentSendsTable *sendslot = CpvAccess(persistentSendsTableHead);
+  while (sendslot) {
+    PersistentSendsTable *next = sendslot->next;
+    free(sendslot);
+    sendslot = next;
   }
-  persistentSendsTableCount = 0;
+  CpvAccess(persistentSendsTableHead) = CpvAccess(persistentSendsTableTail) = NULL;
+  CpvAccess(persistentSendsTableCount) = 0;
 
-  PersistentReceivesTable *slot = persistentReceivesTableHead;
+  PersistentReceivesTable *slot = CpvAccess(persistentReceivesTableHead);
   while (slot) {
     PersistentReceivesTable *next = slot->next;
     int i;
@@ -383,16 +385,17 @@ void CmiDestoryAllPersistent()
         CmiPrintf("Warning: CmiDestoryAllPersistent destoried buffered undelivered message.\n");
       if (slot->destBuf[i].destAddress) PerFree((char*)slot->destBuf[i].destAddress);
     }
-    CmiFree(slot);
+    free(slot);
     slot = next;
   }
-  persistentReceivesTableHead = persistentReceivesTableTail = NULL;
-  persistentReceivesTableCount = 0;
+  CpvAccess(persistentReceivesTableHead) = CpvAccess(persistentReceivesTableTail) = NULL;
+  CpvAccess(persistentReceivesTableCount) = 0;
 }
 
 void CmiPersistentInit()
 {
   int i;
+
   persistentRequestHandlerIdx = 
        CmiRegisterHandler((CmiHandler)persistentRequestHandler);
   persistentReqGrantedHandlerIdx = 
@@ -400,30 +403,47 @@ void CmiPersistentInit()
   persistentDestoryHandlerIdx = 
        CmiRegisterHandler((CmiHandler)persistentDestoryHandler);
 
+  CpvInitialize(PersistentHandle*, phs);
+  CpvAccess(phs) = NULL;
+  CpvInitialize(int, phsSize);
+  CpvInitialize(int, curphs);
+  CpvAccess(curphs) = 0;
+
   persist_machine_init();
 
-  for (i=0; i<TABLESIZE; i++) {
-    initSendSlot(&persistentSendsTable[i]);
-  }
-  persistentSendsTableCount = 0;
-  persistentReceivesTableHead = persistentReceivesTableTail = NULL;
-  persistentReceivesTableCount = 0;
+  CpvInitialize(PersistentSendsTable *, persistentSendsTableHead);
+  CpvInitialize(PersistentSendsTable *, persistentSendsTableTail);
+  CpvAccess(persistentSendsTableHead) = CpvAccess(persistentSendsTableTail) = NULL;
+  CpvInitialize(int, persistentSendsTableCount);
+  CpvAccess(persistentSendsTableCount) = 0;
+
+  CpvInitialize(PersistentReceivesTable *, persistentReceivesTableHead);
+  CpvInitialize(PersistentReceivesTable *, persistentReceivesTableTail);
+  CpvAccess(persistentReceivesTableHead) = CpvAccess(persistentReceivesTableTail) = NULL;
+  CpvInitialize(int, persistentReceivesTableCount);
+  CpvAccess(persistentReceivesTableCount) = 0;
 }
 
 
 void CmiUsePersistentHandle(PersistentHandle *p, int n)
 {
   if (n==1 && *p == NULL) { p = NULL; n = 0; }
-#if  CMK_ERROR_CHECKING
+#if  CMK_ERROR_CHECKING && 0 /* check compiled out; NOTE(review): presumably because NULL handles are now legal (CmiCreatePersistent returns NULL for same-node targets) -- confirm */
   {
   int i;
   for (i=0; i<n; i++)
     if (p[i] == NULL) CmiAbort("CmiUsePersistentHandle: invalid PersistentHandle.\n");
   }
 #endif
-  phs = p;
-  phsSize = n;
-  curphs = 0;
+  CpvAccess(phs) = p; /* active handle array; NULL means none is installed (see CmiPersistentOneSend) */
+  CpvAccess(phsSize) = n; /* number of entries in p (0 when p was reset to NULL above) */
+  CpvAccess(curphs) = 0; /* restart the position counter; CmiPersistentOneSend increments it */
+}
+
+void CmiPersistentOneSend() /* advance curphs by one while a handle array is installed */
+{
+  if (CpvAccess(phs)) CpvAccess(curphs)++; /* no-op when phs is NULL, i.e. CmiUsePersistentHandle installed no array */
 }
 
 #endif
index f3ff6d784015f69bc8734a706097e80dfbb5a6a1..05499e48f0176300ec36f16a9cde9ae8b3cd7bc6 100644 (file)
@@ -1006,9 +1006,13 @@ if(CpvAccess(networkProgressCount) >=  p)  \
 #endif
 
 
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
+#if defined(_FAULT_MLOG_) 
 #include "ckmessagelogging.h"
 #endif
+#if defined(_FAULT_CAUSAL_)
+#include "ckcausalmlog.h"
+#endif
+
 #include "ckmemcheckpoint.h"
 #include "readonly.h"
 #include "ckarray.h"
index af82a3d93f53d0afd7cac6d159e936f360b5d1e1..a03fee8a3b2e2afb09756bf4dd89df33bf55bf8e 100644 (file)
@@ -662,6 +662,7 @@ void CkCreateChare(int cIdx, int eIdx, void *msg, CkChareID *pCid, int destPE)
 #endif
   }
   env->setEpIdx(eIdx);
+  env->setByPe(CkMyPe());
   env->setSrcPe(CkMyPe());
   CmiSetHandler(env, _charmHandlerIdx);
   _TRACE_CREATION_1(env);
@@ -921,7 +922,7 @@ static void _processNewVChareMsg(CkCoreState *ck,envelope *env)
   // pCid->magic = _GETIDX(_entryTable[env->getEpIdx()]->chareIdx);
   register envelope *ret = UsrToEnv(pCid);
   ret->setVidPtr(env->getVidPtr());
-  register int srcPe = env->getSrcPe();
+  register int srcPe = env->getByPe();
   ret->setSrcPe(CkMyPe());
   CmiSetHandler(ret, _charmHandlerIdx);
   CmiSyncSendAndFree(srcPe, ret->getTotalsize(), (char *)ret);
@@ -1377,6 +1378,9 @@ void _skipCldEnqueue(int pe,envelope *env, int infoFn)
     CqsEnqueueGeneral((Queue)CpvAccess(CsdSchedQueue),
        env, env->getQueueing(),env->getPriobits(),
        (unsigned int *)env->getPrioPtr());
+#if CMK_PERSISTENT_COMM
+    CmiPersistentOneSend();
+#endif
   } else {
     if (pe < 0 || CmiNodeOf(pe) != CmiMyNode())
       CkPackMessage(&env);
@@ -1660,8 +1664,10 @@ static inline void _sendMsgBranch(int eIdx, void *msg, CkGroupID gID,
 {
   int numPes;
   register envelope *env = _prepareMsgBranch(eIdx,msg,gID,ForBocMsg);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
+#if defined(_FAULT_MLOG_) 
   sendTicketGroupRequest(env,pe,_infoIdx);
+#elif defined(_FAULT_CAUSAL_)
+       sendGroupMsg(env,pe,_infoIdx);
 #else
   _TRACE_ONLY(numPes = (pe==CLD_BROADCAST_ALL?CkNumPes():1));
   _TRACE_CREATION_N(env, numPes);
@@ -1808,8 +1814,10 @@ static inline void _sendMsgNodeBranch(int eIdx, void *msg, CkGroupID gID,
 {
   int numPes;
   register envelope *env = _prepareMsgBranch(eIdx,msg,gID,ForNodeBocMsg);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
+#if defined(_FAULT_MLOG_)
         sendTicketNodeGroupRequest(env,node,_infoIdx);
+#elif defined(_FAULT_CAUSAL_)
+       sendNodeGroupMsg(env,node,_infoIdx);
 #else
   numPes = (node==CLD_BROADCAST_ALL?CkNumNodes():1);
   _TRACE_CREATION_N(env, numPes);
@@ -1987,8 +1995,10 @@ extern "C"
 void CkArrayManagerDeliver(int pe,void *msg, int opts) {
   register envelope *env = UsrToEnv(msg);
   _prepareOutgoingArrayMsg(env,ForArrayEltMsg);
-#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
+#if defined(_FAULT_MLOG_)
    sendTicketArrayRequest(env,pe,_infoIdx);
+#elif defined(_FAULT_CAUSAL_)
+       sendArrayMsg(env,pe,_infoIdx);
 #else
   if (opts & CK_MSG_IMMEDIATE)
     CmiBecomeImmediate(env);
index bebbc898c1426b307ffbdba22418ab8e1a3ce07f..39eb213241badb49ba5d5a53ff145d14eaa1b35e 100644 (file)
@@ -11,6 +11,7 @@
 #include "stats.h"
 #include "ckfutures.h"
 #include "charisma.h"
+#include "TopoManager.h"
 
 #if CMK_ERROR_CHECKING
 #define _CHECK_VALID(p, msg) do {if((p)==0){CkAbort(msg);}} while(0)
index 5b65d126375a48d430fdc120bbaa5a2c4c1ef575..65dd8a1eb64c85b9cdeb1d21e712db1882088d04 100644 (file)
@@ -61,6 +61,7 @@ Orion Sky Lawlor, olawlor@acm.org
 CpvDeclare(int ,serializer);
 
 bool _isAnytimeMigration;
+bool _isStaticInsertion;
 bool _isNotifyChildInRed;
 
 #define ARRAY_DEBUG_OUTPUT 0
@@ -547,7 +548,7 @@ void CkArrayOptions::init()
 {
     locMgr.setZero();
     anytimeMigration = _isAnytimeMigration;
-    staticInsertion = false;
+    staticInsertion = _isStaticInsertion;
     reductionClient.type = CkCallback::invalid;
     disableNotifyChildInRed = !_isNotifyChildInRed;
     broadcastViaScheduler = false;
@@ -752,6 +753,40 @@ CkArray::CkArray(CkArrayOptions &opts,
   //nodeProxy = new CProxy_CkArrayReductionMgr (nodereductionID);
 #endif
 
+#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
+       // creating the spanning tree to be used for broadcast
+       children = (int *) CmiAlloc(sizeof(int) * _MLOG_BCAST_BFACTOR_);
+       numChildren = 0;
+       
+       // computing the level of the tree this pe is in
+       // we should use the geometric series formula, but now a quick and dirty code should suffice
+       // PE 0 is at level 0, PEs 1.._MLOG_BCAST_BFACTOR_ are at level 1 and so on
+       int level = 0;
+       int aux = CmiMyPe();
+       int max = CmiNumPes();
+       int factor = _MLOG_BCAST_BFACTOR_;
+       int startLevel = 0;
+       int startNextLevel = 1;
+       while(aux >= 0){
+               level++;
+               startLevel = startNextLevel;
+               startNextLevel += factor;
+               aux -= factor;
+               factor *= _MLOG_BCAST_BFACTOR_;
+       }
+
+       // adding children to the tree
+       int first = startNextLevel + (CmiMyPe() - startLevel) * _MLOG_BCAST_BFACTOR_;
+       for(int i=0; i<_MLOG_BCAST_BFACTOR_; i++){
+               if(first + i >= CmiNumPes())
+                       break;
+               children[i] = first + i;
+               numChildren++;
+       }
+#endif
+
+
   if (opts.reductionClient.type != CkCallback::invalid && CkMyPe() == 0)
       ckSetReductionClient(&opts.reductionClient);
 }
@@ -1241,8 +1276,19 @@ void CkArray::sendBroadcast(CkMessage *msg)
 {
        CK_MAGICNUMBER_CHECK
        if(CkMyPe() == CpvAccess(serializer)){
+#if _MLOG_BCAST_TREE_
+               // Using the spanning tree to broadcast the message
+               for(int i=0; i<numChildren; i++){
+                       CkMessage *copyMsg = (CkMessage *) CkCopyMsg((void **)&msg);
+                       thisProxy[children[i]].recvBroadcastViaTree(copyMsg);
+               }
+       
+               // delivering message locally
+               recvBroadcast(msg);     
+#else
                //Broadcast the message to all processors
                thisProxy.recvBroadcast(msg);
+#endif
        }else{
                thisProxy[CpvAccess(serializer)].sendBroadcast(msg);
        }
@@ -1257,6 +1303,21 @@ void CkArray::sendExpeditedBroadcast(CkMessage *msg)
 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
 int _tempBroadcastCount=0;
 
+// Spanning-tree broadcast step: forward a copy of msg to each child PE, then deliver msg here
+void CkArray::recvBroadcastViaTree(CkMessage *msg)
+{
+       CK_MAGICNUMBER_CHECK
+
+       // One copy per entry of children[] (numChildren entries, set up in the CkArray constructor)
+       for(int i=0; i<numChildren; i++){
+               CkMessage *copyMsg = (CkMessage *) CkCopyMsg((void **)&msg); // takes &msg, so msg may be updated -- TODO confirm
+               thisProxy[children[i]].recvBroadcastViaTree(copyMsg); // child repeats this step for its own children
+       }
+
+       // Deliver the original locally only after every child's copy has been made
+       recvBroadcast(msg);     
+}
+
 void CkArray::broadcastHomeElements(void *data,CkLocRec *rec,CkArrayIndex *index){
     if(homePe(*index)==CmiMyPe()){
         CkArrayMessage *bcast = (CkArrayMessage *)data;
@@