merging with main branch
author Harshitha <gplkrsh2@illinois.edu>
Sat, 10 Mar 2012 23:25:29 +0000 (17:25 -0600)
committer Harshitha <gplkrsh2@illinois.edu>
Sat, 10 Mar 2012 23:25:29 +0000 (17:25 -0600)
572 files changed:
.gitignore
CHANGES
README
build
doc/.gitignore
doc/charm++/arrays.tex
doc/charm++/order.tex
doc/charm++/pup.tex
doc/converse/code/cth-tutorial/Makefile [new file with mode: 0644]
doc/converse/code/cth-tutorial/pgm.C [new file with mode: 0644]
doc/converse/code/msgs/Makefile [new file with mode: 0644]
doc/converse/code/msgs/interNodeMsg.C [new file with mode: 0644]
doc/converse/code/pingpong.pseudo [new file with mode: 0644]
doc/converse/tutorial.tex [new file with mode: 0644]
doc/install/run.tex
examples/armci/putTest/Makefile
examples/armci/putTest/put.c
examples/bigsim/charm/jacobi2d/jacobi2d.C
examples/bigsim/tools/Makefile
examples/charm++/Molecular2D/Compute.C
examples/charm++/Molecular2D/Compute.h
examples/charm++/Molecular2D/Patch.C
examples/charm++/Molecular2D/Patch.h
examples/charm++/Molecular2D/common.h
examples/charm++/ckdirect/matmul3d/matmul3d.C
examples/charm++/ckdirect/matmul3d/matmul3d.h
examples/charm++/cuda/hello/Makefile
examples/charm++/cuda/hello/helloCUDA.cu
examples/charm++/cuda/overlapTestGPUManager/Makefile [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile with 65% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.C [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C with 92% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.ci [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.ci with 100% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.cu [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu with 77% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTest.h [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.h with 100% similarity]
examples/charm++/cuda/overlapTestGPUManager/overlapTestConsts.h [moved from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h with 89% similarity]
examples/charm++/cuda/overlapTestStream/Makefile
examples/charm++/cuda/overlapTestStream/overlapTest.C
examples/charm++/cuda/overlapTestStream/overlapTest.cu
examples/charm++/cuda/overlapTestStream/overlapTestConsts.h
examples/charm++/gaussSeidel3D/gaussSeidel3d.C
examples/charm++/jacobi1d/jacobi1d.C
examples/charm++/load_balancing/kNeighbor/kNeighbor.C
examples/charm++/load_balancing/stencil3d/Makefile
examples/charm++/pupDisk/Makefile [new file with mode: 0644]
examples/charm++/pupDisk/README [new file with mode: 0644]
examples/charm++/pupDisk/pupDisk.C [new file with mode: 0644]
examples/charm++/pupDisk/pupDisk.ci [new file with mode: 0644]
examples/charm++/pupDisk/pupDisk.h [new file with mode: 0644]
examples/charm++/pupDisk/someData.h [new file with mode: 0644]
examples/charm++/ring/ring.C
examples/charm++/topology/jacobi2d/jacobi2d.C
examples/charm++/topology/jacobi3d/jacobi3d.C
examples/charm++/topology/matmul3d/matmul3d.C
examples/charm++/topology/matmul3d/matmul3d.h
examples/charm++/typed_reduction/Makefile
examples/charm++/typed_reduction/TypedReduction.cc
examples/converse/pingpong/pingpong.C
examples/multiphaseSharedArrays/Makefile_common
examples/multiphaseSharedArrays/histogram/Makefile [new file with mode: 0644]
examples/multiphaseSharedArrays/histogram/headers [new file with mode: 0644]
examples/multiphaseSharedArrays/histogram/histogram [new file with mode: 0755]
examples/multiphaseSharedArrays/histogram/histogram.C [new file with mode: 0644]
examples/multiphaseSharedArrays/histogram/histogram.ci [new file with mode: 0644]
examples/multiphaseSharedArrays/histogram/run.sh [new file with mode: 0755]
examples/pose/LBSim/edgelist.c
examples/pose/LBSim/generate.c
examples/pose/LBSim/topology.C
examples/pose/LBSim/topology.h
examples/pose/LBSim/typedefs.h
src/QuickThreads/copyright.h
src/QuickThreads/md/axp.h
src/QuickThreads/md/hppa.h
src/QuickThreads/md/i386.h
src/QuickThreads/md/ksr1.h
src/QuickThreads/md/m88k.c
src/QuickThreads/md/m88k.h
src/QuickThreads/md/mips.h
src/QuickThreads/md/mipspro.h
src/QuickThreads/md/setjmp.c
src/QuickThreads/md/setjmp64.c
src/QuickThreads/md/setjmp64_.c
src/QuickThreads/md/setjmp_b.c
src/QuickThreads/md/setjmp_d.h
src/QuickThreads/md/setjmp_m.c
src/QuickThreads/md/setjmp_u.h
src/QuickThreads/md/sjalloca.c
src/QuickThreads/md/sparc.h
src/QuickThreads/md/stub.c
src/QuickThreads/md/stub.h
src/QuickThreads/md/stub_b.c
src/QuickThreads/md/t3e.h
src/QuickThreads/md/vax.h
src/QuickThreads/meas.c
src/QuickThreads/qt.c
src/QuickThreads/qt.h
src/QuickThreads/qtb.h
src/QuickThreads/stp.c
src/QuickThreads/stp.h
src/arch/bluegenel/bglmachine.C
src/arch/bluegenel/conv-mach.h
src/arch/bluegenel/machine.c
src/arch/bluegenep/Makefile.machine
src/arch/bluegenep/cc-xlc.sh
src/arch/bluegenep/charmrun
src/arch/bluegenep/conv-mach.h
src/arch/bluegenep/machine.c
src/arch/common/conv-mach-gfortran.sh
src/arch/cuda/hybridAPI/Makefile
src/arch/cuda/hybridAPI/cuda-hybrid-api.cu
src/arch/elan-linux-ia64/conv-mach.h
src/arch/elan-linux/conv-mach.h
src/arch/gemini_gni-crayxe/conv-mach-pxshm.h
src/arch/gemini_gni-crayxe/conv-mach-smp.h
src/arch/gemini_gni-crayxe/conv-mach-xpmem.h [new file with mode: 0644]
src/arch/gemini_gni-crayxe/conv-mach-xpmem.sh [new file with mode: 0644]
src/arch/gemini_gni-crayxe/conv-mach.h
src/arch/gemini_gni/Makefile.machine
src/arch/gemini_gni/charmrun
src/arch/gemini_gni/conv-common.h
src/arch/gemini_gni/conv-mach-syncft.h [new file with mode: 0644]
src/arch/gemini_gni/conv-mach-syncft.sh [new file with mode: 0644]
src/arch/gemini_gni/cray_tlbhack.c [new file with mode: 0644]
src/arch/gemini_gni/machine-cmidirect.c [new file with mode: 0644]
src/arch/gemini_gni/machine.c
src/arch/lapi/conv-mach-smp.h
src/arch/lapi/conv-mach.h
src/arch/lapi/machine.c
src/arch/mpi-bluegenel/conv-mach.h
src/arch/mpi-bluegenep/conv-mach-smp.h [new file with mode: 0644]
src/arch/mpi-bluegenep/conv-mach-smp.sh [new file with mode: 0644]
src/arch/mpi-bluegenep/conv-mach.h
src/arch/mpi-bluegenep/conv-mach.sh
src/arch/mpi-bluegeneq/conv-mach.h
src/arch/mpi-crayxe/conv-mach-cuda.h [new file with mode: 0644]
src/arch/mpi-crayxe/conv-mach-cuda.sh [new file with mode: 0644]
src/arch/mpi-crayxe/conv-mach.h
src/arch/mpi-crayxe/special.sh [new file with mode: 0755]
src/arch/mpi-crayxt/conv-mach.h
src/arch/mpi-crayxt3/conv-mach.h
src/arch/mpi-cygwin/conv-mach.h
src/arch/mpi-darwin-ppc/conv-mach.h
src/arch/mpi-linux-ia64/conv-mach.h
src/arch/mpi-linux-mips64/conv-mach.h
src/arch/mpi-linux-ppc/conv-mach.h
src/arch/mpi-linux-x86_64/conv-mach-cuda.h [new file with mode: 0644]
src/arch/mpi-linux-x86_64/conv-mach-cuda.sh [new file with mode: 0644]
src/arch/mpi-linux-x86_64/conv-mach.h
src/arch/mpi-linux-x86_64/conv-mach.sh
src/arch/mpi-linux-x86_64/special.sh [new file with mode: 0755]
src/arch/mpi-linux/conv-mach.h
src/arch/mpi-sol-x86_64/conv-mach.h
src/arch/mpi-sol/conv-mach-smp.h
src/arch/mpi-sol/conv-mach.h
src/arch/mpi-sp/conv-mach-smp.h
src/arch/mpi-sp/conv-mach.h
src/arch/mpi-win32/conv-mach.h
src/arch/mpi-win64/conv-mach.h
src/arch/mpi/Makefile.machine
src/arch/mpi/machine.c
src/arch/multicore-aix-ppc/conv-mach.h
src/arch/multicore-cygwin/conv-mach.h
src/arch/multicore-darwin-x86/conv-mach.h
src/arch/multicore-darwin-x86_64/conv-mach.h
src/arch/multicore-linux-ppc/conv-mach.h
src/arch/multicore-linux32/conv-mach.h
src/arch/multicore-linux64/conv-mach.h
src/arch/multicore-win32/conv-mach.h
src/arch/multicore-win64/conv-mach.h
src/arch/net-aix-ppc/conv-mach.h
src/arch/net-cygwin/conv-mach.h
src/arch/net-darwin-ppc/conv-mach.h
src/arch/net-darwin-x86/conv-mach.h
src/arch/net-darwin-x86_64/conv-mach.h
src/arch/net-linux-amd64-cuda/conv-mach.h
src/arch/net-linux-cell/conv-mach.h
src/arch/net-linux-ia64/conv-mach.h
src/arch/net-linux-ppc/conv-mach.h
src/arch/net-linux-x86_64/conv-mach-cuda.h [new file with mode: 0644]
src/arch/net-linux-x86_64/conv-mach-cuda.sh [new file with mode: 0644]
src/arch/net-linux-x86_64/conv-mach-pedantic.sh
src/arch/net-linux-x86_64/special.sh [new file with mode: 0755]
src/arch/net-linux/conv-mach.h
src/arch/net-sol-x86/conv-mach.h
src/arch/net-sol-x86_64/conv-mach-smp.h
src/arch/net-sol-x86_64/conv-mach.h
src/arch/net-sol/conv-mach-smp.h
src/arch/net-sol/conv-mach.h
src/arch/net-win32/conv-mach.h
src/arch/net-win64/conv-mach.h
src/arch/net/Makefile.machine
src/arch/net/charmrun/daemon.c
src/arch/net/charmrun/daemon.h
src/arch/net/machine-ibverbs.c
src/arch/net/machine.c
src/arch/portals-crayxt3/conv-mach.h
src/arch/portals-crayxt3/machine.c
src/arch/shmem-crayxe/conv-mach.h
src/arch/shmem-crayxt/conv-mach.h
src/arch/shmem/conv-common.h
src/arch/shmem/machine.c
src/arch/sim-linux/conv-mach.h
src/arch/sim/ext_func.h
src/arch/sim/heap.c
src/arch/sim/machine.c
src/arch/sim/machine.h
src/arch/sim/net.c
src/arch/sim/sim.c
src/arch/sim/simqmng.c
src/arch/sim/simrand.c
src/arch/template/conv-mach.h
src/arch/template/machine-TEMPLATE.c
src/arch/template/machine.c
src/arch/uth-linux-x86_64/conv-mach.h
src/arch/uth-linux/conv-mach.h
src/arch/uth-win32/conv-mach.h
src/arch/uth/machine.c
src/arch/util/machine-broadcast.c
src/arch/util/machine-common-core.c
src/arch/util/machine-commthd-util.c [new file with mode: 0644]
src/arch/util/machine-lrts.h
src/arch/util/machine-pxshm.c
src/arch/util/machine-smp.c
src/arch/util/machine-xpmem.c [new file with mode: 0644]
src/arch/util/mempool.c
src/arch/util/mempool.h
src/arch/util/pcqueue.h
src/arch/vmi/conv-common.h
src/ck-com/ComlibManager.C
src/ck-core/charm++.h
src/ck-core/charm.h
src/ck-core/ck.C
src/ck-core/ck.h
src/ck-core/ckarray.C
src/ck-core/ckarray.h
src/ck-core/ckfutures.C
src/ck-core/cklocation.C
src/ck-core/ckmemcheckpoint.C
src/ck-core/ckmemcheckpoint.ci
src/ck-core/cknodegroupreduction.h [deleted file]
src/ck-core/ckreduction.C
src/ck-core/ckreduction.h
src/ck-core/cktiming.C
src/ck-core/debug-message.C
src/ck-core/envelope-path.h
src/ck-core/envelope.h
src/ck-core/init.C
src/ck-core/main.C
src/ck-core/middle-blue.h
src/ck-core/qd.C
src/ck-core/qd.h
src/ck-core/register.C
src/ck-core/register.h
src/ck-core/sdag.h
src/ck-core/stats.h
src/ck-core/tempo.C
src/ck-core/tempo.h
src/ck-core/waitqd.C
src/ck-core/waitqd.h
src/ck-ldb/BaseLB.C
src/ck-ldb/BaseLB.h
src/ck-ldb/CentralLB.C
src/ck-ldb/CentralPredictor.C
src/ck-ldb/GraphBFTLB.C
src/ck-ldb/GreedyLB.C
src/ck-ldb/LBDBManager.C
src/ck-ldb/LBDBManager.h
src/ck-ldb/LBDatabase.C
src/ck-ldb/LBDatabase.h
src/ck-ldb/LBMachineUtil.C
src/ck-ldb/LBMachineUtil.h
src/ck-ldb/LBOM.h
src/ck-ldb/LBObj.C
src/ck-ldb/LBObj.h
src/ck-ldb/LBProfit.C
src/ck-ldb/LBProfit.h
src/ck-ldb/LBSimulation.C
src/ck-ldb/LBSimulation.h
src/ck-ldb/RecBipartLB.C
src/ck-ldb/ScotchLB.C
src/ck-ldb/ScotchRefineLB.C
src/ck-ldb/ScotchTopoLB.C
src/ck-ldb/TreeMatchLB.C
src/ck-ldb/TreeMatchLB.h
src/ck-ldb/ZoltanLB.C [new file with mode: 0644]
src/ck-ldb/ZoltanLB.ci [new file with mode: 0644]
src/ck-ldb/ZoltanLB.h [new file with mode: 0644]
src/ck-ldb/bitvecset.c
src/ck-ldb/bitvecset.h
src/ck-ldb/ckgraph.C
src/ck-ldb/ckgraph.h
src/ck-ldb/lbdb.C
src/ck-ldb/lbdb.h
src/ck-ldb/tm_mapping.h
src/ck-ldb/tm_tree.h
src/ck-perf/trace-bluegene.C
src/ck-perf/trace-bluegene.h
src/ck-perf/trace-common.C
src/ck-perf/trace-common.h
src/ck-perf/trace-converse.c
src/ck-perf/trace-counter.C
src/ck-perf/trace-counter.h
src/ck-perf/trace-projections.C
src/ck-perf/trace-projections.h
src/ck-perf/trace-summary.C
src/ck-perf/trace-summary.h
src/ck-perf/trace-utilization.C
src/ck-perf/trace-utilization.h
src/ck-perf/trace.h
src/conv-ccs/ccs-client.c
src/conv-ccs/ccs-client.h
src/conv-ccs/conv-ccs.c
src/conv-ccs/conv-ccs.h
src/conv-com/3dgridrouter.h
src/conv-com/de.C
src/conv-com/de.h
src/conv-com/graphrouter.C
src/conv-com/graphrouter.h
src/conv-com/gridrouter.C
src/conv-com/gridrouter.h
src/conv-com/hypercuberouter.C
src/conv-com/hypercuberouter.h
src/conv-com/petable.C
src/conv-com/petable.h
src/conv-com/treerouter.C
src/conv-com/treerouter.h
src/conv-core/cmidirect.h
src/conv-core/cmipool.h
src/conv-core/conv-conds.c
src/conv-core/conv-config.h
src/conv-core/conv-trace.h
src/conv-core/convcore.c
src/conv-core/converse.h
src/conv-core/cpath.c
src/conv-core/cpm.c
src/conv-core/cpthreads.h
src/conv-core/cpuaffinity.c
src/conv-core/cputopology.C
src/conv-core/debug-conv.c
src/conv-core/futures.c
src/conv-core/global-elfcopy.C
src/conv-core/global-elfgot.C
src/conv-core/global-macho.C
src/conv-core/global-nop.c
src/conv-core/isomalloc.c
src/conv-core/mem-arena.c
src/conv-core/memory-gnuold.c
src/conv-core/memory-isomalloc.c
src/conv-core/memory.c
src/conv-core/msgcallbacks.c
src/conv-core/queueing.c
src/conv-core/queueing.h
src/conv-core/quiescence.c
src/conv-core/quiescence.h
src/conv-core/random.c
src/conv-core/threads.c
src/conv-ldb/cldb.bluegene.c
src/conv-ldb/cldb.neighbor.h
src/conv-ldb/cldb.none.c
src/conv-ldb/cldb.prioritycentralized.h
src/conv-ldb/cldb.rand.c
src/conv-ldb/cldb.spray.c
src/conv-ldb/cldb.workstealing.h
src/conv-ldb/edgelist.c
src/conv-ldb/generate.c
src/conv-ldb/graphdefs.h
src/conv-ldb/topology.C
src/conv-ldb/topology.h
src/langs/bluegene/bigsim_logs.h
src/langs/bluegene/bigsim_network.h
src/langs/bluegene/blue.C
src/langs/bluegene/blue.h
src/langs/charj/.gitignore
src/langs/charj/src/charj/translator/ArraySectionInitializer.java
src/langs/charj/src/charj/translator/AstModifier.java
src/langs/charj/src/charj/translator/CFGBuilder.g
src/langs/charj/src/charj/translator/CFGNode.java
src/langs/charj/src/charj/translator/Charj.g
src/langs/charj/src/charj/translator/Charj.stg
src/langs/charj/src/charj/translator/CharjAST.java
src/langs/charj/src/charj/translator/CharjEmitter.g
src/langs/charj/src/charj/translator/CharjPostAnalysis.g [moved from src/langs/charj/src/charj/translator/CharjASTModifier2.g with 98% similarity]
src/langs/charj/src/charj/translator/CharjPreAnalysis.g [moved from src/langs/charj/src/charj/translator/CharjASTModifier.g with 99% similarity]
src/langs/charj/src/charj/translator/ClassSymbol.java
src/langs/charj/src/charj/translator/ExternalSymbol.java
src/langs/charj/src/charj/translator/PackageScope.java
src/langs/charj/src/charj/translator/Translator.java
src/langs/charj/src/charj/translator/TypeName.java
src/langs/charj/tests/functional/ArrayTest.cj [moved from src/langs/charj/tests/unit/ArrayTest.cj with 100% similarity]
src/langs/charj/tests/functional/Assert.cj [moved from src/langs/charj/tests/unit/Assert.cj with 100% similarity]
src/langs/charj/tests/functional/ChareArrayBroadcast.cj [moved from src/langs/charj/tests/unit/ChareArrayBroadcast.cj with 100% similarity]
src/langs/charj/tests/functional/Conditional.cj [moved from src/langs/charj/tests/unit/Conditional.cj with 100% similarity]
src/langs/charj/tests/functional/Declarations.cj [moved from src/langs/charj/tests/unit/Declarations.cj with 100% similarity]
src/langs/charj/tests/functional/Empty.cj [moved from src/langs/charj/tests/unit/Empty.cj with 100% similarity]
src/langs/charj/tests/functional/EntryInvocation.cj [moved from src/langs/charj/tests/unit/EntryInvocation.cj with 100% similarity]
src/langs/charj/tests/functional/FieldModifiers.cj [moved from src/langs/charj/tests/unit/FieldModifiers.cj with 100% similarity]
src/langs/charj/tests/functional/FieldModifiersFailure.cj [moved from src/langs/charj/tests/unit/FieldModifiersFailure.cj with 100% similarity]
src/langs/charj/tests/functional/ForLoop.cj [moved from src/langs/charj/tests/unit/ForLoop.cj with 90% similarity]
src/langs/charj/tests/functional/FuncArgs.cj [moved from src/langs/charj/tests/unit/FuncArgs.cj with 100% similarity]
src/langs/charj/tests/functional/Label.cj [moved from src/langs/charj/tests/unit/Label.cj with 100% similarity]
src/langs/charj/tests/functional/Main.cj [moved from src/langs/charj/tests/unit/Main.cj with 100% similarity]
src/langs/charj/tests/functional/Makefile [moved from src/langs/charj/tests/unit/Makefile with 100% similarity]
src/langs/charj/tests/functional/ObjectField.cj [moved from src/langs/charj/tests/unit/ObjectField.cj with 100% similarity]
src/langs/charj/tests/functional/PrimitiveTypes.cj [moved from src/langs/charj/tests/unit/PrimitiveTypes.cj with 100% similarity]
src/langs/charj/tests/functional/ReservedWords.cj [moved from src/langs/charj/tests/unit/ReservedWords.cj with 100% similarity]
src/langs/charj/tests/functional/SDAG.cj [moved from src/langs/charj/tests/unit/SDAG.cj with 100% similarity]
src/langs/charj/tests/functional/SimpleChare.cj [moved from src/langs/charj/tests/unit/SimpleChare.cj with 100% similarity]
src/langs/charj/tests/functional/SimpleChareArray.cj [moved from src/langs/charj/tests/unit/SimpleChareArray.cj with 100% similarity]
src/langs/charj/tests/functional/SimpleClass.cj [moved from src/langs/charj/tests/unit/SimpleClass.cj with 100% similarity]
src/langs/charj/tests/functional/Switch.cj [moved from src/langs/charj/tests/unit/Switch.cj with 100% similarity]
src/langs/charj/tests/functional/TODO [moved from src/langs/charj/tests/unit/TODO with 100% similarity]
src/langs/charj/tests/functional/Template.cj [moved from src/langs/charj/tests/unit/Template.cj with 100% similarity]
src/langs/charj/tests/functional/WhileLoop.cj [moved from src/langs/charj/tests/unit/WhileLoop.cj with 100% similarity]
src/langs/charj/tests/leanmd/leanmd.cj [new file with mode: 0644]
src/langs/charj/tests/run_tests.sh
src/langs/pvmc/pvm3.h
src/langs/pvmc/pvmc.h
src/langs/pvmc/pvmc_buf.c
src/langs/pvmc/pvmc_comm.c
src/langs/pvmc/pvmc_conv.c
src/langs/pvmc/pvmc_groups.c
src/langs/pvmc/pvmc_main.c
src/langs/pvmc/pvmc_pack.c
src/langs/pvmc/pvmctest.c
src/langs/simplemsg/simplemsg.c
src/langs/simplemsg/simplemsg.h
src/langs/simplemsg/sm.c
src/langs/simplemsg/sm.h
src/langs/streams/Communicate.C
src/langs/streams/Communicate.h
src/langs/streams/MStream.C
src/langs/streams/MStream.h
src/libs/ck-libs/Makefile
src/libs/ck-libs/MeshStreamer/MeshStreamer.h
src/libs/ck-libs/NDMeshStreamer/DataItemTypes.h [new file with mode: 0644]
src/libs/ck-libs/NDMeshStreamer/Makefile [new file with mode: 0644]
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.C [new file with mode: 0644]
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.ci [new file with mode: 0644]
src/libs/ck-libs/NDMeshStreamer/NDMeshStreamer.h [new file with mode: 0644]
src/libs/ck-libs/ampi/ampi.C
src/libs/ck-libs/amr/fifo.c
src/libs/ck-libs/amr/fifo.h
src/libs/ck-libs/armci/armci_api.C
src/libs/ck-libs/armci/armci_impl.h
src/libs/ck-libs/armci/armci_vp.C
src/libs/ck-libs/barrier/barrier.C
src/libs/ck-libs/barrier/barrier.h
src/libs/ck-libs/collide/collidecharm.C
src/libs/ck-libs/collide/collidecharm_impl.h
src/libs/ck-libs/completion/Make.depends
src/libs/ck-libs/fem/fem.C
src/libs/ck-libs/fftlib/fftlib-normal.C
src/libs/ck-libs/fftlib/fftlib.h
src/libs/ck-libs/fftlib/rfftlib-normal.C
src/libs/ck-libs/io/Makefile [new file with mode: 0644]
src/libs/ck-libs/io/ckio.C [new file with mode: 0644]
src/libs/ck-libs/io/ckio.ci [new file with mode: 0644]
src/libs/ck-libs/io/ckio.h [new file with mode: 0644]
src/libs/ck-libs/irecv/receiver.C
src/libs/ck-libs/irecv/receiver.h
src/libs/ck-libs/mblock/mblock.C
src/libs/ck-libs/multicast/Make.depends
src/libs/ck-libs/multicast/ckmulticast.C
src/libs/ck-libs/multicast/ckmulticast.ci
src/libs/ck-libs/multicast/ckmulticast.h
src/libs/ck-libs/multiphaseSharedArrays/msa-DistPageMgr.h
src/libs/ck-libs/netfem/ParaviewConverter/.gitignore [new file with mode: 0644]
src/libs/ck-libs/pose/Make.depends
src/libs/ck-libs/pose/sim.C
src/libs/ck-libs/pose/sim.h
src/libs/ck-libs/pythonCCS/PythonCCS-client.h
src/libs/ck-libs/tcharm/Make.depends
src/libs/ck-libs/tcharm/tcharm.C
src/libs/conv-libs/lrpc/lrpc.c
src/libs/conv-libs/lrpc/lrpc.h
src/libs/conv-libs/master-slave/cms.c
src/libs/conv-libs/master-slave/cms.h
src/libs/conv-libs/packlib/PackLib.h
src/libs/conv-libs/packlib/cpacklib.h
src/libs/conv-libs/packlib/packc.C
src/libs/conv-libs/packlib/packf77.C
src/libs/conv-libs/packlib/packtest.C
src/scripts/Make.cidepends
src/scripts/Make.depends
src/scripts/Makefile
src/scripts/charmc
src/scripts/configure
src/scripts/configure.in
src/scripts/conv-autoconfig.h.in
src/util/cklists.h
src/util/cmitls.c
src/util/cmitls.h
src/util/graph.c
src/util/graph.h
src/util/simd.h
src/util/sockRoutines.c
src/util/sockRoutines.h
src/xlat-i/sdag/CEntry.C
src/xlat-i/sdag/CSdagConstruct.C
src/xlat-i/xi-symbol.C
src/xlat-i/xi-symbol.h
src/xlatcpm/conv-cpm.c
tests/charm++/array4d/hello.C
tests/charm++/broadcast/Makefile [new file with mode: 0644]
tests/charm++/broadcast/broadcast.C [new file with mode: 0644]
tests/charm++/broadcast/broadcast.ci [new file with mode: 0644]
tests/charm++/chkpt/hello.C
tests/charm++/ckAllocSysMsgTest/ckAllocSysMsgTest.C
tests/charm++/ckAllocSysMsgTest/ckAllocSysMsgTest.h
tests/charm++/commSpeed/Worker.h
tests/charm++/commSpeed/pgm.h
tests/charm++/commtest/comlib/bench.C
tests/charm++/commtest/comlib/benchmulti.C
tests/charm++/commtest/comlib/benchsectionmulti.C
tests/charm++/commtest/comlib/hello.C
tests/charm++/commtest/commlib_stream/hello.C
tests/charm++/commtest/commlib_stream/linear.C
tests/charm++/commtest/commlib_stream/manytomany.C
tests/charm++/commtest/pipeBroadcast/benchmark.h
tests/charm++/commtest/pipeBroadcast/test.h
tests/charm++/delegation/1darray/hello.C
tests/charm++/delegation/multicast/hello.C
tests/charm++/delegation/pipelined-section-reduction/hello.C
tests/charm++/hello-crosscorruption/hello.C
tests/charm++/io/Makefile [new file with mode: 0644]
tests/charm++/io/iotest.C [new file with mode: 0644]
tests/charm++/io/iotest.ci [new file with mode: 0644]
tests/charm++/load_balancing/lb_test/Makefile
tests/charm++/load_balancing/lb_test/Topo.C
tests/charm++/load_balancing/lb_test/Topo.h
tests/charm++/load_balancing/lb_test/lb_test.C
tests/charm++/load_balancing/lb_test/predictor/Makefile
tests/charm++/load_balancing/lb_test/predictor/test.h
tests/charm++/load_balancing/lb_test/sim/Topo.h
tests/charm++/load_balancing/lb_test/sim/sim.C
tests/charm++/megatest/arrayring.h
tests/charm++/megatest/fib.h
tests/charm++/megatest/groupcast.h
tests/charm++/megatest/groupmulti.h
tests/charm++/megatest/groupring.h
tests/charm++/megatest/groupsectiontest.h
tests/charm++/megatest/megatest.C
tests/charm++/megatest/migration.C
tests/charm++/megatest/migration.h
tests/charm++/megatest/multisectiontest.h
tests/charm++/megatest/nodecast.h
tests/charm++/megatest/nodering.h
tests/charm++/megatest/packtest.h
tests/charm++/megatest/priolongtest.h
tests/charm++/megatest/priomsg.h
tests/charm++/megatest/priotest.h
tests/charm++/megatest/queens.h
tests/charm++/megatest/rotest.h
tests/charm++/megatest/synctest.h
tests/charm++/megatest/templates.C
tests/charm++/megatest/templates.h
tests/charm++/megatest/tempotest.h
tests/charm++/megatest/varraystest.h
tests/charm++/megatest/varsizetest.h
tests/charm++/megatest/varsizetest2.h
tests/charm++/penciltest/testpencil.C
tests/charm++/pingpong/pingpong.C
tests/charm++/pmetest/patch.h
tests/charm++/pmetest/testpme.C
tests/charm++/sdag/migration/test1.C
tests/charm++/startupTest/startupTest.h
tests/charm++/topology/rtc.C
tests/converse/commbench/Makefile
tests/converse/commbench/broadcast.c [new file with mode: 0644]
tests/converse/commbench/commbench.c
tests/converse/commbench/memoryAccess.c
tests/converse/commbench/pingpong.c
tests/converse/commbench/reduction.c [new file with mode: 0644]
tests/util/Makefile

index 1bf04312efca86a8ee7555f025fa2f6a4c003b6c..c7526d07ee76414ca0ce5365f42ba2dee5b4bc01 100644 (file)
@@ -31,5 +31,4 @@ tmp
 # Note this is negated inside src/arch to permit addition of new charmrun files in there
 charmrun
 pgm
-lb_test
 *.swp
diff --git a/CHANGES b/CHANGES
index 99fbf327a548843139015161926b7d0e6c3e92ae..0821c467fa00d8f9d527fb5e5a8194eb87ba4b65 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,46 @@
 This file describes the most significant changes. For more detail, use
 'git log' on a clone of the charm repository.
 
+
+================================================================================
+What's new in Charm++ 6.4.0
+================================================================================
+
+--------------------------------------------------------------------------------
+Platform Support
+--------------------------------------------------------------------------------
+
+- Cray XE and XK systems using the Gemini network via either MPI
+  (mpi-crayxe) or the native uGNI (gemini_gni-crayxe)
+
+- IBM Blue Gene Q, using MPI (mpi-bluegeneq)
+
+- Fujitsu and Clang compilers
+
+- MPI-based machine layers can now run on >64k PEs
+
+--------------------------------------------------------------------------------
+General Changes
+--------------------------------------------------------------------------------
+
+- Added a new [reductiontarget] attribute to enable
+  parameter-marshaled recipients of reduction messages
+
+- Enabled pipelining of large messages in CkMulticast by default
+
+- New load balancers added: TreeMatch, Zoltan, Scotch{Refine,Topo}, RefineSwap
+
+- Load balancing improvements:
+
+  * Allow reduced load database size using floats instead of doubles
+  * Improved hierarchical balancer
+  * Periodic balancing adapts its interval dynamically
+  * User code can request a callback when migration is complete
+  * More balancers properly consider object migratability and PE
+    availability and speed
+
+- Array options
+
 ================================================================================
 What's new in Charm++ 6.2.1 (since 6.2.0)
 ================================================================================
diff --git a/README b/README
index 9c495458610620f47151ded8d91018e6040ba86d..0be12f5c37d274b2329935bb031cd61241aa9632 100644 (file)
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-                           Charm++ CVS Version
+                           Charm++
 
-       Copyright (C) 1989-2000 Regents of the University of Illinois
+       Copyright (C) 1989-2012 Regents of the University of Illinois
 
 INTRODUCTION
 ============
@@ -14,23 +14,10 @@ Source code is provided, and non-commercial use is free.
 GETTING THE LATEST CHARM SOURCE
 ===============================
 
-You can use our anonymous cvs server to checkout the latest charm++ source code.
-(It may not be the latest stable version though) 
-What you need to do is as following:
+You can use anonymous Git access to obtain the latest Charm++ source
+code, as follows:
 
-1. login the cvs server:
-
-      cvs -d :pserver:checkout@charm.cs.illinois.edu:/cvsroot login
-
-      when CVS password is prompted, just press enter.
-2. checkout charm:
-
-      cvs co -P charm
-
-      You should get latest charm source tree.
-3. logout the cvs server:
-
-      cvs logout
+    git clone git://charm.cs.illinois.edu/charm.git
 
 
 PICKING A VERSION
@@ -40,9 +27,11 @@ First, you need to decide which version of charm++ to use. The "build"
 script in charm source directory takes several command line options to
 compile Charm++. The command line syntax is:
 
-build <target> <version> [options ...] [--basedir=dir] [--libdir=dir] [--incdir=dir] [charmc-options ...]
+build <target> <version> [options ...]
+                         [--basedir=dir] [--libdir=dir] [--incdir=dir]
+                         [charmc-options ...]
 
-for detailed help messages, use -h or --help to the build script, i.e.
+For detailed help messages, pass -h or --help to the build script, i.e.
 ./build --help
 
 REQUIRED:
@@ -132,9 +121,12 @@ development and testing.
        "mpi-" Charm++ communicates using MPI calls.  Use this for
 machines with a good MPI implementation (such as the Origin 2000).
 
-       "exemplar", "ncube-2", "paragon-red", "sp3", and "t3e" Charm++
+       "gemini_gni-" and "bluegene[lpq]-" Charm++
 communicates using direct calls to the machine's communication primitives.
 
+       "multicore-" Charm++ communicates using shared memory within a
+       single node.
+
        "sim-" and "uth-" are not actively maintained.  These are
 single-processor versions: "uth-" simulates processors as user-level
 threads; "sim-" switches between processors and counts communications.
@@ -142,30 +134,32 @@ threads; "sim-" switches between processors and counts communications.
 
 2.)  Your operating system:
 
-       "linux"   Linux 
-       "win32"   MS Windows with MS Visual C++ compiler
-       "cygwin"  MS Windows with Cygnus' Cygwin Unix layer
-       "irix"    SGI IRIX
-       "origin"  SGI Origin 2000 IRIX
-       "sol"     Solaris
-       "sun"     SunOS
-       "rs6k"    IBM R/S 6000 A/IX 
-       "sp"      IBM SP A/IX
-       "hp"      Hewlett-Packard HP-UX
-       "axp"     DEC Alpha DECUNIX
-       
-
-3.)  Some operating systems have other options, such as:
-       "-x86"     For Solaris, use PC hardware (instead of Sun).
-       "-axp"     For Linux, use Alpha hardware (instead of PC).
-        "-ia64"    Use Itanium(tm) instructions (instead of x86).
-        "-x86_64"  Use AMD Opteron instructions (instead of x86).
+       "linux"       Linux
+       "win{32,64}"  MS Windows with MS Visual C++ compiler (32/64-bit, resp.)
+       "cygwin"      MS Windows with Cygnus' Cygwin Unix layer
+       "darwin"      Apple Mac OS X
+       "sol"         Solaris
+       "aix"         IBM A/IX
+       "sp"          IBM SP A/IX
+
+
+3.)  Some operating systems have other architecture options, such as:
+
+       "-x86"     For Solaris and Mac OS X, target x86 hardware (instead of
+                  SPARC or PPC).
+       "-ppc"     POWER/PowerPC
+       "-mips64"  MIPS, such as for SiCortex systems
+        "-ia64"    Use Itanium(tm) IA-64 instructions (instead of x86).
+        "-x86_64"  Use AMD64/EM64T 64-bit x86 instructions (instead of 32-bit).
+       "-cell"    Sony/Toshiba/IBM Cell PPE (e.g. Playstation 3,
+                  Mercury blades, Roadrunner)
 
 Your Charm++ version is made by concatenating the options, e.g.:
 
-"net-linux"     Charm++ for a network of Linux workstations, compiled
-                using g++.
-"mpi-origin"    Charm++ for SGI Origin 2000, compiled using SGI CC.
+"net-linux-x86_64"   Charm++ for a network of 64-bit Linux workstations,
+                     compiled using g++.
+
+"mpi-crayxt"         Charm++ for Cray XT4/5 systems using the system's compiler.
 
 
 **** How to choose <options> ****
diff --git a/build b/build
index 55346a41249ef58dc9474b1e1df593f759f77709..176be31d9e0f77d72f2a53472109bc8aea032365 100755 (executable)
--- a/build
+++ b/build
@@ -31,9 +31,9 @@ syntax() {
   echo ''
   echo '<options>: compiler and platform specific options'
   echo 'cc cc64 cxx kcc pgcc acc icc ecc gcc3 gcc4 mpcc pathscale clang'
-  echo 'help smp gm tcp vmi scyld clustermatic bigemulator ooc syncft mlogft causalft papi'
-  echo 'pthreads lam'
-  echo '--incdir --libdir --basedir --no-build-shared --destination --suffix -j'
+  echo 'help smp gm tcp vmi scyld clustermatic bigemulator ooc syncft mlogft causalft'
+  echo 'papi pthreads lam'
+  echo '--incdir --libdir --basedir --build-shared --destination --suffix -j'
   if test $more = 1
   then
   echo ''
@@ -81,8 +81,8 @@ syntax() {
   echo '  papi            compile with PAPI performance counter support (if any)'
   echo ''
   echo "Charm++ dynamic libraries:"
-  echo "  --build-shared     build Charm++ dynamic libraries (.so) (default)"
-  echo "  --no-build-shared  don't build Charm++'s shared libraries"
+  echo "  --no-build-shared  don't build Charm++'s shared libraries (default)"
+  echo "  --build-shared     build Charm++ dynamic libraries (.so) "
   echo ''
   echo 'Enable/disable features:'
   src/scripts/configure --help | grep enable-
@@ -579,7 +579,7 @@ then
     echo '#define CMK_OPTIMIZE 1' >> $ConvHeader
     # Prepend optimize so that an explicit -no-optimize still works
     OPTS="-optimize -production $OPTS"
-    CONFIG_OPTS="--disable-controlpoint --disable-tracing --disable-charmdebug --disable-replay --disable-error-checking --disable-stats $CONFIG_OPTS"
+    CONFIG_OPTS="--disable-controlpoint --disable-tracing --disable-tracing-commthread --disable-charmdebug --disable-replay --disable-error-checking --disable-stats $CONFIG_OPTS"
 fi
 
 # build with Tau
index 2cd2f452e330ecabbedbb5118f7dcb2890d33336..b41b81be3da0f51537010d81da18011d5fee86ed 100644 (file)
@@ -1,2 +1,14 @@
+
+# Intermediate files from compiling documentation
+*.aux
+*.idx
+*.log
+*.out
+*.pdf
+*.toc
+*.ilg
+*/index.tex
+*/pplmanual.sty
+
 */.latex2html-init
 */pplmanual.tex
index af1e7c20294804691a45a5f376c20b237d7c6f7d..e57fe181988e09fc33ff23e7287f1c1adf6a000c 100644 (file)
@@ -299,8 +299,10 @@ home processor is the processor responsible for maintaining
 the location of the element.
 
 There is a default map object, which maps 1D array indices
-in a round-robin fashion to processors, and maps other array
-indices based on a hash function.
+in a block fashion to processors, and maps other array
+indices based on a hash function. Some other mappings such as round-robin
+(\kw{RRMap}) also exist, which can be used
+in the same way as the custom ones described below.
 
 A custom map object is implemented as a group which inherits from
 \kw{CkArrayMap} and defines these virtual methods:
@@ -341,11 +343,11 @@ class BlockMap : public CkArrayMap
 Once you've instantiated a custom map object, you can use it to
 control the location of a new array's elements using the
 \kw{setMap} method of the \kw{CkArrayOptions} object described above.
-For example, if you've declared a map object named ``blockMap'':
+For example, if you've declared a map object named ``BlockMap'':
 
 \begin{alltt}
 //Create the map group
-  CProxy_blockMap myMap=CProxy_blockMap::ckNew();
+  CProxy_BlockMap myMap=CProxy_BlockMap::ckNew();
 //Make a new array using that map
   CkArrayOptions opts(nElements);
   opts.setMap(myMap);
index f63718080eab9348174f51bbd0c025d5c762b602..836c3197589834167f5cb4446f5ee65194d1f69f 100644 (file)
@@ -111,7 +111,7 @@ through {\tt CkEntryOptions::setPriority()}:
 \begin{alltt}
   CkEntryOptions opts;
   opts.setPriority(7);
-  chare.entry_name(arg1, arg2, opts);
+  chare.entry_name(arg1, arg2, &opts);
 \end{alltt}
 
 Bitvector priorities are somewhat more complicated.  Bitvector
index 38a458899be25e05304041fc8a67ae2c60f5ef05..e30ae60562fe7d0c62dfcfd7cfa2d256d722390b 100644 (file)
@@ -43,8 +43,8 @@ checkpoint and restart calculations involving \uw{foo}s.
 \subsubsection{PUP contract}
 
 Your object's pup routine must save and restore all your object's data.
-As shown, you save and restore a class's contents by writing a routine c
-alled ``pup'' which passes all the parts of the class to an object of type 
+As shown, you save and restore a class's contents by writing a routine
+called ``pup'' which passes all the parts of the class to an object of type
 \index{PUP::er} \kw{PUP::er}, which does the saving or restoring.  
 We often use ``pup'' as a verb, meaning ``to save/restore the value of''
 or equivalently, ``to call the pup routine of''.
diff --git a/doc/converse/code/cth-tutorial/Makefile b/doc/converse/code/cth-tutorial/Makefile
new file mode 100644 (file)
index 0000000..2a95a40
--- /dev/null
@@ -0,0 +1,21 @@
+CHARMC?=../../../../bin/charmc $(OPTS)
+
+LINKLINE=$(CHARMC) -o pgm pgm.o -language converse++
+
+all: pgm
+
+pgm: pgm.o
+       $(LINKLINE)
+
+pgm.o: pgm.C
+       $(CHARMC) -c pgm.C
+
+test: pgm
+       ./charmrun ./pgm +p1 $(TESTOPTS)
+#      -$(LINKLINE) -thread context && ./charmrun ./pgm +p1  $(TESTOPTS)&& ps -u `whoami`
+#      -$(LINKLINE) -thread pthreads -lpthread && ./charmrun ./pgm +p1  $(TESTOPTS)&& ps -u `whoami`
+#      -$(LINKLINE) -thread qt && ./charmrun ./pgm +p1  $(TESTOPTS)&& ps -u `whoami`
+#      -$(LINKLINE) -thread uJcontext && ./charmrun ./pgm +p1  $(TESTOPTS)&& ps -u `whoami`
+
+clean:
+       rm -f conv-host *.o pgm *.bak pgm.*.log pgm.sts *~ charmrun charmrun.exe pgm.exe pgm.pdb pgm.ilk
diff --git a/doc/converse/code/cth-tutorial/pgm.C b/doc/converse/code/cth-tutorial/pgm.C
new file mode 100644 (file)
index 0000000..4fca04f
--- /dev/null
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include "converse.h"
+
+#define HIGH_PRIO 0
+#define LOW_PRIO 1
+#define NUM_YIELD 10
+
+int endCounter = 0;
+
+//called once by each worker when it finishes; the second call (endCounter == 2) stops the scheduler
+void threadDone() {
+  endCounter++;
+  if (endCounter == 2) CsdExitScheduler();
+}
+
+//thread body for worker1: yields once, then yields NUM_YIELD more times at LOW_PRIO, then reports completion
+void worker1Work(void* msg) {
+  printf("start worker1\n");
+  CthYield();
+  printf("worker1 resumed first time\n");
+  unsigned int prio  = LOW_PRIO;
+  for(int i = 0; i < NUM_YIELD; i++) {
+    CthYieldPrio(CQS_QUEUEING_IFIFO,0,&prio);  //yield, passing prio as this thread's priority
+    printf("worker1 resumed %dth time\n",i);
+  }
+  threadDone();
+}
+
+//thread body for worker2: yields once, then yields NUM_YIELD more times at HIGH_PRIO, then reports completion
+void worker2Work(void* msg) {
+  printf("start worker2\n");
+  CthYield();
+  printf("worker2 resumed first time\n");
+  unsigned int prio  = HIGH_PRIO;
+  for(int i = 0; i < NUM_YIELD; i++) {
+    CthYieldPrio(CQS_QUEUEING_IFIFO,0,&prio);  //yield, passing prio as this thread's priority
+    printf("worker2 resumed %dth time\n",i);
+  }
+  threadDone();
+}
+
+//start function passed to ConverseInit: creates the two worker threads (stack size 160000) and awakens both
+void initThreads(int argc, char* argv[]) {
+  printf("called initThreads\n");
+  CthThread worker1 = CthCreateMigratable((CthVoidFn)worker1Work, 0, 160000);
+  CthThread worker2 = CthCreateMigratable((CthVoidFn)worker2Work, 0, 160000);
+  CthAwaken(worker1); CthAwaken(worker2);
+}
+
+int main(int argc, char* argv[]) {
+  ConverseInit(argc, argv, initThreads, 0, 0);  //start Converse with initThreads as the start function
+}
diff --git a/doc/converse/code/msgs/Makefile b/doc/converse/code/msgs/Makefile
new file mode 100644 (file)
index 0000000..d31eddc
--- /dev/null
@@ -0,0 +1,19 @@
+CHARMC=../../../../bin/charmc $(OPTS)
+
+all: interNodeMsg 
+
+interNodeMsg: interNodeMsg.o
+       $(CHARMC) -language converse++ -o interNodeMsg interNodeMsg.o
+
+interNodeMsg.o: interNodeMsg.C
+       $(CHARMC) -language converse++ -c interNodeMsg.C
+
+
+test: interNodeMsg
+       ./charmrun ./interNodeMsg +p2 $(TESTOPTS)
+
+clean:
+       rm -f core *.cpm.h
+       rm -f TAGS *.o
+       rm -f interNodeMsg 
+       rm -f conv-host charmrun
diff --git a/doc/converse/code/msgs/interNodeMsg.C b/doc/converse/code/msgs/interNodeMsg.C
new file mode 100644 (file)
index 0000000..61d8823
--- /dev/null
@@ -0,0 +1,61 @@
+#include <stdlib.h>
+#include <converse.h>
+
+CpvDeclare(int,msgSize);
+CpvDeclare(int,userData);
+CpvDeclare(int,recvHandler);
+CpvDeclare(int,exitHandler);
+
+void sendData()  //build one message carrying userData and send it to processor 0
+{
+  //Allocate a buffer for the Converse header plus msgSize bytes of payload
+  char *msg = (char *)CmiAlloc(CpvAccess(msgSize)+CmiMsgHeaderSizeBytes );
+  //store userData in the payload, immediately after the header
+  *((int *)(msg+CmiMsgHeaderSizeBytes)) =  CpvAccess(userData) ;
+  //tag the message with recvHandler, the handler that will process it
+  CmiSetHandler(msg,CpvAccess(recvHandler));
+  //Send to processor 0; the buffer is freed as part of the send, so msg must not be reused
+  CmiSyncSendAndFree(0, CpvAccess(msgSize)+CmiMsgHeaderSizeBytes, msg);
+}
+//Receive handler: check the payload against userData, free the message, then broadcast the exit message
+CmiHandler recvHandlerFunc(char *msg)
+{
+    int myData = *((int *)(msg+CmiMsgHeaderSizeBytes));
+    if (myData == CpvAccess(userData))
+        CmiPrintf ("Received Expected Value\n");
+    CmiFree(msg);
+    void *sendmsg = CmiAlloc(CmiMsgHeaderSizeBytes);  // header-only message tagged with exitHandler
+    CmiSetHandler(sendmsg,CpvAccess(exitHandler));
+    CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes,sendmsg);
+    return 0;  // a value-returning function must not fall off its end; same as exitHandlerFunc
+}
+// Exit handler: free the received message and stop the scheduler
+CmiHandler exitHandlerFunc(char *msg)
+{
+    CmiFree(msg);
+    CsdExitScheduler();
+    return 0;
+}
+
+//Converse start function: initialize the Cpv variables, register both handlers, and have processor 0 send the message
+CmiStartFn mymain()
+{
+    CpvInitialize(int,msgSize);
+    CpvInitialize(int,userData);
+    CpvInitialize(int,recvHandler);
+    CpvInitialize(int,exitHandler);
+    CpvAccess(recvHandler) = CmiRegisterHandler((CmiHandler) recvHandlerFunc);
+    CpvAccess(exitHandler) = CmiRegisterHandler((CmiHandler) exitHandlerFunc);
+    CpvAccess(msgSize) = 4;  // payload bytes; sendData stores one int there (assumes sizeof(int) <= 4 -- TODO confirm)
+    CpvAccess(userData) = 1454;  // value the receive handler compares the payload against
+    if (CmiMyPe() == 0)
+        sendData();
+    return 0;
+}
+
+int main(int argc,char *argv[])
+{
+    ConverseInit(argc,argv,(CmiStartFn)mymain,0,0);  // start Converse with mymain as the start function
+    return 0;
+}
diff --git a/doc/converse/code/pingpong.pseudo b/doc/converse/code/pingpong.pseudo
new file mode 100644 (file)
index 0000000..89a0022
--- /dev/null
@@ -0,0 +1,60 @@
+#include <converse.h>
+#include <stdlib.h>
+CpvDeclare(int,msgSize);
+CpvDeclare(int,exitHandler);
+CpvDeclare(int,node0Handler);
+CpvDeclare(int,node1Handler);
+void startRing()  //build a msgSize-byte message and send it to processor 1
+{
+  char *msg = (char *)CmiAlloc(CpvAccess(msgSize));
+  *((int *)(msg+CmiMsgHeaderSizeBytes)) = CpvAccess(msgSize);  //payload carries the total message size (header included)
+  CmiSetHandler(msg,CpvAccess(node1Handler));
+  CmiSyncSendAndFree(1, CpvAccess(msgSize), msg);
+}
+void ringFinished(char *msg)  //called when the message has returned: free it and broadcast the exit message
+{
+  CmiFree(msg);
+  //broadcast a header-only message tagged with exitHandler
+  void *sendmsg = CmiAlloc(CmiMsgHeaderSizeBytes);
+  CmiSetHandler(sendmsg,CpvAccess(exitHandler));
+  CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes,sendmsg);
+}
+//Exit handler: free the received message and stop the scheduler
+CmiHandler exitHandlerFunc(char *msg)
+{
+    CmiFree(msg);
+    CsdExitScheduler();
+    return 0;
+}
+//Handler on Node 0: runs when node1HandlerFunc sends the message back; finishes the exchange
+CmiHandler node0HandlerFunc(char *msg)
+{
+       ringFinished(msg);
+       return 0;
+}
+CmiHandler node1HandlerFunc(char *msg)  //Handler for the message sent to processor 1: return the same message to processor 0
+{
+    CpvAccess(msgSize) = *((int *)(msg+CmiMsgHeaderSizeBytes));  //message size is read from the payload
+    CmiSetHandler(msg,CpvAccess(node0Handler));
+    CmiSyncSendAndFree(0,CpvAccess(msgSize),msg);
+    return 0;
+}
+CmiStartFn mymain()  //Converse start function: set up msgSize, register the three handlers, processor 0 starts the exchange
+{
+    CpvInitialize(int,msgSize);
+    CpvAccess(msgSize)= 512 + CmiMsgHeaderSizeBytes;  //512 payload bytes plus the header
+    CpvInitialize(int,exitHandler);
+    CpvAccess(exitHandler) = CmiRegisterHandler((CmiHandler) exitHandlerFunc);
+    CpvInitialize(int,node0Handler);
+    CpvAccess(node0Handler) = CmiRegisterHandler((CmiHandler) node0HandlerFunc);
+    CpvInitialize(int,node1Handler);
+    CpvAccess(node1Handler) = CmiRegisterHandler((CmiHandler) node1HandlerFunc);
+    if (CmiMyPe() == 0)
+        startRing();
+    return 0;
+}
+int main(int argc,char *argv[])
+{
+    ConverseInit(argc,argv,(CmiStartFn)mymain,0,0);  //start Converse with mymain as the start function
+    return 0;
+}
diff --git a/doc/converse/tutorial.tex b/doc/converse/tutorial.tex
new file mode 100644 (file)
index 0000000..0b812da
--- /dev/null
@@ -0,0 +1,91 @@
+\documentclass[10pt]{report}
+\usepackage{../pplmanual}
+\input{../pplmanual}
+
+\title{\converse{}\\Programming\\Tutorial}
+\version{1.0}
+\credits{
+\converse{} Parallel Programming Environment was developed as a group
+effort at Parallel Programming Laboratory, University of Illinois at
+Urbana-Champaign.
+}
+\begin{document}
+\maketitle
+
+\chapter{Introduction}
+\lstdefinelanguage{pseudo}{
+morekeywords={if, else, for, in, remove, from, case, do, forever, to, False, True},
+sensitive=true,%
+morecomment=[l]\#,%
+morestring=[b]',%
+}
+
+\section{CthThreads}
+
+The CthThread package, like most thread packages, provides basic functionality
+for creating threads, destroying threads, yielding, suspending, and awakening a
+suspended thread. In addition, it provides facilities whereby you can write
+your own thread schedulers.
+
+Figure~\ref{fig:converse-cth} demonstrates how to write a simple program that
+creates CthThreads. The \texttt{CthCreateMigratable} is used and it takes a
+handler, an argument pointer, and the stack size for the thread. This is
+demonstrated in the \texttt{initThreads} function on line number 43. Once the
+threads are created, they are pushed on the scheduler queue with the
+\texttt{CthAwaken} call, which only takes the \texttt{CthThread} as an
+argument. On being scheduled, the handler function is called.
+
+In the example, each thread then calls \texttt{CthYield}, which directs control
+back to the scheduler and pushes the thread back onto the queue. Then in a
+loop, each thread calls \texttt{CthYieldPrio} \texttt{NUM\_YIELD} times, with
+the queuing strategy and necessary parameters. The threads call this with
+priority $0$ and $1$, lower integers (but non-negative) indicating higher
+priority. The effect of yielding with priority is that the higher priority
+thread on the queue has precedence over the other threads and hence will be
+scheduled first, based on the greedy decision the scheduler makes.
+
+After this loop completes, the \texttt{threadDone} is called by each
+thread, which increments a counter and quits the program when all threads are
+done.
+
+\begin{figure}
+\lstinputlisting[language=pseudo,basicstyle=\footnotesize,numbers=left,escapechar=\%]{code/pingpong.pseudo}
+\caption{A Pingpong Example using Converse Handler}
+\label{fig:converse-pingpong}
+\end{figure}
+
+\begin{figure}
+ \VerbatimInput[numbers=left,frame=single,firstline=1,lastline=53]{code/cth-tutorial/pgm.C}
+ \caption{CthThread Example}
+ \label{fig:converse-cth}
+\end{figure}
+
+
+\section{Interprocessor Messaging}
+
+Figure~\ref{fig:converse-msg} illustrates how to write a simple program that sends a message from one processor to another. In the example program, the function \texttt{sendData} shows how to send a message. First, one must create a buffer to hold the message. The buffer must be large enough to hold the header and the data. This is done with the \texttt{CmiAlloc} call. \texttt{CmiMsgHeaderSizeBytes} is a constant which contains the size of the header. Next, the handler should be set for the outgoing message using the \texttt{CmiSetHandler} call. User data can be stored after the header. In this example, we use the \texttt{CmiSyncSendAndFree} call to send the message. This function sends the message and frees the buffer. There are various alternatives, listed below. The first argument of this function call is the number of the processor to which we want to send the message, the second argument is the size of the message (including the header), and the last argument is a pointer to the message.
+\begin{itemize}
+\item
+sync: a version that is as simple as possible, pushing the data into the network and not returning until the data is ``in the network''. As soon as a sync function returns, you can reuse the message buffer.
+
+\item
+async: a version that returns almost instantaneously, and then continues working in the background. The background job transfers the data from the message buffer into the network. Since the background job is still using the message buffer when the function returns, you can't reuse the message buffer immediately. The background job sets a flag when it is done and you can then reuse the message buffer.
+
+\item
+send and free: a version that returns almost instantaneously, and then continues working in the background. The background job transfers the data from the message buffer into the network. When the background job finishes, it CmiFrees the message buffer. In this situation, you can't reuse the message buffer at all. To use a function of this type, you must allocate the message buffer using CmiAlloc.
+
+\item
+node: a version that sends a message to a node instead of a specific processor. This means that when the message is received, any ``free'' processor within that node can handle it.
+\end{itemize}
+
+On the receiver, the appropriate handler function is called and a pointer to the message (starting from the header) is provided to the user. The user must free the message with the \texttt{CmiFree} call after using the data. Finally, we also illustrate a broadcast message using \texttt{CmiSyncBroadcastAllAndFree}.
+
+TODO: Examples of other modes
+
+\begin{figure}
+\lstinputlisting[language=pseudo,basicstyle=\footnotesize,numbers=left,escapechar=\%]{code/msgs/interNodeMsg.C}
+\caption{Sending Message across Converse Processes}
+\label{fig:converse-msg}
+\end{figure}
+
+\end{document}
index 8934f1747c526d9ea8f32dd746b906af6928dea6..967417bfb9c18366cec0d48dc07d3ac61eb7c116 100644 (file)
@@ -21,6 +21,12 @@ processor in the debugger using:
 gdb pgm
 \end{alltt}
 
+If the program needs some environment variables
+to be set for its execution on compute nodes
+(such as library paths), they can be set in
+{\tt .charmrunrc} in the user's home directory. {\tt charmrun}
+will run that shell script before running the executable.
+
 \subsection[Command Line Options]{Command Line Options}
 \label{command line options}
 \index{command line options}
index 20f8fb4fac5151feddfc230dd3391545d55ffab3..cf3f08b1a5af3c2a7878b533a700ced8cbfda9c1 100644 (file)
@@ -3,13 +3,13 @@ CHARMC=../../../bin/charmc $(OPTS)
 all: put
 
 put: put.c
-       $(CHARMC) -o put put.c -language armci 
+       $(CHARMC) -o put put.c -language armci -balancer RotateLB 
 
 test: put
-       ./charmrun +p2 ./put $(TESTOPTS)
+       ./charmrun +p2 ./put +vp2 $(TESTOPTS)
 
 bgtest: put
-       ./charmrun +p1 ./put +vp 2 +x2 +y1 +z1 $(TESTOPTS)
+       ./charmrun +p1 ./put +vp2 +x2 +y1 +z1 $(TESTOPTS)
 
 clean:
        rm -f put charmrun *.o charmrun.exe put.exe put.pdb put.ilk
index 8593ee2146913b2aa36c0f08fa59e34bfd713577..6bc1484346622fc2a9a2518c80424facbb516576 100644 (file)
@@ -26,7 +26,8 @@ int main(int argc, char * argv[]) {
   local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));
 
   ARMCI_Barrier();
-   
+  ARMCI_Migrate();
+
   if (thisImage == 0) {
     for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){
       startTime = CkWallTimer();
@@ -42,7 +43,7 @@ int main(int argc, char * argv[]) {
     ARMCI_Barrier();
   }
 
-
+  
   ARMCI_Free(baseAddress[thisImage]);
   ARMCI_Free_local(local);
   // finalize
index 9d1f8f1adb6fd1a79b42f69186d9d5981a340520..75eeab3d917aadfb1a947f6299aa5199c8add8e1 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file jacobi2d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: March 09th, 2009
index 17bd784db6b885112b11ad4d30393d2113bc66e3..7a926c5712dc4d301aaa0ca8e7615f42badc0b17 100644 (file)
@@ -1,3 +1,5 @@
+TEST_TOOLS=$(shell CHARMINC=../../../../include/; if test -f ./conv-config.sh; then . ./conv-config.sh; echo $$BGP_FLOOR; fi )
+
 DIRS= loadlog buildlog
 
 all:
@@ -5,10 +7,14 @@ all:
                (cd $$d; $(MAKE) all OPTS='$(OPTS)' || exit 1) || exit 1; \
        done
 
+ifeq "$(TEST_TOOLS)" ""
 test:
        for d in $(DIRS); do \
                (cd $$d; $(MAKE) test OPTS='$(OPTS)' || exit 1) || exit 1; \
        done
+else
+test:
+endif
 
 clean:
        for d in $(DIRS); do (cd $$d; $(MAKE) clean OPTS='$(OPTS)'); done
index ee5ae79d14cece4dada502494d5150a6caeb5b28..ce5f6f428d78c2c1544cd65be84fb6b3d1e1c606 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file Compute.h
  *  Author: Abhinav S Bhatele
  *  Date Created: July 1st, 2008
index ed778c71e7f8ee81718f93363fb5c74b12474b37..9646e1255cc83b34d6956e41af67d39b3eb67910 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file Compute.h
  *  Author: Abhinav S Bhatele
  *  Date Created: July 1st, 2008
index 808c0e30be38a0997756a1900e4cde060c93a4ab..6d7cc6477f08d402fd47d628796d13fc38517e85 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file Patch.C
  *  Author: Abhinav S Bhatele
  *  Date Created: July 1st, 2008
index 789b7d7e81e17ec38beec9c54d53fb81797e7c6e..4d578adeae4a01d84020efbb9dd817c5209c6bd5 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file Patch.h
  *  Author: Abhinav S Bhatele
  *  Date Created: July 1st, 2008
index dd46d3cb87a012ac455126a1d779460d365675f9..e576acc0d1d88f1417aaa0960f3d39c42ac573ad 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file common.h
  *  Author: Abhinav S Bhatele
  *  Date Created: July 1st, 2008
index aaa7de952dafc763daaa4476bcdf5980238a7767..efca98277fa4d332432fbde254da68cddb446ec3 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file matmul3d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: April 01st, 2008
index be8c099fd040f4a56f41ba41bdcd11d6a4809199..1dda0888af7f5e519ee51bd8becf2fa43430795a 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file matmul3d.h
  *  Author: Abhinav S Bhatele
  *  Date Created: April 01st, 2008
index 071e8dd0d476c35b6c5c8440b29af50a23cf0749..dca7a6081c0b57f26108a1f35b3755f58fa33fa6 100644 (file)
@@ -4,11 +4,9 @@ OBJS = hello.o helloCUDA.o
 
 NVCC = /usr/local/cuda/bin/nvcc 
 NVCC_FLAGS = -c -use_fast_math #-device-debug -deviceemu 
-NVCC_INC = -I/usr/local/cuda/include -I../../../../../NVIDIA_CUDA_SDK/common/inc
-
-LD_LIBS += -lcuda -lcudart -lGL -lGLU -lcutil
-
-export LD_RUN_PATH = /usr/local/cuda/lib
+NVCC_INC = -I/usr/local/cuda/include 
+CHARMINC = -I../../../../include
+LD_LIBS += -lcuda -lcudart 
 
 all: hello
 
@@ -19,14 +17,13 @@ hello.decl.h: hello.ci
        $(CHARMC)  hello.ci
 
 clean:
-       rm -f *.decl.h *.def.h conv-host *.o wr.h hello charmrun
+       rm -f *.decl.h *.def.h conv-host *.o hello charmrun
 
 hello.o: hello.C hello.decl.h
-       $(CHARMC) -g -c hello.C
+       $(CHARMC) -c hello.C
 
 helloCUDA.o: helloCUDA.cu
-       cp ../../../../src/arch/cuda/hybridAPI/wr.h .
-       $(NVCC) -g $(NVCC_FLAGS) $(NVCC_INC) helloCUDA.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) $(CHARMINC) helloCUDA.cu
 
 test: all
        ./charmrun hello +p4 10
index 69a15784366f1740ab7d522618c2b6a75d8cea7b..1e929788241a4bced05a6f3c2c18aab0a110795b 100644 (file)
@@ -9,32 +9,16 @@ __global__ void helloKernel() {
 }
 
 void kernelSetup(void *cb) {
-  workRequest *wr; 
-  wr = (workRequest*) malloc(sizeof(workRequest)); 
+  workRequest wr; 
+  wr.dimGrid = dim3(1, 1);
+  wr.dimBlock = dim3(1,1);
+  wr.smemSize = 0;
+  wr.nBuffers = 0; 
+  wr.bufferInfo = NULL;
+  wr.callbackFn = cb; 
+  wr.id = 0; 
 
-  wr->dimGrid.x = 1; 
-  wr->dimBlock.x = 1; 
-  wr->smemSize = 0;
-  
-  wr->readWriteDevicePtr = NULL;
-  wr->readWriteHostPtr = NULL; 
-  wr->readWriteLen = 0; 
-
-  wr->readOnlyDevicePtr = NULL;
-  wr->readOnlyHostPtr = NULL;
-  wr->readOnlyLen = 0; 
-
-  wr->writeOnlyDevicePtr = NULL;
-  wr->writeOnlyHostPtr = NULL;
-  wr->writeOnlyLen = 0; 
-
-  wr->callbackFn = cb; 
-
-  wr->id = 0; 
-
-  wr->executing = 0; 
-
-  enqueue(wrQueue, wr); 
+  enqueue(wrQueue, &wr); 
 
 }
 
@@ -43,7 +27,7 @@ void kernelSelect(workRequest *wr) {
   switch (wr->id) {
   case 0: 
     printf("calling kernel\n"); 
-    helloKernel<<<wr->dimGrid,wr->dimBlock,wr->smemSize>>>();
+    helloKernel<<<wr->dimGrid,wr->dimBlock,wr->smemSize, kernel_stream>>>();
     break;
   default:
     printf("error: id %d not valid\n", wr->id); 
similarity index 65%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/Makefile
rename to examples/charm++/cuda/overlapTestGPUManager/Makefile
index fa397a1a6d6ba19177108020311aacaa7a1f2c37..50f0e2c1b897286c3a04ad242f04064763f0f4b4 100644 (file)
@@ -1,17 +1,16 @@
-CHARMC=../../../../../net-linux-amd64-cuda/bin/charmc $(OPTS)
+CHARMC=../../../../bin/charmc $(OPTS)
 
 OBJS = overlapTest.o overlapTestCU.o
 
 NVCC = /usr/local/cuda/bin/nvcc 
 NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
 NVCC_INC = -I/usr/local/cuda/include  -I../../../../../src/arch/cuda/hybridAPI
-
-export LD_RUN_PATH = /usr/local/cuda/lib64
+CHARMINC = -I../../../../include
 
 all: overlapTest
 
 overlapTest: $(OBJS)
-       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS)
 
 overlapTest.decl.h: overlapTest.ci
        $(CHARMC)  overlapTest.ci
@@ -23,4 +22,7 @@ overlapTest.o: overlapTest.C overlapTest.decl.h
        $(CHARMC) -O3 -c overlapTest.C
 
 overlapTestCU.o: overlapTest.cu
-       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) $(CHARMINC) -o overlapTestCU.o overlapTest.cu
+
+test: all
+       ./charmrun overlapTest +p2 2 8
\ No newline at end of file
similarity index 92%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.C
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTest.C
index 68830a1b3b19cb201cb7e022dbf8802dcbcebe43..7bdbcd57922ea4a457226833982a8bf6562eb942 100644 (file)
@@ -1,7 +1,7 @@
 #include "overlapTest.decl.h"
 #include "overlapTest.h"
 
-// #define DEBUG
+#define DEBUG
 
 extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C, int myIndex, void *cb); 
 extern void hostMemorySetup(int matrixSize, ElementType **h_A, ElementType **h_B, ElementType **h_C, void *cb); 
@@ -72,16 +72,6 @@ void Workers::beginWork() {
 void Workers::complete() {
   int size = matrixSize * matrixSize * sizeof(ElementType); 
   memcpy(C, h_C, size); 
-
-  for (int i=0; i<matrixSize; i++) {
-    for (int j=0; j<matrixSize; j++) {
-      C[i*matrixSize + j] = 0; 
-      for (int k=0; k<matrixSize; k++) {
-       C[i*matrixSize + j] += A[i*matrixSize +k] * B[k * matrixSize + j];
-      }
-    }
-  }
-
 #ifdef DEBUG
   CkPrintf("[%d] A\n", thisIndex); 
   for (int i=0; i<matrixSize; i++) {
similarity index 77%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTest.cu
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTest.cu
index 4a00f57559aaf7db19f3812dc90409a6f700cec6..746369cd6d96515fc02476e10163e65d3b27a163 100644 (file)
@@ -1,16 +1,9 @@
 #include "overlapTestConsts.h"
 #include "wr.h"
 #include <stdio.h>
+#include <math.h>
 
-#if CHECK_BANK_CONFLICTS
-#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
-#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
-#else
-#define AS(i, j) As[i][j]
-#define BS(i, j) Bs[i][j]
-#endif
-
-// matrix multiplication code taken from the CUDA SDK
+// matrix multiplication code adapted from the CUDA SDK
 
 __global__ void
 matrixMul(float* C, float* A, float* B, int wA, int wB)
@@ -59,8 +52,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // Load the matrices from device memory
         // to shared memory; each thread loads
         // one element of each matrix
-        AS(ty, tx) = A[a + wA * ty + tx];
-        BS(ty, tx) = B[b + wB * ty + tx];
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];
 
         // Synchronize to make sure the matrices are loaded
         __syncthreads();
@@ -69,7 +62,7 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // each thread computes one element
         // of the block sub-matrix
         for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
+            Csub += As[ty][k] * Bs[k][tx];
 
         // Synchronize to make sure that the preceding
         // computation is done before loading two new
@@ -113,11 +106,6 @@ void hostMemoryCleanup(ElementType *h_A, ElementType *h_B, ElementType *h_C) {
   delayedFree(h_B); 
   delayedFree(h_C);
 
-  /*
-  cudaFreeHost(h_A); 
-  cudaFreeHost(h_B); 
-  cudaFreeHost(h_C); 
-  */
 }
 
 void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B, 
@@ -127,7 +115,8 @@ void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B,
 
   workRequest matmul; 
   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
-  matmul.dimGrid = dim3(matrixSize / threads.x, matrixSize / threads.y);
+  matmul.dimGrid = dim3( ceil((float)matrixSize / threads.x), 
+                        ceil((float)matrixSize / threads.y) );
   matmul.dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE); 
   matmul.smemSize = 0; 
   matmul.nBuffers = 3; 
@@ -164,43 +153,6 @@ void cudaMatMul(int matrixSize, ElementType *h_A, ElementType *h_B,
   memcpy(matmul.userData, &matrixSize, sizeof(int)); 
 
   enqueue(wrQueue, &matmul); 
-
-  /*
-  cudaStream_t stream; 
-  cudaStreamCreate(&stream); 
-  ElementType *h_A, *h_B, *h_C; 
-  ElementType *d_A, *d_B, *d_C;
-
-
-  cudaMalloc((void **) &d_A, size);
-  cudaMalloc((void **) &d_B, size);
-  cudaMalloc((void **) &d_C, size);
-
-  cudaMemcpyAsync(d_A, A, size, cudaMemcpyHostToDevice, stream); 
-  cudaMemcpyAsync(d_B, B, size, cudaMemcpyHostToDevice, stream); 
-
-  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
-  dim3 grid(matrixSize / threads.x, matrixSize / threads.y);
-  
-  // execute the kernel
-  matrixMul<<< grid, threads, 0, stream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
-
-  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream); 
-
-  memcpy(C, h_C, size);
-
-  cudaStreamSynchronize(stream); 
-
-  cudaFreeHost(h_A);
-  cudaFreeHost(h_B);
-  cudaFreeHost(h_C);
-  
-  cudaFree(d_A);
-  cudaFree(d_B);
-  cudaFree(d_C);
-
-  cudaStreamDestroy(stream); 
-  */
 }
 
 void kernelSelect(workRequest *wr) {
similarity index 89%
rename from examples/charm++/cuda/gpuManager/overlapTestGPUManager/overlapTestConsts.h
rename to examples/charm++/cuda/overlapTestGPUManager/overlapTestConsts.h
index b304a88e265b43e3a644d2200de63ccb74d104c6..4e32906b305d024aef21f82c745252f813e1fe09 100644 (file)
@@ -3,7 +3,6 @@
 
 typedef float ElementType; 
 #define BLOCK_SIZE 8
-#define CHECK_BANK_CONFLICTS 0
 #define MATMUL_KERNEL 1000
 
 #define BUFFERS_PER_CHARE 3
index 235a7011b740010b3d80c4b8f7bcde87cb32340f..451dee09668d223b8b0a5efb71f6c2e56fb5cbf0 100644 (file)
@@ -1,19 +1,16 @@
-CHARMC=../../../../net-linux-amd64/bin/charmc $(OPTS)
+CHARMC=../../../../bin/charmc $(OPTS)
 
 OBJS = overlapTest.o overlapTestCU.o
 
 NVCC = /usr/local/cuda/bin/nvcc 
 NVCC_FLAGS = -O3 -c -use_fast_math #-device-debug -deviceemu 
-NVCC_INC = -I/usr/local/cuda/include -I../../../../NVIDIA_CUDA_SDK/common/inc -I../../charm/src/arch/cuda/hybridAPI/
-
-LD_LIBS += -lcuda -lcudart -lGL -lGLU -L../../../../NVIDIA_CUDA_SDK/lib
-
-#export LD_RUN_PATH = /usr/local/cuda/lib
-
+NVCC_INC = -I/usr/local/cuda/include 
+CHARMINC = -I../../../../include
+NVCC_LIBS = -L/usr/local/cuda/lib64 -lcuda -lcudart
 all: overlapTest
 
 overlapTest: $(OBJS)
-       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(LD_LIBS) -tracemode projections
+       $(CHARMC) -language charm++ -o overlapTest $(OBJS) $(NVCC_LIBS) 
 
 overlapTest.decl.h: overlapTest.ci
        $(CHARMC)  overlapTest.ci
@@ -25,4 +22,7 @@ overlapTest.o: overlapTest.C overlapTest.decl.h
        $(CHARMC) -O3 -c overlapTest.C
 
 overlapTestCU.o: overlapTest.cu
-       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) -o overlapTestCU.o overlapTest.cu
+       $(NVCC) $(NVCC_FLAGS) $(NVCC_INC) $(CHARMINC) -o overlapTestCU.o overlapTest.cu
+
+test: all
+       ./charmrun overlapTest +p2 2 8 
\ No newline at end of file
index b68368c1f3e0165cece41ff1c30c32336a51d069..a8a4d8ea058c193196a7fdb69f62e31165552dbf 100644 (file)
@@ -1,7 +1,7 @@
 #include "overlapTest.decl.h"
 #include "overlapTest.h"
 
-// #define DEBUG
+#define DEBUG
 
 extern void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C); 
 CProxy_Main mainProxy; 
index 58ad273b9ba37bca70b7b4502f5bd12c0220b629..dfd5ec308947d71c5c58db6a622545ad31573c1a 100644 (file)
@@ -1,13 +1,5 @@
 #include "overlapTestConsts.h"
 
-#if CHECK_BANK_CONFLICTS
-#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
-#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
-#else
-#define AS(i, j) As[i][j]
-#define BS(i, j) Bs[i][j]
-#endif
-
 // matrix multiplication code taken from the CUDA SDK
 
 __global__ void
@@ -57,8 +49,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // Load the matrices from device memory
         // to shared memory; each thread loads
         // one element of each matrix
-        AS(ty, tx) = A[a + wA * ty + tx];
-        BS(ty, tx) = B[b + wB * ty + tx];
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];
 
         // Synchronize to make sure the matrices are loaded
         __syncthreads();
@@ -67,7 +59,7 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
         // each thread computes one element
         // of the block sub-matrix
         for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
+            Csub += As[ty][k] * Bs[k][tx];
 
         // Synchronize to make sure that the preceding
         // computation is done before loading two new
@@ -82,8 +74,8 @@ matrixMul(float* C, float* A, float* B, int wA, int wB)
 }
 
 void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C) {
-  cudaStream_t stream; 
-  cudaStreamCreate(&stream); 
+  cudaStream_t matMulStream; 
+  cudaStreamCreate(&matMulStream); 
   ElementType *h_A, *h_B, *h_C; 
   ElementType *d_A, *d_B, *d_C;
   int size = matrixSize * matrixSize * sizeof(ElementType);
@@ -99,18 +91,18 @@ void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C)
   memcpy(h_A, A, size);
   memcpy(h_B, B, size); 
 
-  cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream); 
-  cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream); 
+  cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, matMulStream); 
+  cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, matMulStream); 
 
   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
   dim3 grid(matrixSize / threads.x, matrixSize / threads.y);
   
   // execute the kernel
-  matrixMul<<< grid, threads, 0, stream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
+  matrixMul<<< grid, threads, 0, matMulStream >>>(d_C, d_A, d_B, matrixSize, matrixSize);  
 
-  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream); 
+  cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, matMulStream); 
 
-  cudaStreamSynchronize(stream); 
+  cudaStreamSynchronize(matMulStream); 
 
   memcpy(C, h_C, size);
 
@@ -122,5 +114,5 @@ void cudaMatMul(int matrixSize, ElementType *A, ElementType *B, ElementType *C)
   cudaFree(d_B);
   cudaFree(d_C);
 
-  cudaStreamDestroy(stream); 
+  cudaStreamDestroy(matMulStream); 
 }
index 375dea4dfe3ce37a1d30c392731d78935092d4c5..76d4f2afc8f8100c9c9aea6e1bec782cb2b52ac4 100644 (file)
@@ -2,7 +2,6 @@
 #define __OVERLAP_TEST_CONSTS_H
 
 typedef float ElementType; 
-#define BLOCK_SIZE 16
-#define CHECK_BANK_CONFLICTS 0
+#define BLOCK_SIZE 8
 
 #endif
index cf94d8b125057b2ac19f4f00bfa3d8391891db49..01517d6734143a84cca8d766f10b2960c7751b10 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 #include "gaussSeidel3d.decl.h"
 #include "TopoManager.h"
index 7a68daa965caff63c1fb11cce94e5f85b55b6d5f..ffcd1a2c5fed014c99a2f7f0ffac56e0e1074b24 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file jacobi1d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: July 16th, 2009
index 0a120080927949af4d22888cdce05c27c0e68988..b87f4112e9c7de2acbbaa0ffcc0c6d712e639308 100644 (file)
@@ -237,7 +237,7 @@ class Block: public CBase_Block {
     }
 
     void pup(PUP::er &p){
-      ArrayElement1D::pup(p); //pack our superclass
+      CBase_Block::pup(p);
       p(numNeighbors);
       p(numNborsRcvd);
 
index 0e549a2a2f207e8b83968997dadf9183ebdd8a30..73f246dbdd2d7bfea0b627281427230ad3661beb 100644 (file)
@@ -24,8 +24,8 @@ clean:
        rm -f *.decl.h *.def.h conv-host *.o stencil3d stencil3d.prj charmrun *~
 
 test: stencil3d
-       ./charmrun +p4 ./stencil3d 32 16 +balancer GreedyLB $(TESTOPTS)
-       ./charmrun +p4 ./stencil3d 32 16 +balancer RefineLB $(TESTOPTS)
+       ./charmrun +p4 ./stencil3d 64 32 +balancer RefineLB $(TESTOPTS)
+       ./charmrun +p4 ./stencil3d 64 32 +balancer GreedyLB $(TESTOPTS)
 
 bgtest: stencil3d
        ./charmrun +p4 ./stencil3d 32 16 +balancer CommLB +x2 +y2 +z1 +cth1 +wth1
diff --git a/examples/charm++/pupDisk/Makefile b/examples/charm++/pupDisk/Makefile
new file mode 100644 (file)
index 0000000..6de4fc8
--- /dev/null
@@ -0,0 +1,24 @@
+CHARMC=../../../bin/charmc $(OPTS)
+
+OBJS = pupDisk.o
+
+all: pupDisk
+
+pupDisk: $(OBJS)
+       $(CHARMC) -language charm++ -o pupDisk $(OBJS)
+
+pupDisk.decl.h: pupDisk.ci
+       $(CHARMC)  pupDisk.ci
+
+# diskfile.* are the data files the program itself writes when it runs
+clean:
+       rm -f *.decl.h *.def.h conv-host *.o pupDisk charmrun diskfile.*
+
+pupDisk.o: pupDisk.C pupDisk.h someData.h pupDisk.decl.h
+       $(CHARMC) -c pupDisk.C
+
+# second run passes "r": skip init/write and verify the files of the first run
+test: all
+       ./charmrun pupDisk +p4 10 20 4 $(TESTOPTS)
+       ./charmrun pupDisk +p4 10 20 4 r $(TESTOPTS)
+
+bgtest: all
+       ./charmrun pupDisk +p4 10 +x2 +y2 +z1
diff --git a/examples/charm++/pupDisk/README b/examples/charm++/pupDisk/README
new file mode 100644 (file)
index 0000000..6826823
--- /dev/null
@@ -0,0 +1,30 @@
+Simple example of using PUP::toDisk and PUP::fromDisk for IO.
+
+The userData array represents a typical application chare array.  It has
+numElements elements and is constructed and placed without regard to I/O
+practicalities.  Each element contains a piece of data for which we want
+some file storage.
+
+Each pupDisk element is our intermediary to the file system.  The
+pupDisk array has numFiles elements and should be placed one per node.
+numFiles could go as high as the number of cores, but at that limit it
+will not perform well.  Each pupDisk element is responsible for the
+output of numElements/numFiles (rounded up) userData elements:
+specifically, the contiguous index space
+[numElements/numFiles*thisIndex : numElements/numFiles*(thisIndex+1)).
+The last one may hold fewer, and numFiles may be reduced to make this work.
+
+userData and pupDisk communicate using point to point sends.
+
+By default, the userData array will be initialized, pupped to disk, pupped from
+disk, and verified.
+
+If the 4th command line argument (after numElements, size and maxFiles)
+is r, it will skip initialization and writing, to read from disk and verify.
+
+Make test embodies a simple working example of writing to disk and
+verifying that it works.  Theoretically the pup files should be
+portable across platforms, but I have not verified that for this
+example.
+
+
diff --git a/examples/charm++/pupDisk/pupDisk.C b/examples/charm++/pupDisk/pupDisk.C
new file mode 100644 (file)
index 0000000..47ee339
--- /dev/null
@@ -0,0 +1,248 @@
+////////////////////////////////////
+//
+//  pupDisk.C
+//
+//  Definition of chares in pupDisk
+//
+//  Author: Eric Bohm
+//  Date: 2012/01/23
+//
+////////////////////////////////////
+
+#include "pupDisk.h"
+CkCallback icb, rcb, wcb, vcb;
+CProxy_userData userDataProxy;
+CProxy_pupDisk pupDiskProxy;
+int numElementsPer;
+// Parses the command line, settles the elements-per-file layout, creates
+// the pupDisk and userData arrays and starts the first phase.
+// usage: pupDisk [numElements [size [maxFiles [r]]]]
+main::main(CkArgMsg *m)
+{
+  int numElements=10;
+  int size=20;
+  bool skipToRead=false;
+  int maxFiles=CkNumPes();
+  if(CmiCpuTopologyEnabled())
+    {
+      maxFiles=CmiNumPhysicalNodes();
+    }
+  if(m->argc>1)
+    numElements=atoi(m->argv[1]);
+  if(m->argc>2)
+    size=atoi(m->argv[2]);
+  if(m->argc>3)
+    maxFiles=atoi(m->argv[3]);
+  if(m->argc>4)
+    skipToRead=(m->argv[4][0]=='r');
+  delete m;
+  // atoi() yields 0 for junk input; reject that before maxFiles is used as
+  // a divisor and before size is used as an array length
+  if(maxFiles<=0 || size<=0)
+    CkAbort("size and maxFiles must both be positive");
+  if(numElements/maxFiles<=0)
+    CkAbort("This works better with more elements than files");
+  //rejigger their choices, possibly reducing the number of files below max
+  numElementsPer=numElements/maxFiles;
+  if(numElements%maxFiles>0) ++numElementsPer;
+  maxFiles=numElements/numElementsPer;
+  if(numElements%numElementsPer) ++maxFiles;
+  CkPrintf("pupDisk numElements %d howBig %d maxFiles %d skip %d elements per file %d\n", numElements, size, maxFiles, skipToRead, numElementsPer);
+  // one reduction client per phase; each one triggers the next phase
+  icb = CkCallback(CkIndex_main::initialized(NULL),  thishandle);
+  wcb = CkCallback(CkIndex_main::written(NULL),  thishandle);
+  rcb = CkCallback(CkIndex_main::read(NULL),  thishandle);
+  vcb = CkCallback(CkIndex_main::done(NULL),  thishandle);
+  CProxy_pupDiskMap diskMapProxy = CProxy_pupDiskMap::ckNew(maxFiles);
+  CkArrayOptions mapOptions(maxFiles);
+  mapOptions.setMap(diskMapProxy);
+  pupDiskProxy= CProxy_pupDisk::ckNew(size,numElements,maxFiles,mapOptions);
+  pupDiskProxy.doneInserting();
+  userDataProxy= CProxy_userData::ckNew(size,numElements,maxFiles, numElements);
+  userDataProxy.doneInserting();
+  if(skipToRead)
+    {
+      CkPrintf("reading data\n");
+      userDataProxy.read();
+    }
+  else
+    {
+      userDataProxy.init();
+    }
+}
+
+
+
+// Client of the icb reduction: starts the write phase.
+void main::initialized(CkReductionMsg *m)
+  {
+    delete m; // the payload is unused, but the message must still be freed
+    CkPrintf("writing data\n");
+    userDataProxy.write();
+  }
+// Client of the wcb reduction: starts the read phase.
+void main::written(CkReductionMsg *m)
+  {
+    delete m; // the payload is unused, but the message must still be freed
+    CkPrintf("reading data\n");
+    userDataProxy.read();
+  }
+// Client of the rcb reduction: starts the verify phase.
+void main::read(CkReductionMsg *m)
+  {
+    delete m; // the payload is unused, but the message must still be freed
+    CkPrintf("verifying data\n");
+    userDataProxy.verify();
+  }
+
+// Stamps every entry of this element's buffer with the element's own array
+// index, then joins the reduction whose client is icb.
+void userData::init()
+{
+  CkAssert(myData);
+  int pos=0;
+  while(pos<howBig)
+    {
+      myData->data[pos]=thisIndex;
+      ++pos;
+    }
+  contribute(sizeof(int), &thisIndex, CkReduction::sum_int, icb);
+}
+
+// Checks that every entry of the buffer still equals this element's index.
+// The first mismatch is reported and aborts the run; otherwise the element
+// joins the reduction whose client is vcb.
+void userData::verify()
+{
+  CkAssert(myData);
+  int pos=0;
+  while(pos<howBig)
+    {
+      if(myData->data[pos]!=thisIndex)
+        {
+          CkPrintf("[%d] element %d corrupt as %d\n", thisIndex, pos, myData->data[pos]);
+          CkAbort("corrupt element");
+        }
+      ++pos;
+    }
+  CkPrintf("[%d] verified\n",thisIndex);
+  contribute(sizeof(int), &thisIndex, CkReduction::sum_int, vcb);
+}
+
+// Hands this element's data to the pupDisk element that owns its file;
+// consecutive runs of numElementsPer indices share one file.
+void userData::write()
+{
+  //  CkPrintf("[%d] userData write to file %d\n",thisIndex,fileNum);
+  pupDiskProxy[thisIndex/numElementsPer].write(thisIndex, *myData);
+}
+
+// Asks the pupDisk element that owns this element's file for its data.
+void userData::read()
+{
+  pupDiskProxy[thisIndex/numElementsPer].read(thisIndex);
+}
+
+// Copies howBig entries of the delivered data into this element's buffer,
+// then joins the reduction whose client is rcb.
+void userData::acceptData(someData &inData)
+{
+  int pos=0;
+  while(pos<howBig)
+    {
+      myData->data[pos]=inData.data[pos];
+      ++pos;
+    }
+  contribute(sizeof(int), &thisIndex, CkReduction::sum_int, rcb);
+}
+
+// Sizes the cache for the userData elements this file is responsible for.
+pupDisk::pupDisk(int _howbig, int _numElements, int _maxFiles): howBig(_howbig), numElements(_numElements), maxFiles(_maxFiles)
+  { elementsToWrite=numElementsPer; 
+    // the last file only holds the remainder when the split is uneven
+    if(thisIndex==maxFiles-1 && numElements%numElementsPer>0) elementsToWrite=numElements%numElementsPer; 
+    dataCache=new someData[elementsToWrite]; 
+    count=0; 
+    nextSlot=0; 
+    doneRead=false; // diskRead() branches on this, so it must not start out indeterminate
+    //    CkPrintf("[%d] pupDisk constructed expecting elementsToWrite %d for / %d and %% %d\n",thisIndex, elementsToWrite, numElements/maxFiles, numElements%maxFiles);
+  }
+
+
+
+// Makes sure the file has been loaded, then sends the requesting userData
+// element the cached entry recorded for it.
+void pupDisk::read(int sender)
+{
+  if(diskRead(sender))
+    {
+      // look the sender up explicitly: dereferencing end() for an index the
+      // file knows nothing about would be undefined behaviour
+      std::map<int,int>::iterator slot=lookupIdx.find(sender);
+      if(slot==lookupIdx.end())
+        {
+          CkPrintf("[%d] no entry for element %d in file index\n", thisIndex, sender);
+          CkAbort("requested element is not in this file");
+        }
+      userDataProxy[sender].acceptData(dataCache[slot->second]);
+    }
+}
+
+// Loads this element's entire file into dataCache the first time it is
+// called; later calls do nothing.  Returns doneRead, which is true once the
+// load has been attempted (every failure path aborts the run instead).
+bool pupDisk::diskRead(int sender)
+{
+  if(!doneRead)
+    {
+      // get stuff from disk
+
+      // a more complicated caching scheme could pull less than the
+      // entire file and use a per entry flag system to track what is
+      // in cache.
+      doneRead=true;      
+      //      CkPrintf("[%d] reading from file for %d\n",thisIndex, sender);
+      // fixed-size stack buffer: nothing to free on the abort paths
+      char d[512];
+      snprintf(d, sizeof(d), "%s.%d.%d.%d", "diskfile", numElements, howBig, thisIndex);
+      // "b": the pup stream is raw binary, keep text-mode translation out of it
+      FILE *f = fopen(d,"rb");
+      if (f == NULL) {
+       CkPrintf("[%d] Open failed with %s. \n", CkMyPe(), d);
+       CkAbort("\n");
+      }
+      // A simple scheme would require the user be consistent in their
+      // parameter choices across executions.  A more elaborate scheme
+      // codifies them in a block so the reader can do a lookup for
+      // the parameters used during writing.
+      PUP::fromDisk pd(f);
+      PUP::machineInfo machInfor;
+      pd((char *)&machInfor, sizeof(machInfor));       // machine info
+      if (!machInfor.valid()) {
+       CkPrintf("Invalid machineInfo on disk file when reading %d!\n", thisIndex);
+       CkAbort("");
+      }
+      // translate from the writer's machine format to ours while unpacking
+      PUP::xlater p(machInfor, pd);
+      int elementsToWriteFile;
+      p|elementsToWriteFile;
+      // safety check, for some formats you might be able to adjust
+      // properly if the file's parameters disagree from your instance's.
+      // This implementation is not that smart.
+      if(elementsToWriteFile==elementsToWrite)
+       {
+         p|lookupIdx;
+         for(int i=0;i<elementsToWrite;++i)
+           {
+             dataCache[i].pup(p);
+           }
+       }
+      else
+       {
+         CkAbort("a pox upon your file format");
+       }
+      fclose(f);
+    }
+  return doneRead;
+}
+
+// Caches one userData element's payload.  Once every element this file is
+// responsible for has checked in, the whole cache is flushed to disk.
+void pupDisk::write(int sender, someData &inData)
+{
+  //  CkPrintf("[%d] pupDisk write for sender %d with count %d of elementsToWrite %d\n",thisIndex, sender, count, elementsToWrite);
+  // a sender that writes again reuses its slot; only a new sender takes the
+  // next free one, so dataCache (elementsToWrite entries) cannot be overrun
+  std::map<int,int>::iterator known=lookupIdx.find(sender);
+  int slot;
+  if(known!=lookupIdx.end())
+    {
+      slot=known->second;
+    }
+  else
+    {
+      if(nextSlot>=elementsToWrite)
+        CkAbort("pupDisk::write received more senders than it has slots");
+      slot=nextSlot++;
+      lookupIdx[sender]=slot;
+    }
+  dataCache[slot]=inData;
+  if(++count==elementsToWrite) 
+    {
+      count=0; // count afresh so that a later round of writes flushes too
+      diskWrite();
+    }
+}
+
+// Writes the machine info, the element count, the sender->slot index and
+// every cached entry to this element's file, then joins the reduction whose
+// client is wcb.  Any I/O failure aborts the run.
+void pupDisk::diskWrite()
+{
+  //  CkPrintf("[%d] writing to file\n",thisIndex);
+  // fixed-size stack buffer: nothing to free on the abort paths
+  char d[512];
+  snprintf(d, sizeof(d), "%s.%d.%d.%d", "diskfile", numElements, howBig, thisIndex);
+  // check before anything reaches the file, not after it has been written
+  const PUP::machineInfo &machInfow = PUP::machineInfo::current();
+  if(!machInfow.valid())
+    {
+      CkPrintf("Invalid machineInfo on disk file when writing %d!\n", thisIndex);
+      CkAbort("");
+    }
+  FILE *f;
+  struct stat sb;
+  // a+ will force appending, which is not what we want
+  // "b": the pup stream is raw binary, keep text-mode translation out of it
+  if(stat(d,&sb)==-1){
+      f = fopen(d,"wb");  
+  }
+  else
+    {
+      f = fopen(d,"r+b");
+    }
+  if (f == NULL) {
+    CkPrintf("[%d] Open for writing failed with %s \n", CkMyPe(), d);
+    CkAbort("\n");
+  }
+  PUP::toDisk p(f);
+  //  CkPrintf("[%d] writing machineInfo %d bytes\n",thisIndex,sizeof(machInfow));
+  p((char *)&machInfow, sizeof(machInfow));       // machine info
+  p|elementsToWrite;
+  p|lookupIdx;
+  for(int i=0; i<elementsToWrite;i++)
+    dataCache[i].pup(p);
+  // buffered write errors only surface here; do not report success over them
+  if(fflush(f)!=0 || fclose(f)!=0)
+    {
+      CkPrintf("[%d] Flushing %s to disk failed\n", CkMyPe(), d);
+      CkAbort("\n");
+    }
+  contribute(sizeof(int), &thisIndex, CkReduction::sum_int, wcb);
+}
+
+
+
+#include "pupDisk.def.h"
diff --git a/examples/charm++/pupDisk/pupDisk.ci b/examples/charm++/pupDisk/pupDisk.ci
new file mode 100644 (file)
index 0000000..04e1f2a
--- /dev/null
@@ -0,0 +1,48 @@
+////////////////////////////////////////////////
+//
+//  pupDisk.ci
+//
+//  Interface file for pupDisk
+//
+//  Author: Eric Bohm
+//  Date: 2012/1/23
+//
+////////////////////////////////////////////////
+
+// Interface of the pupDisk example: one main chare, a map group that places
+// the pupDisk array, and the userData and pupDisk chare arrays.
+mainmodule pupDisk {
+  include "someData.h";
+  // reduction clients set up in main::main, one per phase:
+  // icb -> initialized, wcb -> written, rcb -> read, vcb -> done
+  readonly CkCallback icb;
+  readonly CkCallback wcb;
+  readonly CkCallback rcb;
+  readonly CkCallback vcb;
+  readonly CProxy_userData userDataProxy;
+  readonly CProxy_pupDisk pupDiskProxy;
+  // number of userData elements that share one file (computed in main::main)
+  readonly int numElementsPer;
+  mainchare main {
+    entry main();
+    entry void done(CkReductionMsg *);
+    entry void initialized(CkReductionMsg *);
+    entry void written(CkReductionMsg *);
+    entry void read(CkReductionMsg *);
+
+  };
+  // array map handed to the pupDisk array through CkArrayOptions::setMap
+  group pupDiskMap : CkArrayMap {
+          entry pupDiskMap(int maxFiles);
+  };
+
+  // application data; each element talks to the pupDisk element that owns its file
+  array [1D] userData{
+    entry userData(int howbig, int numElements, int maxFiles);
+    entry void init();
+    entry void write();
+    entry void read();
+    entry void verify();    
+    entry void acceptData(someData &inData);
+  };
+
+  // one element per file; caches its userData elements and does the file I/O
+  array [1D] pupDisk{
+    entry pupDisk(int howbig, int numElements, int maxFiles);
+    entry void write(int sender, someData &data);
+    entry void read(int sender);
+  };
+
+};
diff --git a/examples/charm++/pupDisk/pupDisk.h b/examples/charm++/pupDisk/pupDisk.h
new file mode 100644 (file)
index 0000000..5fa1aba
--- /dev/null
@@ -0,0 +1,108 @@
+//////////////////////////////////////
+//
+//  pupDisk.h  
+//
+//  Declaration of chares in pupDisk
+//
+//  Author: Eric Bohm
+//  Date: 2012/01/23
+//
+//////////////////////////////////////
+
+#include "pupDisk.decl.h"
+#include <map>
+#include "pup_stl.h"
+// Main chare: sets the example up and receives one reduction per phase
+// (initialized, written, read, done).
+class main : public CBase_main {
+public:
+  main(CkMigrateMessage *m) {}
+  main(CkArgMsg *m);
+  // final reduction client: every element has verified, so shut down
+  void done(CkReductionMsg *m){
+    delete m; // the payload is unused, but the message must still be freed
+    CkPrintf("done\n");
+    CkExit();
+  }
+  void initialized(CkReductionMsg *m);
+  void written(CkReductionMsg *m);
+  void read(CkReductionMsg *m);
+
+};
+
+// Array map for the pupDisk array: spreads the maxFiles elements over
+// distinct nodes when there are enough nodes, otherwise over PEs.
+class pupDiskMap: public CkArrayMap
+{
+ public:
+  int maxFiles;
+ pupDiskMap(int _maxFiles):maxFiles(_maxFiles) {}
+  // Returns the PE that should host the pupDisk element with this index.
+  // Indices run from 0 to maxFiles-1, so ">=" is enough for a one-to-one
+  // placement; that includes the default case where maxFiles equals the
+  // number of nodes.
+  inline int procNum(int, const CkArrayIndex &iIndex)
+  {
+    int *index=(int *) iIndex.data();
+    int proc;
+    if(CmiCpuTopologyEnabled())
+      { // use physnode API
+       if(CmiNumPhysicalNodes() >= maxFiles)
+         {
+           proc=CmiGetFirstPeOnPhysicalNode(index[0]);
+         }
+       else
+         { 
+           //cleverness could be tried, but we really don't care because you 
+           //want more files than is good for you.
+           proc=index[0]%CmiNumPes();
+         }
+      }
+    else
+      {
+       if(CmiNumNodes()>=maxFiles)
+         {// one file per node, stepping by this node's size
+           proc=index[0]*CmiMyNodeSize();
+         }
+       else if (CmiNumPes()>=maxFiles)
+         { //simple round robin because we don't really care
+           proc=index[0];
+         }
+       else //there is no good mapping
+         {
+           proc=index[0]%CkNumPes();
+         }
+      }
+    return proc;
+  }
+  
+};
+
+// One element of the application array.  Owns a someData buffer of howBig
+// ints that is initialised, written, read back and verified in turn.
+class userData : public CBase_userData {
+public:
+  // nothing is allocated here, so start from a state the destructor can
+  // safely run on
+  userData(CkMigrateMessage *m): myData(NULL), howBig(0), numElements(0), maxFiles(0) {}
+ userData(int _howbig, int _numElements, int _maxFiles): howBig(_howbig), numElements(_numElements), maxFiles(_maxFiles){ myData=new someData(howBig);}
+  ~userData(){ delete myData;}
+  void init();
+  void read();
+  void write();
+  void writeDone();
+  void verify();
+  void acceptData(someData &inData);
+ private:
+  someData *myData;
+  int howBig;
+  int numElements;
+  int maxFiles;
+};
+
+// One element per file: caches the someData of the userData elements mapped
+// to it and moves the whole cache to or from disk in one go.
+class pupDisk : public CBase_pupDisk {
+public:
+  // nothing is allocated here, so start from a state the destructor can
+  // safely run on
+  pupDisk(CkMigrateMessage *m): dataCache(NULL), doneRead(false), count(0),
+    howBig(0), numElements(0), maxFiles(0), elementsToWrite(0), nextSlot(0) {}
+  pupDisk(int _howbig, int _numElements, int _maxFiles);
+  ~pupDisk(){ delete [] dataCache;} // the cache is allocated with new[]
+  void read(int sender);
+  void write(int sender, someData &data);
+  void diskWrite();
+  bool diskRead(int sender);
+ private:
+  someData *dataCache;           // elementsToWrite entries
+  bool doneRead;                 // set once the file has been loaded
+  int count;                     // writes received so far
+  int howBig;
+  int numElements;
+  int maxFiles;
+  int elementsToWrite;
+  std::map<int, int> lookupIdx;  // userData index -> slot in dataCache
+  int nextSlot;                  // next unused slot in dataCache
+};
diff --git a/examples/charm++/pupDisk/someData.h b/examples/charm++/pupDisk/someData.h
new file mode 100644 (file)
index 0000000..0b5bc2c
--- /dev/null
@@ -0,0 +1,27 @@
+class someData  // owns a heap array of howBig ints; every copy is deep
+{
+ public:
+  someData(int _howBig):howBig(_howBig){data=new int[howBig];}
+  someData(){data=NULL; howBig=0;}
+  // deep copy: a shallow one would leave two owners to delete [] one array
+  someData(const someData &indata):howBig(0),data(NULL){*this=indata;}
+  void pup(PUP::er &p)
+    {
+      // remember to pup your superclass if there is one
+      p|howBig;
+      if(p.isUnpacking())  // replace, rather than leak, any array held so far
+       { delete [] data; data=new int[howBig]; }
+      PUParray(p,data,howBig);
+    }
+  inline someData &operator=(const someData &indata) {
+    if(this==&indata) return *this; // else we would copy from freed memory
+    delete [] data; // unconditional: delete [] NULL is a no-op
+    howBig=indata.howBig;
+    data=new int[howBig];
+    for(int i=0; i<howBig; ++i) data[i]=indata.data[i];
+    return *this;
+  }
+  ~someData(){delete [] data;}
+  int howBig;
+  int *data;
+};
index d87cacaa0c94dc1daae6c43726785980fbe930f5..b8f096b99eabfed12bfdbafa37b9d9088919c744 100644 (file)
@@ -12,6 +12,7 @@ Msg::pack(Msg* m)
   *t = m->listsize; t++;
   for(int i=0;i<m->listsize; i++, t++)
     *t = m->list1[i];
+  delete [] m->list1;
   CkFreeMsg(m);
   return(p);
 }
index c4e9c20cee8aabba044c22de328292f2d940ddb4..614982ce67579b22512b6485789c6bc24e016719 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file jacobi2d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: October 24th, 2007
index a09944d21c1c8f8aba0593debb48c347d4ff7f4b..ef7479a08afe2d4d93503a94092f2d9856de02df 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file jacobi3d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: October 24th, 2007
index f4bdaef6be29d1a249898827817c33b2a2ff6340..572bbbedab902b538102199ed56349dc38888bc4 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file matmul3d.C
  *  Author: Abhinav S Bhatele
  *  Date Created: March 13th, 2008
index aaff993bf073a8499e7c67a62295ee8d0a3acd2d..ca2fd27eb1cc549eb062ff77b92717c0ca47f0f7 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** \file matmul3d.h
  *  Author: Abhinav S Bhatele
  *  Date Created: March 13th, 2008
index f0f7d449fb8cd397d66cccffaa99ddbc15516d5c..2da420e0740fd32176de95bef0996a5e808e2218 100644 (file)
@@ -9,7 +9,7 @@ TESTFLAGS = $(TESTOPTS)
 
 all: $(BINARY)
 $(BINARY): $(patsubst %.cc,%.o,$(wildcard *.cc))
-       $(CHARMC) $(CHARMCLINKFLAGS) -o $@ $+
+       $(CHARMC) $(CHARMCLINKFLAGS) -o $@ $+ 
 
 .SECONDARY: $(patsubst %.cc,%.decl.h,$(wildcard *.cc))
 .SECONDARY: $(patsubst %.cc,%.def.h,$(wildcard *.cc))
index 3d102f5a7471abed98d5c9e28e528f145f2b381a..a19b0f4ed926316164f0b3761b3b034df60b6fec 100644 (file)
@@ -15,7 +15,6 @@ void Driver::untyped_done(CkReductionMsg* m) {
     int* output = (int*)m->getData();
     CkPrintf("Untyped Sum: %d\n", output[0]);
     delete m;
-
     CkCallback *cb = new CkCallback(
             CkReductionTarget(Driver, typed_done), thisProxy);
     w.ckSetReductionClient(cb);
@@ -62,18 +61,18 @@ void Driver::typed_array_done3(int n, double* results)
 Worker::Worker() { }
 
 void Worker::reduce() {
-    int contribution = 1;
+    int contribution=1;
     contribute(1*sizeof(int), &contribution, CkReduction::sum_int); 
 }
 
 void Worker::reduce_array() {
-    int contribution[3] = { 1, 2, 3 };
-    contribute(3*sizeof(int), &contribution, CkReduction::sum_int); 
+    int contribution[3]={1,2,3};
+    contribute(3*sizeof(int), contribution, CkReduction::sum_int); 
 }
 
 void Worker::reduce_array_doubles() {
     double contribution[3] = { 0.16180, 0.27182, 0.31415 };
-    contribute(3*sizeof(double), &contribution, CkReduction::sum_double);
+    contribute(3*sizeof(double), contribution, CkReduction::sum_double);
 }
 
 #include "TypedReduction.def.h"
index a218543df5161e984e2f482a7b97b039ecd5b8d6..2b92a09754d61bff39a52dc82f1e72a19188e126 100644 (file)
@@ -9,8 +9,8 @@
 #include <stdlib.h>
 #include <converse.h>
 
-enum {nCycles =4096};
-enum { maxMsgSize = 1 << 17 };
+enum {nCycles = 1000};
+enum { maxMsgSize = 1 << 14 };
 
 CpvDeclare(int,msgSize);
 CpvDeclare(int,cycleNum);
@@ -120,7 +120,7 @@ CmiStartFn mymain()
     CpvInitialize(int,msgSize);
     CpvInitialize(int,cycleNum);
     
-    CpvAccess(msgSize)= 4 + CmiMsgHeaderSizeBytes;
+    CpvAccess(msgSize)= 512 + CmiMsgHeaderSizeBytes;
     
     CpvInitialize(int,exitHandler);
     CpvAccess(exitHandler) = CmiRegisterHandler((CmiHandler) exitHandlerFunc);
index 95c46b0bc69195b9f75d4ab3ca957abce525c140..a55f5b817f15fd0a4d0fd3e714cf3ebbeb2819db 100644 (file)
@@ -2,7 +2,7 @@
 # needs $(PGM)
 
 OPTS=
-CDIR=../../../..
+CDIR=../../..
 CHARMC=$(CDIR)/bin/charmc -language charm++ $(OPTS)
 
 # Rules to convert .ci to .decl.h and .def.h
diff --git a/examples/multiphaseSharedArrays/histogram/Makefile b/examples/multiphaseSharedArrays/histogram/Makefile
new file mode 100644 (file)
index 0000000..5c7f5c4
--- /dev/null
@@ -0,0 +1,3 @@
+
+PGM=histogram
+include ../Makefile_common
diff --git a/examples/multiphaseSharedArrays/histogram/headers b/examples/multiphaseSharedArrays/histogram/headers
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/examples/multiphaseSharedArrays/histogram/histogram b/examples/multiphaseSharedArrays/histogram/histogram
new file mode 100755 (executable)
index 0000000..b914917
Binary files /dev/null and b/examples/multiphaseSharedArrays/histogram/histogram differ
diff --git a/examples/multiphaseSharedArrays/histogram/histogram.C b/examples/multiphaseSharedArrays/histogram/histogram.C
new file mode 100644 (file)
index 0000000..0a49148
--- /dev/null
@@ -0,0 +1,130 @@
+// -*- mode: c++; tab-width: 4 -*-
+//
+#include "msa/msa.h"
+
+typedef MSA::MSA2D<int, DefaultEntry<int>,
+        MSA_DEFAULT_ENTRIES_PER_PAGE, MSA_ROW_MAJOR> MSA2D;
+typedef MSA::MSA1D<int, DefaultEntry<int>, MSA_DEFAULT_ENTRIES_PER_PAGE> MSA1D;
+
+#include "histogram.decl.h"
+
+
+const unsigned int ROWS = 2000;
+const unsigned int COLS = 2000;
+const unsigned int BINS = 10;
+const unsigned int MAX_ENTRY = 1000;
+unsigned int WORKERS = 10;
+
+
+// Main chare: creates the shared arrays and the worker array, and exits
+// once the workers' reduction arrives.
+class Driver : public CBase_Driver
+{
+public:
+    Driver(CkArgMsg* m)
+    {
+        // Usage: histogram [number_of_worker_threads]
+        if (m->argc > 1) WORKERS=atoi(m->argv[1]);
+        delete m;
+
+        // Actually build the shared arrays: a 2d array to hold arbitrary
+        // data, and a 1d histogram array.
+        MSA2D data(ROWS, COLS, WORKERS);
+        MSA1D bins(BINS, WORKERS);
+        // Create worker threads and start them off.  The proxy is only
+        // needed here, so it is declared as a local.
+        CProxy_Histogram workers =
+            CProxy_Histogram::ckNew(data, bins, WORKERS);
+        workers.ckSetReductionClient(
+            new CkCallback(CkIndex_Driver::done(NULL), thisProxy));
+        workers.start();
+    }
+
+    void done(CkReductionMsg* m)
+    {
+        // When the reduction is complete, everything is ready to exit.
+        delete m;
+        CkExit();
+    }
+};
+
+
+// Worker: element 0 fills the shared data array, every element histograms
+// its own band of rows into the shared bins, and element 0 prints them.
+class Histogram: public CBase_Histogram
+{
+public:
+    MSA2D data;
+    MSA1D bins;
+
+    Histogram(const MSA2D& data_, const MSA1D& bins_)
+    : data(data_), bins(bins_)
+    {}
+
+    Histogram(CkMigrateMessage* m)
+    {}
+
+    ~Histogram()
+    {}
+
+    // Note: it's important that start is a threaded entry method
+    // so that the blocking MSA calls work as intended.
+    void start()
+    {
+        data.enroll(WORKERS);
+        bins.enroll(WORKERS);
+        
+        // Fill the data array with random numbers.
+        MSA2D::Write wd = data.getInitialWrite();
+        if (thisIndex == 0) fill_array(wd);
+
+        // Fill the histogram bins: read from the data array and
+        // accumulate to the histogram array.
+        MSA2D::Read rd = wd.syncToRead();
+        MSA1D::Accum ab = bins.getInitialAccum();
+        fill_bins(ab, rd);
+
+        // Print the histogram.
+        MSA1D::Read rb = ab.syncToRead();
+        if (thisIndex == 0) print_array(rb);
+
+        // Contribute to Driver::done to terminate the program.
+        contribute();
+    }
+
+    void fill_array(MSA2D::Write& w)
+    {
+        // Just let one thread fill the whole data array
+        // with random entries to be histogrammed.
+        // 
+        // Note: this is potentially a very inefficient access
+        // pattern, especially if the MSA doesn't fit into
+        // memory, but it can be convenient.
+        for (unsigned int r = 0; r < data.getRows(); r++) {
+            for (unsigned int c = 0; c < data.getCols(); c++) {
+                w.set(r, c) = random() % MAX_ENTRY;
+            }
+        }
+    }
+
+    void fill_bins(MSA1D::Accum& b, MSA2D::Read& d)
+    {
+        // Determine the range of the data array that this
+        // worker should read from.
+        unsigned int range = ROWS / WORKERS;
+        unsigned int min_row = thisIndex * range;
+        unsigned int max_row = (thisIndex + 1) * range;
+        // ROWS need not divide evenly: the last worker also takes the
+        // ROWS % WORKERS rows that would otherwise never be counted.
+        if ((unsigned int)thisIndex + 1 == WORKERS) max_row = ROWS;
+        
+        // Count the entries that belong to each bin and accumulate
+        // counts into the bins.
+        for (unsigned int r = min_row; r < max_row; r++) {
+            for (unsigned int c = 0; c < data.getCols(); c++) {
+                unsigned int bin = d.get(r, c) / (MAX_ENTRY / BINS);
+                b(bin) += 1;
+            }
+        }
+    }
+
+    void print_array(MSA1D::Read& b)
+    {
+        for (unsigned int i=0; i<BINS; ++i) {
+            CkPrintf("%d ", b.get(i)); 
+        }
+        // terminate the line so the counts do not run into later output
+        CkPrintf("\n");
+    }
+};
+
+#include "histogram.def.h"
diff --git a/examples/multiphaseSharedArrays/histogram/histogram.ci b/examples/multiphaseSharedArrays/histogram/histogram.ci
new file mode 100644 (file)
index 0000000..e0dd682
--- /dev/null
@@ -0,0 +1,23 @@
+// -*- mode: c++; tab-width: 4 -*-
+mainmodule histogram
+{
+    /* Driver overwrites WORKERS from the command line and every Histogram
+       element reads it, so it has to be a readonly: a plain global keeps
+       its initial value in every process except the one running Driver. */
+    readonly unsigned int WORKERS;
+
+    mainchare Driver
+    {
+        entry void Driver(CkArgMsg*);
+        entry void done(CkReductionMsg*);
+    };
+
+    array[1D] Histogram
+    {
+        entry void Histogram(MSA2D data_, MSA1D bins_);
+        entry [threaded] void start();
+    };
+
+    
+    /* Currently, you must explicitly instantiate any
+       MSA templates that you use. */
+    group MSA_CacheGroup<int, DefaultEntry<int>,
+                         MSA_DEFAULT_ENTRIES_PER_PAGE>;
+    array [1D] MSA_PageArray<int, DefaultEntry<int>,
+                             MSA_DEFAULT_ENTRIES_PER_PAGE>;
+};
diff --git a/examples/multiphaseSharedArrays/histogram/run.sh b/examples/multiphaseSharedArrays/histogram/run.sh
new file mode 100755 (executable)
index 0000000..3cfcad3
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Runs the histogram example over a range of worker-thread and PE counts,
+# appending the output of every run to the file "outputs".
+#
+# The worker count is a run-time argument of histogram, so a single build
+# is enough.
+
+make OPTS=-O3 -s || exit 1
+
+touch outputs
+for num_workers in 1 2 4 8 16 32; do
+  for num_pes in 4 8 16 32; do
+    ./charmrun ./histogram $num_workers +p$num_pes >> outputs
+  done
+done
index f1f414978f4438650be357a28508cb5910ce1a3d..d95a8f2b4d7e6d15e48b363b438b279fb29b691a 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /* The data structure/ ADT for the edge-list */
 #include <stdio.h>
 #include <stdlib.h>
index ae68450cabf69d64a2df53e46077faaba152409d..345aa6bb19ec2e2b8361603b1a7c729236a8a14e 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /* Generate a random graph, given the number of nodes, the number of
    connections per node.
 
index 8a0e72244e208105a0aaa184e6b700d79c081063..26f9c0ca2504fdb726b9e9d1f153c2cd5c94f6a0 100644 (file)
@@ -1,11 +1,3 @@
-
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /**
  * \addtogroup CkLdb
 */
index d183642976b857886ec061cb5730a67212ac625e..e498e3bb1f3f76bc6bbbd9f013ccd1e70f83176e 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /**
  * \addtogroup CkLdb
 */
index c0f755028e86a017ab303bc4903ad73d82aced49..cf7001afe06930fbec05ebdc9c9b8aa04976bc5b 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef _TYPEDEFS_H
 #define _TYPEDEFS_H
 
index 9f368404dfde4d0c0f7b200eb4ce9e4eb189a695..8a2361f9e339c044d25b1ecb8371a41a4291d48c 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 4faa03ed99602bb7519649f6aa1571f4128db155..ff951a0d315740d2b8df406832bb3430d4859883 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 219479dfb3f525275be5668dc76e6c569a84ed9f..0df98de88dc9478ba49c535be64ca925262e8919 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index ba0fb77c3ce38ada3c4f22385bd528812f139330..158fe2703abc1ddbe89a89d1c3f7d626be63aae0 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 26b80a2e6f74d16a179aac3d0be84504c825538a..83537a3c20fccdeb9ebdde1100bb8961096ade0a 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index e3f0314f54d9f8f1af7dd546a579cb8d6d60a0d6..9e3ae8ba867c3271500411fd4526e54ebd3c2354 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 0156584218c7d2c747fff4b26b5c41f56aa864d8..df7e07a8544512ef1576c22c2ce92d116e161b78 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 562505dbaa3173a6071819ae398dccfc696b3d30..c584a681edd8b438edc5f09fddce005f994f116d 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 3182195e77f1925c5bd4f5b2d89296a940c7b595..6bf6db7f34cecd7539e03e96545bdbce4f5e4fed 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 23e40be596e2e590151f92976c5670382adbedcf..9b6b19d76a754573f483f082573cd64dc965ac2a 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 #include "qt.h"
 #include <setjmp.h>
index 177d9f61ed466f080de60adacfcca63ad5e2e2ce..a83b337201001645083f43be9c78a09451f5f03f 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 #include "qt.h"
 #include <setjmp.h>
@@ -32,7 +25,7 @@ struct helpdesc { qt_helper_t *hfn; qt_t *jb; void *old; void *new; };
 
 #define MAXTABLE 1000
 
-#if CMK_SMP && CMK_TLS_THREAD
+#if CMK_SMP && CMK_HAS_TLS_VARIABLES
 #define TLS_SPECIFIER         __thread
 #else
 #define TLS_SPECIFIER
index 08a90002cb354993a3861500587f13056fd8badc..f1f035e00a87d9dab694954d5e0a0d9a5d6b3e8f 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
 _setjmp/_longjmp is much light weighted than setjmp/longjmp
 because it does not store signal mask, which is not a problem for netpoll
@@ -39,7 +32,7 @@ struct helpdesc { qt_helper_t *hfn; qt_t *jb; void *old; void *new; };
 
 #define MAXTABLE 1000
 
-#if CMK_SMP && CMK_TLS_THREAD
+#if CMK_SMP && CMK_HAS_TLS_VARIABLES
 #define TLS_SPECIFIER         __thread
 #else
 #define TLS_SPECIFIER
index d2a76b988e0ef50dba3eb636e451eca992162a84..5fd640f7fe0cd5b8e3f546b9c66b57e63231ab1b 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 void b_call_reg()
 {
index 3e66f5ff7f49b9eabe603126b4336e0a09eb1126..1b7dfcb37374e87d9b3173d3862fd61855512dcf 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index f86fea291deae0f98a8cc193c5829b2ff82d9266..f8f3e7828485d375536c88af4998d3ddf982ae39 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
 _setjmp/_longjmp is much light weighted than setjmp/longjmp
 because it does not store signal mask, which is not a problem for netpoll
index 9250b8ccf4d9e7d4c8942d41ff3778e053d31760..96848aa29bb7b876bced9cac5e799757e8d7742f 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index dec3d79e497456a82db22dddb5d19b00aa41776f..4ffcc51c2d6bd6468dfc210d53431bb97e1597eb 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 #include "qt.h"
 #include <setjmp.h>
index 1be8fbf183527500bd863de74877e2918005b37c..e2ab281b44f548a8684c454378d65149bbf7fe52 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 353d9b90ef5e32290cd09297f1a206737b2b7acd..8c68936b7c70d27f37543c2c1a40e31bbb83e0c6 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 void qt_ni()
 {
index e99fc0d5fdde556086d5321d73868f9370299a83..a2807c09164db369ff1c680d0279c0f7269fbe01 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index fc7a81c218f30944ce5d8030762c7cf48a617fbe..3363165e3c52dbe9077c48c2ba5f18fbd2804b70 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 
 void b_call_reg()
 {
index 0f81bed8710066eacc1455ec7e88701741955709..39ca04d3d135889a32938782fc685abca7970f63 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index c5e632d55389c46af34e6d2dcc29b49e01c46259..1a5af0f2bf69d926cfc68421ae2af3ea9110783d 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /*
  * QuickThreads -- Threads-building toolkit.
  * Copyright (c) 1993 by David Keppel
index 200fe983bae341818cf125108758a6bec18a3d0a..fb006fe7146b14cea97b73602bec536d408b417d 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /* meas.c -- measure qt stuff. */
 
 #include <stdio.h>
index 7e956c30b7e57088f4bd758997584a11c0d8e08e..175ce177d8c7a8f42025ebcda8e1bda40b4fcc0e 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #include "copyright.h"
 #include "qt.h"
 
index 1f08d0e7119fe3b0ca5aa248d4ca52979a01af54..b7c7fb4dbb97480d80003d6fe9f34e1349f7464a 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef QT_H
 #define QT_H
 
index 0492861efa4a83e34c43d02c735b56f384d5f1e8..219c69285d089b36aa1336ef7e810888726fa69c 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef B_H
 #define B_H    "$Header$"
 
index cc61a3fc98483741976ab5e5948e3f07ba1dcf7d..bfacc893b000d66e3c2900e95d4845186aaac5de 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #include "copyright.h"
 #include "qt.h"
 #include "stp.h"
index b5d0e1ac4fb23c618a2776ac19e85861ca9d858b..1220e47e2c25270d27c462f8462b85a85f3f02cd 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef STP_H
 #define STP_H
 
index 4de1adf4f3b8dda13e8f751121f65542c4cc8032..cdb3d9fbe0956463753b2794a2b0788a2ef89537 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #include <stdio.h>
 #include <errno.h>
 #include <stdlib.h>
index 3351e87ae96045b93df473749c0dce6ba0549b3a..86403cdbc5b1d5ac145a04fcdf863f9034e238b8 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef _CONV_MACH_H
 #define _CONV_MACH_H
 
@@ -32,7 +25,6 @@
 
 #define CMK_NODE_QUEUE_AVAILABLE                           0
 
-#define CMK_SHARED_VARS_EXEMPLAR                           0
 #define CMK_SHARED_VARS_UNAVAILABLE                        1
 #define CMK_SHARED_VARS_UNIPROCESSOR                       0
 
index 0c730f8ee5f271486fbfc705fc232aff3ab299ac..30bbcdc286fbefe7d87659b11c139b24452b0983 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 /** @file
  * MPI based machine layer
  * @ingroup Machine
index 7c35a4bddaa5ede16c64c64bee8e00e406cf6088..c777c633e5d35a8790637f8cd5a73e25102bb314 100644 (file)
@@ -1,2 +1,2 @@
 #  Bluegene/P specific Make rules
-
+$(L)/libconv-cplus-n.a: machine.c machine-common-core.c machine-broadcast.c machine-lrts.h machine-commthd-util.c
index 6da549f060344cfa45b5519190b913216eea4af6..99767dda109b0a84809cb3f49bb66cf83bc2bfba 100644 (file)
@@ -8,8 +8,8 @@ CMK_LDXX="$CMK_CXX $BGP_LIB"
 CMK_CF77="$XLC_F/${XLC_POST}xlf "
 CMK_CF90="$XLC_F/${XLC_POST}xlf90  -qsuffix=f=f90" 
 CMK_CF90_FIXED="$XLC_PRE/xlf/bg/11.1/${XLC_POST}xlf90 " 
-CMK_C_OPTIMIZE='-O3 -qstrict -Q '
-CMK_CXX_OPTIMIZE='-O3 -qstrict -Q '
+CMK_C_OPTIMIZE='-O3 -qstrict -qarch=450 -Q '
+CMK_CXX_OPTIMIZE='-O3 -qstrict -qarch=450 -Q '
 CMK_AR='ar cq'
 CMK_NM='nm '
 CMK_QT="aix"
index 826d65b0aca21005ee3fd6af3a142f081babc9ca..8f9d3570d0f890eb14c8ac4e6ce20a12484e4591 100755 (executable)
@@ -49,8 +49,8 @@ printf "\nRunning on $pes processors: $args\n"
 if test -n "$COBALT_JOBID"
 then
   # charmrun called from script
-  echo "Running> cobalt-mpirun -np $pes $args"
-  cobalt-mpirun -np $pes $args
+  echo "Running> cobalt-mpirun -nofree -env BG_MAXALIGNEXP=0 -np $pes $args"
+  cobalt-mpirun -nofree -env BG_MAXALIGNEXP=0 -np $pes $args
 else
 
 queue_stat=qstat
index 27ec1d463ef418e2d95de50c17672936be3d5363..edb6d4b1003100d0de1d1bc8a4892a875c76ff55 100644 (file)
@@ -22,7 +22,6 @@
 
 #define CMK_NODE_QUEUE_AVAILABLE                           0 
 
-#define CMK_SHARED_VARS_EXEMPLAR                           0
 #define CMK_SHARED_VARS_UNAVAILABLE                        1
 #define CMK_SHARED_VARS_UNIPROCESSOR                       0
 
index d17c925c48c8013e99c20a4ca41dcd76f7b41c2c..15d3ffd1ecebcc9c42fabc27bbab3a817e5c0591 100644 (file)
@@ -293,7 +293,7 @@ static void MachinePostCommonInitForDCMF(int everReturn);
 /* ### End of Machine-startup Related Functions ### */
 
 /* ### Beginning of Machine-running Related Functions ### */
-static void AdvanceCommunicationForDCMF();
+static void AdvanceCommunicationForDCMF(int whenidle);
 #define LrtsAdvanceCommunication AdvanceCommunicationForDCMF
 
 static void DrainResourcesForDCMF();
@@ -673,7 +673,7 @@ extern void        bgl_machine_RectBcastInit  (unsigned               commID,
 
 
 /* ######Beginning of functions related with communication progress ###### */
-static INLINE_KEYWORD void AdvanceCommunicationForDCMF() {
+static INLINE_KEYWORD void AdvanceCommunicationForDCMF(int whenidle) {
 #if CMK_SMP
     DCMF_CriticalSection_enter (0);
 #endif
@@ -695,7 +695,7 @@ static void MachinePostNonLocalForDCMF() {
    messages. This flushes receive buffers on some  implementations*/
 #if CMK_MACHINE_PROGRESS_DEFINED
 void CmiMachineProgressImpl() {
-    AdvanceCommunicationForDCMF();
+    AdvanceCommunicationForDCMF(0);
 #if CMK_IMMEDIATE_MSG
     CmiHandleImmediate();
 #endif
@@ -705,7 +705,7 @@ void CmiMachineProgressImpl() {
 /* ######Beginning of functions related with exiting programs###### */
 static void DrainResourcesForDCMF() {
     while (msgQueueLen > 0 || outstanding_recvs > 0) {
-        AdvanceCommunicationForDCMF();
+        AdvanceCommunicationForDCMF(0);
     }
 }
 
@@ -732,6 +732,7 @@ static void MachineInitForDCMF(int *argc, char ***argv, int *numNodes, int *myNo
 
     DCMF_Messager_configure(&config_in, &config_out);
     //assert (config_out.thread_level == DCMF_THREAD_MULTIPLE); //not supported in vn mode
+    Cmi_smp_mode_setting = COMM_THREAD_ONLY_RECV;
 #endif
 
     DCMF_Send_Configuration_t short_config, eager_config, rzv_config;
@@ -874,7 +875,7 @@ static void MachinePostCommonInitForDCMF(int everReturn) {
  *
  ************************************************************************/
 
-void CmiAbort(const char *message) {
+void LrtsAbort(const char *message) {
     CmiError("------------- Processor %d Exiting: Called CmiAbort ------------\n"
              "{snd:%d,rcv:%d} Reason: %s\n",CmiMyPe(),
              msgQueueLen, outstanding_recvs, message);
index 9bc52d5a26717d6ac2003493c6410545492f3d37..bf3cc2c8293cde0d65a9c8c0351a9f17177fbc5b 100644 (file)
@@ -40,7 +40,7 @@ else
   f95target=`gfortran -v 2>&1 | grep Target | cut -f2 -d' '`
   f95version=`gfortran -v 2>&1 | grep 'gcc version' | cut -d' ' -f3`
   F90LIBDIR=`cd $F90DIR/../lib/gcc/$f95target/$f95version/ 2>/dev/null && pwd`
-  test -n "$F90LIBDIR" && F90LIBDIR=`cd $F90DIR/../lib/$f95target/gcc/$f95version/ 2>/dev/null && pwd`
+  test -z "$F90LIBDIR" && F90LIBDIR=`cd $F90DIR/../lib/$f95target/gcc/$f95version/ 2>/dev/null && pwd`
   #F90LIBDIR=`cd $F90DIR/../lib/gcc/ia64-unknown-linux-gnu/4.1.0; pwd`
 fi
 test -n "$F90LIBDIR" && CMK_F90LIBS="-L$F90LIBDIR -lgfortran -lgcc_eh"
index 76556ad84d3295a1b18e40af3696e9b3b155807c..08a8883a6c07cfa58e3accb9401c77960c8e0272 100644 (file)
@@ -1,13 +1,11 @@
-CUDA_DIR=/usr/local/cuda
-CHARMDIR=../../
+#CUDA_DIR=/usr/local/cuda
+CHARMDIR=../..
 CHARMC=$(CHARMDIR)/bin/charmc
 
 NVCC = $(CUDA_DIR)/bin/nvcc
 NVCC_FLAGS = -c -use_fast_math -DGPU_MEMPOOL #-DGPU_PROFILE -DGPU_TRACE #-DGPU_MEMPOOL_DEBUG -DGPU_WRQ_VERBOSE #-device-debug -deviceemu
 NVCC_INC = -I$(CUDA_DIR)/include -I.. 
 
-RM = rm -f
-
 all: libs
        cp libcudahybridapi.a $(CHARMDIR)/lib
 
@@ -18,7 +16,7 @@ install: libcudahybridapi.a
 #      cp hybridapi.o wrqueue.o $(CHARMDIR)/tmp
 
 libcudahybridapi.a: hybridapi.o wrqueue.o
-       -rm $@
+       -rm -f $@
        ar q $@ hybridapi.o wrqueue.o 
 
 hybridapi.o: cuda-hybrid-api.cu cuda-hybrid-api.h
index b20d40b16d3c101dc7cef77c032a9a3715fa7369..7b7fae4ccb56ef79d5a69e40d24cb1c12f2d9ebb 100644 (file)
@@ -29,9 +29,7 @@ void cudaErrorDie(int err,const char *code,const char *file,int line)
          " Return value %d from '%s'.  Exiting.\n",
          file,line,
          err,code);
-  int ret;
   abort();
-  exit(ret);
 }
 
 #define cudaChk(code)                                                  \
@@ -951,7 +949,9 @@ void *getBufferFromPool(int pool, int size){
   else if (memPoolFreeBufs[pool].head == NULL){
     Header *hd;
     cudaChk(cudaMallocHost((void **)&hd, sizeof(Header)+memPoolFreeBufs[pool].size));
+#ifdef GPU_MEMPOOL_DEBUG
     printf("(%d) getBufferFromPool, pool: %d, size: %d expand by 1\n", CmiMyPe(), pool, size);
+#endif
     if(hd == NULL){
       abort();
     }
index 150580472b81bb50a2419ebbb7aca43ca292c9cd..78b4d939c4ea94aa5d6ea4fc62b11e7b88ee872f 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author$
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef _CONV_MACH_H
 #define _CONV_MACH_H
 
@@ -23,7 +16,6 @@
 
 #define CMK_NODE_QUEUE_AVAILABLE                           0
 
-#define CMK_SHARED_VARS_EXEMPLAR                           0
 #define CMK_SHARED_VARS_UNAVAILABLE                        1
 #define CMK_SHARED_VARS_UNIPROCESSOR                       0
 
index 03625d2f01d02cbed71e5ff993ce90f2f5e99c1a..779d04f441784d5393c404bbfe6f35227a1cf279 100644 (file)
@@ -1,10 +1,3 @@
-/*****************************************************************************
- * $Source$
- * $Author: Isaac Dooley
- * $Date$
- * $Revision$
- *****************************************************************************/
-
 #ifndef _CONV_MACH_H
 #define _CONV_MACH_H
 
@@ -27,8 +20,8 @@
 #define CMK_THREADS_USE_PTHREADS                           0
 #define CMK_THREADS_ARE_WIN32_FIBERS                       0
 
-#define CMK_SIGNAL_NOT_NEEDED                              0
-#define CMK_SIGNAL_USE_SIGACTION                           1
+#define CMK_SIGNAL_NOT_NEEDED                              1
+#define CMK_SIGNAL_USE_SIGACTION                           0
 #define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              0
 
 #define CMK_THREADS_REQUIRE_NO_CPV                         0
index a5b558488585aff5336431482001f6a363ce9723..44d47ab1cf8970f8a50eec531792d72b5e8c23f3 100644 (file)
@@ -5,7 +5,7 @@
 #define CMK_USE_PXSHM                  1
 
 #undef CMK_IMMEDIATE_MSG
-#define CMK_IMMEDIATE_MSG       0
+#define CMK_IMMEDIATE_MSG       1
 
 #undef CMK_BROADCAST_HYPERCUBE
 #define CMK_BROADCAST_HYPERCUBE                                   1
index b30f6498e04c4da590caa8dfb4427f6ca33b8b86..0e85cc1e4f6c9edf001331ed9804bd152f12642e 100644 (file)
@@ -11,3 +11,7 @@
 #undef CMK_THREADS_USE_CONTEXT
 #define CMK_THREADS_USE_CONTEXT                            1
 
+
+#if ! CMK_GCC_X86_ASM
+#define CMK_PCQUEUE_LOCK                                   1
+#endif
diff --git a/src/arch/gemini_gni-crayxe/conv-mach-xpmem.h b/src/arch/gemini_gni-crayxe/conv-mach-xpmem.h
new file mode 100644 (file)
index 0000000..39cc4d8
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef _CONV_MACH_XPMEM_
+#define  _CONV_MACH_XPMEM_   /* must spell the same name as the #ifndef above, or the guard never takes effect */
+/* Build-option overrides: drop CMK_USE_PXSHM, enable CMK_USE_XPMEM, and reset the related settings below. */
+#undef CMK_USE_PXSHM
+#undef CMK_USE_XPMEM
+#define CMK_USE_XPMEM                  1
+
+#undef CMK_IMMEDIATE_MSG
+#define CMK_IMMEDIATE_MSG       0
+
+#undef CMK_BROADCAST_HYPERCUBE
+#define CMK_BROADCAST_HYPERCUBE                                   1
+
+#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP  0
+
+#define XPMEM_LOCK                      1
+
+#endif /* _CONV_MACH_XPMEM_ */
diff --git a/src/arch/gemini_gni-crayxe/conv-mach-xpmem.sh b/src/arch/gemini_gni-crayxe/conv-mach-xpmem.sh
new file mode 100644 (file)
index 0000000..a269d3c
--- /dev/null
@@ -0,0 +1 @@
+#CMK_LIBS="$CMK_LIBS -lrt"
index 0b400dce48683e5262a2f665ea31ef9b9d61be26..4be2921f33093adaff94eab7fe796067f328af8b 100644 (file)
@@ -32,7 +32,6 @@
 #define CMK_SHARED_VARS_UNAVAILABLE                        1 /* non SMP versions */
 #define CMK_SHARED_VARS_POSIX_THREADS_SMP                  0 /* SMP versions */
 #define CMK_SHARED_VARS_UNIPROCESSOR                       0
-#define CMK_SHARED_VARS_EXEMPLAR                           0
 #define CMK_SHARED_VARS_PTHREADS                           0
 #define CMK_SHARED_VARS_NT_THREADS                         0
 
 #define CMK_SIGNAL_USE_SIGACTION                           0
 #define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              0
 
-/* specifies whether the CthCpv variables should be defined as Cpv (0) or
-   directly as normal c variables (1) */
-#define CMK_THREADS_REQUIRE_NO_CPV                         0
-
 /* decide which is the default implementation of the threads (see threads.c)
    Only one of the following can be 1. If none of them is selected, qthreads
    will be used as default. This default can be overwritten at compile time
@@ -62,8 +57,8 @@
    implement the timer primitives. */
 #define CMK_TIMER_USE_RTC                                  0
 #define CMK_TIMER_USE_RDTSC                                0
-#define CMK_TIMER_USE_GETRUSAGE                            1
-#define CMK_TIMER_USE_SPECIAL                              0
+#define CMK_TIMER_USE_GETRUSAGE                            0
+#define CMK_TIMER_USE_SPECIAL                              1
 #define CMK_TIMER_USE_TIMES                                0
 #define CMK_TIMER_USE_BLUEGENEL                            0
 
index adbf27ac88d5014ad275a68b4a810461d5bc89cf..699aa9a48a5de399b1accb2688d27b77f575fdfb 100644 (file)
@@ -1,2 +1,5 @@
-$(L)/libconv-cplus-n.a: machine.c machine-common-core.c machine-broadcast.c machine-lrts.h machine-pxshm.c machine-persistent.c
+$(L)/libconv-cplus-n.a: machine.h machine.c machine-common-core.c machine-broadcast.c machine-lrts.h machine-pxshm.c machine-xpmem.c machine-persistent.c machine-commthd-util.c machine-smp.c pcqueue.h  $(L)/cray_tlbhack.o
+
+$(L)/cray_tlbhack.o: cray_tlbhack.c
+       $(CHARMC) -o $@ cray_tlbhack.c
 
index ff2e5f64f29fa863e17b9658174644e791a49636..5ee96e9612eb873d9ad4c548e6fb524d52d201b4 100755 (executable)
@@ -67,8 +67,8 @@ then
   aprun=`which aprun 2>/dev/null`
   if test -n "$aprun"
   then
-    echo aprun -n $pes $args
-    $aprun -n $pes $args
+    echo  aprun -n $pes -d `expr $ppn + 1` $args
+    $aprun -n $pes -d `expr $ppn + 1` $args
   else
     mpirun_cmd=`which mpirun 2>/dev/null`
     if test -n "$mpirun_cmd"
index 5baa4b0e02c94bb20a0aa5c3e68386437d545719..7b84b7f6d432c4437974c170b6cd64bfd5513bc9 100644 (file)
@@ -9,7 +9,7 @@
 
 #define CMK_HANDLE_SIGUSR                                  0
 
-#define CMK_MSG_HEADER_EXT_    CmiUInt8 size; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; 
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID; CmiInt4 root; 
 
 #define CMK_MSG_HEADER_BASIC  CMK_MSG_HEADER_EXT
 #define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
@@ -33,7 +33,7 @@
 
 #define NODE_0_IS_CONVHOST                                 1
 
-#define CMK_IMMEDIATE_MSG                                 0
+#define CMK_IMMEDIATE_MSG                                 1
 #define CMK_MACHINE_PROGRESS_DEFINED                       1
 
 #define CMK_LB_CPUTIMER                                           0
diff --git a/src/arch/gemini_gni/conv-mach-syncft.h b/src/arch/gemini_gni/conv-mach-syncft.h
new file mode 100644 (file)
index 0000000..5cb052b
--- /dev/null
@@ -0,0 +1,15 @@
+
+#undef CMK_MSG_HEADER_EXT_
+//#undef CMK_MSG_HEADER_EXT
+//#undef CMK_MSG_HEADER_BIGSIM_
+/* expand the header to store the restart phase counter (pn) */
+#define CMK_MSG_HEADER_EXT_    CmiUInt4 size; CmiUInt4 seq; CmiUInt2 rank,hdl,xhdl,info,stratid,redID,pn,d9; CmiInt4 root; 
+//#define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
+//#define CMK_MSG_HEADER_BIGSIM_    { CmiUInt2 d0,d1,d2,d3,d4,d5,hdl,xhdl,pn,info; int nd, n; double rt; CmiInt2 tID; CmiUInt2 hID; char t; int msgID; int srcPe;}
+//#define CMK_MSG_HEADER_BIGSIM_  { CMK_MSG_HEADER_EXT_ CMK_BIGSIM_FIELDS }
+/* Reads the pn field of a message header. The argument is parenthesized so an expression such as (buf + off) is cast as a whole. */
+#define CmiGetRestartPhase(m)       ((((CmiMsgHeaderExt*)(m))->pn))
+
+#define __FAULT__                                         1
+
+#define CMK_MEM_CHECKPOINT                                1
diff --git a/src/arch/gemini_gni/conv-mach-syncft.sh b/src/arch/gemini_gni/conv-mach-syncft.sh
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/arch/gemini_gni/cray_tlbhack.c b/src/arch/gemini_gni/cray_tlbhack.c
new file mode 100644 (file)
index 0000000..d19c23b
--- /dev/null
@@ -0,0 +1,6 @@
+#include <unistd.h>
+/* Reports the regular page size from getpagesize() under the name gethugepagesize(). NOTE(review): presumably stands in for a huge-page library symbol, judging by the file name -- confirm. */
+int gethugepagesize()
+{
+    return getpagesize();
+}
diff --git a/src/arch/gemini_gni/machine-cmidirect.c b/src/arch/gemini_gni/machine-cmidirect.c
new file mode 100644 (file)
index 0000000..b79e75b
--- /dev/null
@@ -0,0 +1,243 @@
+/** @file
+ * uGNI cmiDirect communication
+ * @ingroup Machine
+*/
+
+/*
+  included in machine.c
+  Yanhua Sun, 2/5/2012
+*/
+
+#define     CMI_DIRECT_DEBUG    0
+#include "cmidirect.h"
+
+/* Register the buffer buff (size bytes) through MEMORY_REGISTER and return the
+ * resulting memory handle. The registration status is handed to GNI_RC_CHECK
+ * together with an error message. */
+CmiDirectMemoryHandler CmiDirect_registerMemory(void *buff, int size)
+{
+    CmiDirectMemoryHandler mem_hndl; 
+    gni_return_t        status;
+    MEMORY_REGISTER(onesided_hnd, nic_hndl, buff, size, &mem_hndl, &omdh, status);
+    GNI_RC_CHECK("cmidirect register memory fails\n", status);
+    return mem_hndl;
+}
+/* Debug helper: print the fields of a CmiDirect handle, prefixed with this PE
+ * number and the label s. Output format is unchanged.
+ * The memory-handle words (qword1/qword2) are integers, so they are converted
+ * to void* explicitly to match the %p conversions in the format string. */
+static void printHandle(CmiDirectUserHandle *userHandle, char *s)
+{
+    CmiPrintf( "[%d]%s(%p)(%p,%p,%p)==>(%p,%p,%p)(%d)(%p,%p)\n", CmiMyPe(), s, (void*)userHandle, userHandle->localBuf,
+        (void*)(uintptr_t)userHandle->localMdh.qword1, (void*)(uintptr_t)userHandle->localMdh.qword2,
+        userHandle->remoteBuf, (void*)(uintptr_t)userHandle->remoteMdh.qword1, (void*)(uintptr_t)userHandle->remoteMdh.qword2,
+        userHandle->transSize, userHandle->callbackFnPtr, userHandle->callbackData );
+}
+
+/**
+ * Build a handle for a receive buffer whose memory handle the caller already owns.
+ *
+ * mem_hndl       memory handle of recvBuf; copied into the handle's remoteMdh
+ * recvBuf        receive buffer, stored as remoteBuf
+ * recvBufSize    transfer size in bytes, stored as transSize
+ * callbackFnPtr  function invoked with callbackData when a transfer completes
+ * callbackData   opaque argument for callbackFnPtr
+ *
+ * Returns the handle by value. remoteNode/remoteRank are set to the calling
+ * node and rank. Members not assigned here (localNode, localBuf, localMdh,
+ * remoteHandler) are zeroed instead of being returned uninitialized.
+ */
+struct infiDirectUserHandle CmiDirect_createHandle_mem(CmiDirectMemoryHandler *mem_hndl, void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData)
+{
+    CmiDirectUserHandle userHandle = {0};
+    userHandle.handle=1; 
+    userHandle.remoteNode= CmiMyNode();
+    userHandle.remoteRank = CmiMyRank();
+    userHandle.transSize=recvBufSize;
+    userHandle.remoteBuf=recvBuf;
+    userHandle.callbackFnPtr=callbackFnPtr;
+    userHandle.callbackData=callbackData;
+    userHandle.remoteMdh = *mem_hndl;
+    userHandle.initialValue=0;
+#if CMI_DIRECT_DEBUG
+    //printHandle(&userHandle, "Create Handler");
+#endif
+    return userHandle;
+
+}
+/**
+ * To be called on the receiver: create a handle describing the receive buffer.
+ *
+ * localNode      node recorded as the handle's localNode
+ * recvBuf        receive buffer, stored as remoteBuf
+ * recvBufSize    transfer size in bytes, stored as transSize
+ * callbackFnPtr  function invoked with callbackData when a transfer completes
+ * callbackData   opaque argument for callbackFnPtr
+ * initialValue   stored in the handle's initialValue field
+ *
+ * Buffers of at most SMSG_MAX_MSG bytes are registered here through
+ * MEMORY_REGISTER; larger buffers take the memory handle found in their
+ * mempool header. If registration fails, remoteMdh is left as zero.
+ *
+ * Returns the handle by value. The handle is zero-initialized so members not
+ * assigned here are not returned uninitialized, and remoteRank is filled in
+ * the same way CmiDirect_createHandle_mem does.
+ **/
+CmiDirectUserHandle CmiDirect_createHandle(int localNode,void *recvBuf, int recvBufSize, void (*callbackFnPtr)(void *), void *callbackData,double initialValue) {
+
+    gni_return_t            status = GNI_RC_SUCCESS;
+    CmiDirectUserHandle userHandle = {0};
+    userHandle.handle=1; 
+    userHandle.localNode=localNode;
+    userHandle.remoteNode= CmiMyNode();
+    userHandle.remoteRank = CmiMyRank();
+    userHandle.transSize=recvBufSize;
+    userHandle.remoteBuf=recvBuf;
+    userHandle.initialValue=initialValue;
+    userHandle.callbackFnPtr=callbackFnPtr;
+    userHandle.callbackData=callbackData;
+
+    if(recvBufSize <= SMSG_MAX_MSG)
+    {
+        MEMORY_REGISTER(onesided_hnd, nic_hndl, userHandle.remoteBuf, recvBufSize, &(userHandle.remoteMdh), &omdh, status);
+    }
+    else {
+        /* The mempool handle is used as-is, even when it is still zero:
+         * the on-demand registration below is not enabled. */
+        //status = registerMempool(userHandle.remoteBuf);
+        userHandle.remoteMdh = GetMemHndl(userHandle.remoteBuf);
+    }
+    if(status != GNI_RC_SUCCESS) {
+        userHandle.remoteMdh.qword1 = 0;
+        userHandle.remoteMdh.qword2 = 0;
+    }
+
+#if CMI_DIRECT_DEBUG
+    //printHandle(&userHandle, "Create Handler");
+#endif
+    return userHandle;
+}
+
+/* Store ptr in the handle's remoteHandler field. CmiDirect_put later copies
+ * this value into the post descriptor's first_operand. */
+void CmiDirect_saveHandler(CmiDirectUserHandle* h, void *ptr)
+{
+    h->remoteHandler = ptr;
+}
+
+/* Attach an already-registered local send buffer to the handle: records the
+ * caller's memory handle, the buffer address and the calling node.
+ * sendBufSize is accepted for interface symmetry but not used. */
+void CmiDirect_assocLocalBuffer_mem(CmiDirectUserHandle *userHandle, CmiDirectMemoryHandler *mem_hndl, void *sendBuf,int sendBufSize) {
+    userHandle->localMdh  = *mem_hndl;
+    userHandle->localBuf  = sendBuf;
+    userHandle->localNode = CmiMyNode();
+#if CMI_DIRECT_DEBUG
+    printHandle(userHandle, "Associate Handler");
+#endif
+}
+/****
+ To be called on the local side to attach the local buffer to this handle.
+ Records the calling node and sendBuf in the handle and fills in localMdh:
+ transfers of at most SMSG_MAX_MSG bytes register the buffer here, larger
+ ones take the memory handle stored in the buffer's mempool header.
+ If registration fails, localMdh is set to zero.
+ Note: the size used is userHandle->transSize; sendBufSize is not read.
+******/
+
+void CmiDirect_assocLocalBuffer(CmiDirectUserHandle *userHandle,void *sendBuf,int sendBufSize) {
+
+    /* one-sided primitives would require registration of memory */
+    gni_return_t            status = GNI_RC_SUCCESS;
+    
+    userHandle->localNode=CmiMyNode();
+    userHandle->localBuf=sendBuf;
+
+    if(userHandle->transSize <= SMSG_MAX_MSG)
+    {
+        MEMORY_REGISTER(onesided_hnd, nic_hndl, userHandle->localBuf, userHandle->transSize, &userHandle->localMdh, &omdh, status);
+    }
+    else if(IsMemHndlZero((GetMemHndl(userHandle->localBuf)))){
+        /* both branches currently do the same thing: the on-demand mempool
+           registration below is commented out */
+        //status = registerMempool(userHandle->localBuf);
+        userHandle->localMdh = GetMemHndl(userHandle->localBuf);
+    } else
+        userHandle->localMdh = GetMemHndl(userHandle->localBuf);
+   
+    if(status != GNI_RC_SUCCESS) {
+        userHandle->localMdh.qword1 = 0;
+        userHandle->localMdh.qword2 = 0;
+    }
+
+#if CMI_DIRECT_DEBUG
+    printHandle(userHandle, "Associate Handler");
+#endif
+}
+
+/****
+To be called on the local side to do the actual data transfer.
+With USE_LRTS_MEMPOOL: if the target node is this node, the data is copied
+with CmiMemcpy and the callback is invoked directly; otherwise a PUT post
+descriptor is filled in and queued with bufferRdmaMsg().
+Without USE_LRTS_MEMPOOL the function prints a message and aborts.
+******/
+void CmiDirect_put(CmiDirectUserHandle *userHandle) {
+
+    gni_post_descriptor_t *pd;
+
+#if USE_LRTS_MEMPOOL
+    if (userHandle->remoteNode== CmiMyNode()) {
+        /* same node: plain copy from localBuf to remoteBuf, then notify */
+        CmiMemcpy(userHandle->remoteBuf,userHandle->localBuf,userHandle->transSize);
+        (*(userHandle->callbackFnPtr))(userHandle->callbackData);
+    } else {
+        gni_return_t status;
+        RDMA_REQUEST        *rdma_request_msg;
+        MallocPostDesc(pd);
+        /* FMA for transfers up to LRTS_GNI_RDMA_THRESHOLD bytes, RDMA above */
+        if(userHandle->transSize <= LRTS_GNI_RDMA_THRESHOLD)
+            pd->type            = GNI_POST_FMA_PUT;
+        else
+            pd->type            = GNI_POST_RDMA_PUT;
+        pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+        pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+        pd->length          = userHandle->transSize;
+        pd->local_addr      = (uint64_t) (userHandle->localBuf);
+        pd->local_mem_hndl  = userHandle->localMdh; 
+        pd->remote_addr     = (uint64_t)(userHandle->remoteBuf);
+        pd->remote_mem_hndl = userHandle->remoteMdh;
+        pd->src_cq_hndl     = 0;
+        pd->rdma_mode       = 0;
+        /* value saved earlier by CmiDirect_saveHandler() */
+        pd->first_operand   = (uint64_t)(userHandle->remoteHandler);
+        /* NOTE(review): amo_cmd 1 presumably marks this descriptor as a
+           CmiDirect put for the completion code -- confirm in machine.c */
+        pd->amo_cmd         = 1;
+        pd->cqwrite_value   = 1;        
+        bufferRdmaMsg(userHandle->remoteNode, pd); 
+#if CMI_DIRECT_DEBUG
+        printHandle(userHandle, "After Direct_put");
+        CmiPrintf("[%d] RDMA put %d,%d bytes addr %p to remoteNode %d:%p \n\n",CmiMyPe(), userHandle->transSize, pd->length, (void*)(pd->local_addr), userHandle->remoteNode, (void*) (pd->remote_addr));
+#endif
+    }
+#else
+    CmiPrintf("Normal Send in CmiDirect Put\n");
+    CmiAbort("");
+#endif
+
+
+}
+
+// needs to figure out what is local/remote
+/* Counterpart of CmiDirect_put that posts a GET descriptor.
+ * With USE_LRTS_MEMPOOL: if the target node is this node, the data is copied
+ * with CmiMemcpy and the callback is invoked directly; otherwise a GET post
+ * descriptor carrying the callback pointer and data is queued with
+ * bufferRdmaMsg(). Without USE_LRTS_MEMPOOL the function prints and aborts. */
+void CmiDirect_get(CmiDirectUserHandle *userHandle) {
+
+    gni_post_descriptor_t *pd;
+
+#if USE_LRTS_MEMPOOL
+    if (userHandle->remoteNode== CmiMyNode()) {
+        /* NOTE(review): this copies localBuf into remoteBuf, the same direction
+           as CmiDirect_put, while the descriptor below requests a GET with
+           local_addr = localBuf -- confirm the intended direction */
+        CmiMemcpy(userHandle->remoteBuf,userHandle->localBuf,userHandle->transSize);
+        (*(userHandle->callbackFnPtr))(userHandle->callbackData);
+    } else {
+        gni_return_t status;
+        RDMA_REQUEST        *rdma_request_msg;
+        MallocPostDesc(pd);
+        /* FMA for transfers up to LRTS_GNI_RDMA_THRESHOLD bytes, RDMA above */
+        if(userHandle->transSize <= LRTS_GNI_RDMA_THRESHOLD)
+            pd->type            = GNI_POST_FMA_GET;
+        else
+            pd->type            = GNI_POST_RDMA_GET;
+        pd->cq_mode         = GNI_CQMODE_GLOBAL_EVENT;
+        pd->dlvr_mode       = GNI_DLVMODE_PERFORMANCE;
+        pd->length          = userHandle->transSize;
+        pd->local_addr      = (uint64_t) (userHandle->localBuf);
+        pd->local_mem_hndl  = userHandle->localMdh; 
+        pd->remote_addr     = (uint64_t)(userHandle->remoteBuf);
+        pd->remote_mem_hndl = userHandle->remoteMdh;
+        pd->src_cq_hndl     = 0;
+        pd->rdma_mode       = 0;
+        /* the callback and its argument travel in the descriptor operands */
+        pd->first_operand   = (uint64_t) (userHandle->callbackFnPtr);
+        pd->second_operand  = (uint64_t) (userHandle->callbackData);
+        /* NOTE(review): amo_cmd 2 presumably marks this descriptor as a
+           CmiDirect get for the completion code -- confirm in machine.c */
+        pd->amo_cmd         = 2;
+        pd->cqwrite_value   = 1;
+        bufferRdmaMsg(userHandle->remoteNode, pd); 
+#if CMI_DIRECT_DEBUG
+    CmiPrintf("[%d] RDMA get %d,%d bytes addr %p to remoteNode %d:%p \n\n",CmiMyPe(), userHandle->transSize, pd->length, (void*)(pd->local_addr), userHandle->remoteNode, (void*) (pd->remote_addr));
+#endif
+    }
+#else
+    CmiPrintf("Normal Send in CmiDirect Get\n");
+    CmiAbort("");
+#endif
+
+
+
+}
+
+/**** up to the user to safely call this.
+      Currently a no-op: nothing attached by CmiDirect_assocLocalBuffer is
+      released here. */
+void CmiDirect_deassocLocalBuffer(CmiDirectUserHandle *userHandle) {
+
+
+}
+
+/**** up to the user to safely call this.
+      Passes userHandle to free().
+      NOTE(review): the create functions in this file return handles by value,
+      so this is only valid if the caller placed the handle in malloc'ed
+      memory -- confirm against callers. */
+void CmiDirect_destroyHandle(CmiDirectUserHandle *userHandle) {
+    free(userHandle);
+}
+
+/**** Should not be called the first time.
+      No-op in this machine layer. *********/
+void CmiDirect_ready(CmiDirectUserHandle *userHandle) {
+}
+
+/**** Should not be called the first time.
+      No-op in this machine layer. *********/
+void CmiDirect_readyPollQ(CmiDirectUserHandle *userHandle) {
+}
+
+/**** Should not be called the first time.
+      No-op in this machine layer. *********/
+void CmiDirect_readyMark(CmiDirectUserHandle *userHandle) {
+}
+
index 984ed0a3ee51d050d4316ccef61eb264ca5e8185..9d368c0bc0595021778073b8f24f0a5403bd1e55 100644 (file)
@@ -1,12 +1,29 @@
-/*****************************************************************************
- * $Source$
- * $Author$  Yanhua Sun
- * $Date$  07-01-2011
- * $Revision$ 
- *****************************************************************************/
 
 /** @file
  * Gemini GNI machine layer
+ *
+ * Author:   Yanhua Sun
+             Gengbin Zheng
+ * Date:   07-01-2011
+ *
+ *  Flow control by mem pool using environment variables:
+
+    # CHARM_UGNI_MEMPOOL_MAX can be maximum_register_mem/number_of_processes
+    # CHARM_UGNI_SEND_MAX can be half of CHARM_UGNI_MEMPOOL_MAX
+    export CHARM_UGNI_MEMPOOL_INIT_SIZE=8M
+    export CHARM_UGNI_MEMPOOL_MAX=20M
+    export CHARM_UGNI_SEND_MAX=10M
+
+    # limit on the total mempool size allocated; this prevents the mempool
+    # from using too much memory
+    export CHARM_UGNI_MEMPOOL_SIZE_LIMIT=512M 
+
+    other environment variables:
+
+    export CHARM_UGNI_NO_DEADLOCK_CHECK=yes    # disable checking deadlock
+    export CHARM_UGNI_MAX_MEMORY_ON_NODE=0.8G  # max memory per node for mempool
+    export CHARM_UGNI_BIG_MSG_SIZE=4M          # set big message size protocol
+    export CHARM_UGNI_BIG_MSG_PIPELINE_LEN=4   # set big message pipe len
  */
 /*@{*/
 
 #include <stdint.h>
 #include <errno.h>
 #include <malloc.h>
+#include <unistd.h>
+#include <time.h>
+#include <gni_pub.h>
+#include <pmi.h>
 
-#include "gni_pub.h"
-#include "pmi.h"
+//#include <numatoolkit.h>
 
 #include "converse.h"
 
-/*Support for ++debug: */
-#if defined(_WIN32) && ! defined(__CYGWIN__)
-#include <windows.h>
-#include <wincon.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
+#define     LARGEPAGE           0
 
-static void sleep(int secs) {
-    Sleep(1000*secs);
-}
-#else
-#include <unistd.h> /*For getpid()*/
+#if LARGEPAGE
+#include <hugetlbfs.h>
+#endif
+
+#if CMK_DIRECT
+#include "cmidirect.h"
 #endif
+#define PRINT_SYH  0
 
+#define USE_LRTS_MEMPOOL                  1
 
 #define REMOTE_EVENT                      0
-#define USE_LRTS_MEMPOOL                  1
 
-#if USE_LRTS_MEMPOOL
+#define CMI_EXERT_SEND_CAP     0
+#define        CMI_EXERT_RECV_CAP      0
+
+#if CMI_EXERT_SEND_CAP
+#define SEND_CAP 16
+#endif
+
+#if CMI_EXERT_RECV_CAP
+#define RECV_CAP 2
+#endif
+
 #if CMK_SMP
-#define STEAL_MEMPOOL                     0
+#define COMM_THREAD_SEND 1
+//#define MULTI_THREAD_SEND 1
 #endif
 
-#define oneMB (1024ll*1024)
 #if CMK_SMP
-static CmiInt8 _mempool_size = 8*oneMB;
+#define PIGGYBACK_ACK                        0
+#endif
+
+// Trace communication thread
+#if CMK_TRACE_ENABLED && CMK_SMP_TRACE_COMMTHREAD
+#define TRACE_THRESHOLD     0.00005
+#define CMI_MPI_TRACE_MOREDETAILED 0
+#undef CMI_MPI_TRACE_USEREVENTS
+#define CMI_MPI_TRACE_USEREVENTS 1
+#else
+#undef CMK_SMP_TRACE_COMMTHREAD
+#define CMK_SMP_TRACE_COMMTHREAD 0
+#endif
+
+#define CMK_TRACE_COMMOVERHEAD 0
+#if CMK_TRACE_ENABLED && CMK_TRACE_COMMOVERHEAD
+#undef CMI_MPI_TRACE_USEREVENTS
+#define CMI_MPI_TRACE_USEREVENTS 1
+#else
+#undef CMK_TRACE_COMMOVERHEAD
+#define CMK_TRACE_COMMOVERHEAD 0
+#endif
+
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && ! CMK_TRACE_IN_CHARM
+CpvStaticDeclare(double, projTraceStart);
+#define  START_EVENT()  CpvAccess(projTraceStart) = CmiWallTimer();
+#define  END_EVENT(x)   traceUserBracketEvent(x, CpvAccess(projTraceStart), CmiWallTimer());
 #else
-static CmiInt8 _mempool_size = 32*oneMB;
+#define  START_EVENT()
+#define  END_EVENT(x)
 #endif
+
+#if USE_LRTS_MEMPOOL
+
+#define oneMB (1024ll*1024)
+#define oneGB (1024ll*1024*1024)
+
+static CmiInt8 _mempool_size = 8*oneMB;
 static CmiInt8 _expand_mem =  4*oneMB;
+static CmiInt8 _mempool_size_limit = 0;
+
+static CmiInt8 _totalmem = 0.8*oneGB;
+
+#if LARGEPAGE
+static int BIG_MSG  =  16*oneMB;
+static int ONE_SEG  =  4*oneMB;
+#else
+static int BIG_MSG  =  4*oneMB;
+static int ONE_SEG  =  2*oneMB;
 #endif
+static int BIG_MSG_PIPELINE = 4;
 
-#define BIG_MSG       4*oneMB
-#define ONE_SEG       8*oneMB
+// dynamic flow control
+static CmiInt8 buffered_send_msg = 0;
+static CmiInt8 register_memory_size = 0;
 
-#define PRINT_SYH  0
-#if CMK_SMP
-#define COMM_THREAD_SEND 1
+#if LARGEPAGE
+static CmiInt8  MAX_BUFF_SEND  =  100000*oneMB;
+static CmiInt8  MAX_REG_MEM    =  200000*oneMB;
+static CmiInt8 register_count = 0;
+#else
+#if CMK_SMP && COMM_THREAD_SEND 
+static CmiInt8  MAX_BUFF_SEND  =  100*oneMB;
+static CmiInt8  MAX_REG_MEM    =  200*oneMB;
+#else
+static CmiInt8  MAX_BUFF_SEND  =  16*oneMB;
+static CmiInt8  MAX_REG_MEM    =  25*oneMB;
+#endif
+
+
+#endif
+
+#endif     /* end USE_LRTS_MEMPOOL */
+
+#if CMK_SMP && MULTI_THREAD_SEND
+#define     CMI_GNI_LOCK        CmiLock(tx_cq_lock);
+#define     CMI_GNI_UNLOCK        CmiUnlock(tx_cq_lock);
+#else
+#define     CMI_GNI_LOCK
+#define     CMI_GNI_UNLOCK
 #endif
+
+static int _tlbpagesize = 4096;
+
+//static int _smpd_count  = 0;
+
+static int   user_set_flag  = 0;
+
+static int _checkProgress = 1;             /* check deadlock */
+static int _detected_hang = 0;
+
+#define             SMSG_ATTR_SIZE      sizeof(gni_smsg_attr_t)
+
+// dynamic SMSG
+static int useDynamicSMSG  =0;               /* dynamic smsgs setup */
+
+static int avg_smsg_connection = 32;
+static int                 *smsg_connected_flag= 0;
+static gni_smsg_attr_t     **smsg_attr_vector_local;
+static gni_smsg_attr_t     **smsg_attr_vector_remote;
+static gni_ep_handle_t     ep_hndl_unbound;
+static gni_smsg_attr_t     send_smsg_attr;
+static gni_smsg_attr_t     recv_smsg_attr;
+
+/* One node in the singly linked list of dynamic SMSG mailboxes
+ * (list head: mailbox_list).
+ * NOTE(review): field meanings below are read off the names -- confirm
+ * against the code that allocates mailboxes. */
+typedef struct _dynamic_smsg_mailbox{
+   void     *mailbox_base;                    /* presumably start of the mailbox memory */
+   int      size;                             /* presumably total bytes at mailbox_base */
+   int      offset;                           /* presumably bytes already handed out */
+   gni_mem_handle_t  mem_hndl;                /* GNI memory handle for this mailbox */
+   struct      _dynamic_smsg_mailbox  *next;  /* next mailbox in the list */
+}dynamic_smsg_mailbox_t;
+
+static dynamic_smsg_mailbox_t  *mailbox_list;
+
 int         rdma_id = 0;
+
+static CmiUInt8  smsg_send_count = 0,  last_smsg_send_count = 0;
+static CmiUInt8  smsg_recv_count = 0,  last_smsg_recv_count = 0;
+
 #if PRINT_SYH
-int         lrts_smsg_success = 0;
 int         lrts_send_msg_id = 0;
-int         lrts_send_rdma_success = 0;
-int         lrts_received_msg = 0;
 int         lrts_local_done_msg = 0;
+int         lrts_send_rdma_success = 0;
 #endif
 
 #include "machine.h"
@@ -92,17 +221,48 @@ onesided_md_t    omdh;
 #else
 uint8_t   onesided_hnd, omdh;
 #if REMOTE_EVENT
-#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh) GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, smsg_rx_cqh,  GNI_MEM_READWRITE, -1, mem_hndl)
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, status)    if(register_memory_size+size>= MAX_REG_MEM) { \
+         status = GNI_RC_ERROR_NOMEM;} \
+        else {status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, smsg_rx_cqh,  GNI_MEM_READWRITE, -1, mem_hndl); \
+                if(status == GNI_RC_SUCCESS) register_memory_size += size; }  
 #else
-#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh) GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, NULL,  GNI_MEM_READWRITE, -1, mem_hndl)
-#endif
-#define  MEMORY_DEREGISTER(handler, nic_hndl, mem_hndl, myomdh)  GNI_MemDeregister(nic_hndl, (mem_hndl))
-#endif
-
-#define GetMemHndl(x)  ((mempool_header*)((char*)x-ALIGNBUF))->mem_hndl
-
-#define CmiGetMsgSize(m)  ((CmiMsgHeaderExt*)m)->size
-#define CmiSetMsgSize(m,s)  ((((CmiMsgHeaderExt*)m)->size)=(s))
+#define  MEMORY_REGISTER(handler, nic_hndl, msg, size, mem_hndl, myomdh, status ) \
+    do {   \
+        if (register_memory_size + size >= MAX_REG_MEM) { \
+            status = GNI_RC_ERROR_NOMEM; \
+        } else { status = GNI_MemRegister(nic_hndl, (uint64_t)msg,  (uint64_t)size, NULL,  GNI_MEM_READWRITE, -1, mem_hndl); \
+            if(status == GNI_RC_SUCCESS) register_memory_size += size; } \
+    } while(0)
+#endif
+#define  MEMORY_DEREGISTER(handler, nic_hndl, mem_hndl, myomdh, size)  \
+    do { if (GNI_MemDeregister(nic_hndl, (mem_hndl) ) == GNI_RC_SUCCESS) \
+             register_memory_size -= size; \
+         else CmiAbort("MEM_DEregister");  \
+    } while (0)
+#endif
+
+/* Accessors for the mempool block that owns message buffer x. The block
+ * pointer is read from the mempool_header located ALIGNBUF bytes before x.
+ * Every expansion and every argument is fully parenthesized so the macros
+ * can be combined safely with other operators (e.g. !NoMsgInSend(x)). */
+#define   GetMempoolBlockPtr(x)  (((mempool_header*)((char*)(x)-ALIGNBUF))->block_ptr)
+#define   IncreaseMsgInRecv(x)   ((GetMempoolBlockPtr(x)->msgs_in_recv)++)
+#define   DecreaseMsgInRecv(x)   ((GetMempoolBlockPtr(x)->msgs_in_recv)--)
+#define   IncreaseMsgInSend(x)   ((GetMempoolBlockPtr(x)->msgs_in_send)++)
+#define   DecreaseMsgInSend(x)   ((GetMempoolBlockPtr(x)->msgs_in_send)--)
+#define   GetMempoolPtr(x)        (GetMempoolBlockPtr(x)->mptr)
+#define   GetMempoolsize(x)       (GetMempoolBlockPtr(x)->size)
+#define   GetMemHndl(x)           (GetMempoolBlockPtr(x)->mem_hndl)
+#define   NoMsgInSend(x)          (GetMempoolBlockPtr(x)->msgs_in_send == 0)
+#define   NoMsgInRecv(x)          (GetMempoolBlockPtr(x)->msgs_in_recv == 0)
+#define   NoMsgInFlight(x)        (GetMempoolBlockPtr(x)->msgs_in_send + GetMempoolBlockPtr(x)->msgs_in_recv  == 0)
+/* Memory-handle helpers: a handle with both words zero means "not registered". */
+#define   IsMemHndlZero(x)        ((x).qword1 == 0 && (x).qword2 == 0)
+#define   SetMemHndlZero(x)       do {(x).qword1 = 0;(x).qword2 = 0;} while (0)
+#define   NotRegistered(x)        IsMemHndlZero(((block_header*)(x))->mem_hndl)
+
+/* Same fields, addressed directly through a block_header pointer x. */
+#define   GetMemHndlFromBlockHeader(x) (((block_header*)(x))->mem_hndl)
+#define   GetSizeFromBlockHeader(x)    (((block_header*)(x))->size)
+
+/* Get/set the size and seq fields of the extended message header of m.
+ * Arguments and expansions are parenthesized so that expression arguments
+ * and surrounding operators bind as intended. */
+#define CmiGetMsgSize(m)     (((CmiMsgHeaderExt*)(m))->size)
+#define CmiSetMsgSize(m,s)   ((((CmiMsgHeaderExt*)(m))->size)=(s))
+#define CmiGetMsgSeq(m)      (((CmiMsgHeaderExt*)(m))->seq)
+#define CmiSetMsgSeq(m, s)   ((((CmiMsgHeaderExt*)(m))->seq) = (s))
 
 #define ALIGNBUF                64
 
@@ -111,30 +271,32 @@ uint8_t   onesided_hnd, omdh;
 
 #define FMA_PER_CORE  1024
 #define FMA_BUFFER_SIZE 1024
+
 /* If SMSG is used */
 static int  SMSG_MAX_MSG = 1024;
-//static int  log2_SMSG_MAX_MSG;
-#define SMSG_MAX_CREDIT  36
+#define SMSG_MAX_CREDIT 72 
 
 #define MSGQ_MAXSIZE       2048
 /* large message transfer with FMA or BTE */
-#define LRTS_GNI_RDMA_THRESHOLD  2048
-//2048
+#define LRTS_GNI_RDMA_THRESHOLD  1024 
 
-#define REMOTE_QUEUE_ENTRIES  20480 
-#define LOCAL_QUEUE_ENTRIES   20480 
+#if CMK_SMP
+static int  REMOTE_QUEUE_ENTRIES=163840; 
+static int LOCAL_QUEUE_ENTRIES=163840; 
+#else
+static int  REMOTE_QUEUE_ENTRIES=20480;
+static int LOCAL_QUEUE_ENTRIES=20480; 
+#endif
 
-#define BIG_MSG_TAG  0x26
-#define PUT_DONE_TAG      0x29
-#define ACK_TAG           0x30
+#define BIG_MSG_TAG             0x26
+#define PUT_DONE_TAG            0x28
+#define DIRECT_PUT_DONE_TAG     0x29
+#define ACK_TAG                 0x30
 /* SMSG is data message */
 #define SMALL_DATA_TAG          0x31
+#define SMALL_DATA_ACK_TAG      0x32
 /* SMSG is a control message to initialize a BTE */
-#define MEDIUM_HEAD_TAG         0x32
-#define MEDIUM_DATA_TAG         0x33
-#define LMSG_INIT_TAG     0x39 
-#define VERY_LMSG_INIT_TAG     0x40 
-#define VERY_LMSG_TAG     0x41 
+#define LMSG_INIT_TAG           0x39 
 
 #define DEBUG
 #ifdef GNI_RC_CHECK
@@ -148,9 +310,8 @@ static int  SMSG_MAX_MSG = 1024;
 
 #define ALIGN64(x)       (size_t)((~63)&((x)+63))
 //#define ALIGN4(x)        (size_t)((~3)&((x)+3)) 
+#define ALIGNHUGEPAGE(x)   (size_t)((~(_tlbpagesize-1))&((x)+_tlbpagesize-1))
 
-#define     useDynamicSMSG    0
-//static int useDynamicSMSG   = 1;
 static int useStaticMSGQ = 0;
 static int useStaticFMA = 0;
 static int mysize, myrank;
@@ -169,10 +330,7 @@ typedef struct mdh_addr_list{
 }mdh_addr_list_t;
 
 static unsigned int         smsg_memlen;
-#define     SMSG_CONN_SIZE     sizeof(gni_smsg_attr_t)
 gni_smsg_attr_t    **smsg_local_attr_vec = 0;
-int                 *smsg_connected_flag= 0;
-char                *smsg_connection_addr = 0;
 mdh_addr_t          setup_mem;
 mdh_addr_t          *smsg_connection_vec = 0;
 gni_mem_handle_t    smsg_connection_memhndl;
@@ -187,38 +345,6 @@ gni_msgq_handle_t       msgq_handle;
 gni_msgq_ep_attr_t      msgq_ep_attrs;
 gni_msgq_ep_attr_t      msgq_ep_attrs_size;
 
-
-
-/* preallocated DMA buffer */
-int                     DMA_slots;
-uint64_t                DMA_avail_tag = 0;
-uint32_t                DMA_incoming_avail_tag = 0;
-uint32_t                DMA_outgoing_avail_tag = 0;
-void                    *DMA_incoming_base_addr;
-void                    *DMA_outgoing_base_addr;
-mdh_addr_t              DMA_buffer_base_mdh_addr;
-mdh_addr_t              *DMA_buffer_base_mdh_addr_vec;
-int                     DMA_buffer_size;
-int                     DMA_max_single_msg = 131072;//524288 ;
-
-#define                 DMA_SIZE_PER_SLOT       8192
-
-
-typedef struct dma_msgid_map
-{
-    uint64_t     msg_id;
-    int     msg_subid;
-} dma_msgid_map_t;
-
-dma_msgid_map_t         *dma_map_list;
-
-typedef struct msg_trace
-{
-    uint64_t    msg_id;
-    int         done_num;
-}msg_trace_t;
-
-msg_trace_t             *pending_msg_list;
 /* =====Beginning of Declarations of Machine Specific Variables===== */
 static int cookie;
 static int modes = 0;
@@ -227,84 +353,135 @@ static gni_cq_handle_t       smsg_tx_cqh = NULL;
 static gni_cq_handle_t       post_rx_cqh = NULL;
 static gni_cq_handle_t       post_tx_cqh = NULL;
 static gni_ep_handle_t       *ep_hndl_array;
-
+#if CMK_SMP && MULTI_THREAD_SEND
+static CmiNodeLock           *ep_lock_array;
+static CmiNodeLock           tx_cq_lock; 
+static CmiNodeLock           rx_cq_lock;
+static CmiNodeLock           *mempool_lock;
+#endif
 
 typedef struct msg_list
 {
     uint32_t destNode;
     uint32_t size;
     void *msg;
-    struct msg_list *next;
     uint8_t tag;
+#if !CMK_SMP
+    struct msg_list *next;
+#endif
 }MSG_LIST;
 
-typedef struct medium_msg_list
-{
-    uint32_t destNode;
-    uint32_t msg_id;
-    uint32_t msg_subid;
-    uint32_t remain_size;
-    void *msg;
-    struct medium_msg_list *next;
-}MEDIUM_MSG_LIST;
-
 
 typedef struct control_msg
 {
-    uint64_t            source_addr;
-    uint64_t            dest_addr;
-    int                 source;               /* source rank */
-    int                 length;
-    int                 seq_id;                 //big message   -1 meaning single message
+    uint64_t            source_addr;    /* address from the start of buffer  */
+    uint64_t            dest_addr;      /* address from the start of buffer */
+    int                 total_length;   /* total length */
+    int                 length;         /* length of this packet */
+    uint8_t             seq_id;         //big message   0 meaning single message
     gni_mem_handle_t    source_mem_hndl;
     struct control_msg *next;
-}CONTROL_MSG;
+} CONTROL_MSG;
+
+#define CONTROL_MSG_SIZE       (sizeof(CONTROL_MSG)-sizeof(void*))
+
+/* Acknowledgement record. When USE_LRTS_MEMPOOL is off it additionally carries
+ * the source memory handle and length. */
+typedef struct ack_msg
+{
+    uint64_t            source_addr;    /* address from the start of buffer  */
+#if ! USE_LRTS_MEMPOOL
+    gni_mem_handle_t    source_mem_hndl;
+    int                 length;          /* total length */
+#endif
+    struct ack_msg     *next;           /* link to the next ACK_MSG */
+} ACK_MSG;
+
+/* Size of an ACK_MSG minus one pointer, i.e. without the trailing next link.
+ * NOTE(review): presumably the number of bytes actually sent -- confirm. */
+#define ACK_MSG_SIZE       (sizeof(ACK_MSG)-sizeof(void*))
 
-typedef struct medium_msg_control
+#if CMK_DIRECT
+typedef struct{
+    uint64_t    handler_addr;
+}CMK_DIRECT_HEADER;
+
+typedef struct {
+    char core[CmiMsgHeaderSizeBytes];
+    uint64_t handler;
+}cmidirectMsg;
+
+//SYH
+CpvDeclare(int, CmiHandleDirectIdx);
+/* Handler for cmidirectMsg messages: interprets msg->handler as the address
+ * of a CmiDirectUserHandle, invokes that handle's callback with its
+ * callbackData, then frees the message. */
+void CmiHandleDirectMsg(cmidirectMsg* msg)
+{
+
+    CmiDirectUserHandle *_handle= (CmiDirectUserHandle*)(msg->handler);
+   (*(_handle->callbackFnPtr))(_handle->callbackData);
+   CmiFree(msg);
+}
+
+void CmiDirectInit()
 {
-    uint64_t            dma_offset;     //the dma_buffer for this block of msg
-    int                 msg_id;         //Id for the total index
-    int                 msg_subid;      //offset inside the message id 
-}MEDIUM_MSG_CONTROL;
+    CpvInitialize(int,  CmiHandleDirectIdx);
+    CpvAccess(CmiHandleDirectIdx) = CmiRegisterHandler( (CmiHandler) CmiHandleDirectMsg);
+}
 
+#endif
 typedef struct  rmda_msg
 {
     int                   destNode;
     gni_post_descriptor_t *pd;
+#if !CMK_SMP
     struct  rmda_msg      *next;
+#endif
 }RDMA_REQUEST;
 
+
+#if CMK_SMP
+#define SMP_LOCKS               0
+#define ONE_SEND_QUEUE                  0
 PCQueue sendRdmaBuf;
+typedef struct  msg_list_index
+{
+    PCQueue     sendSmsgBuf;
+    int         pushed;
+    CmiNodeLock   lock;
+} MSG_LIST_INDEX;
+char                *destpe_avail;
+#if  !ONE_SEND_QUEUE && SMP_LOCKS
+    PCQueue     nonEmptyQueues;
+#endif
+#else         /* non-smp */
 
+static RDMA_REQUEST        *sendRdmaBuf = 0;
+static RDMA_REQUEST        *sendRdmaTail = 0;
 typedef struct  msg_list_index
 {
     int         next;
-    PCQueue     sendSmsgBuf;
-    //MSG_LIST    *head;
-    //MSG_LIST    *tail;
+    MSG_LIST    *sendSmsgBuf;
+    MSG_LIST    *tail;
 } MSG_LIST_INDEX;
 
-/* reuse PendingMsg memory */
-static CONTROL_MSG          *control_freelist=0;
-static MSG_LIST             *msglist_freelist=0;
-static int                  smsg_head_index;
-static MSG_LIST_INDEX       *smsg_msglist_index= 0;
-static MSG_LIST             *smsg_free_head=0;
-static MSG_LIST             *smsg_free_tail=0;
-
-/*
-#define FreeMsgList(msg_head, msg_tail, free_head, free_tail)       \
-    if(free_head == 0)  free_head = free_tail = msg_head;    \
-    else   free_tail = free_tail->next;    \
-    if( msg_head->next == msg_tail) msg_head =0;   \
-    else msg_head= msg_head->next;    
-
-#define MallocMsgList(d, msg_head, msg_tail, free_head, free_tail, msgsize) \
-    if(free_head == 0) {d= malloc(msgsize);  \
-        if(msg_head == 0)   msg_head =msg_tail = msg_head->next = msg_tail->next = d; \
-        else { msg_tail->next = d; d->next = msg_head; msg_tail=d;} \
-    }else {d = free_head; free_head = free_head->next; if(free_tail->next == free_head) free_head =0;} \
-*/
+#endif
+
+// buffered send queue
+#if ! ONE_SEND_QUEUE
+typedef struct smsg_queue
+{
+    MSG_LIST_INDEX   *smsg_msglist_index;
+    int               smsg_head_index;
+} SMSG_QUEUE;
+#else
+typedef struct smsg_queue
+{
+    PCQueue       sendMsgBuf;
+}  SMSG_QUEUE;
+#endif
+
+SMSG_QUEUE                  smsg_queue;
+#if PIGGYBACK_ACK
+SMSG_QUEUE                  smsg_ack_queue;
+#endif
+#if CMK_USE_OOB
+SMSG_QUEUE                  smsg_oob_queue;
+#endif
 
 #if CMK_SMP
 
@@ -313,15 +490,22 @@ static MSG_LIST             *smsg_free_tail=0;
 
 #else
 
+static MSG_LIST       *msglist_freelist=0;
+
 #define FreeMsgList(d)  \
+  do { \
   (d)->next = msglist_freelist;\
-  msglist_freelist = d;
+  msglist_freelist = d; \
+  } while (0)
 
 #define MallocMsgList(d) \
+  do {  \
   d = msglist_freelist;\
   if (d==0) {d = ((MSG_LIST*)malloc(sizeof(MSG_LIST)));\
              _MEMCHECK(d);\
-  } else msglist_freelist = d->next;
+  } else msglist_freelist = d->next; \
+  d->next =0;  \
+  } while (0)
 
 #endif
 
@@ -332,50 +516,77 @@ static MSG_LIST             *smsg_free_tail=0;
 
 #else
 
+static CONTROL_MSG    *control_freelist=0;
+
 #define FreeControlMsg(d)       \
+  do { \
   (d)->next = control_freelist;\
-  control_freelist = d;
+  control_freelist = d; \
+  } while (0);
 
 #define MallocControlMsg(d) \
+  do {  \
   d = control_freelist;\
   if (d==0) {d = ((CONTROL_MSG*)malloc(sizeof(CONTROL_MSG)));\
              _MEMCHECK(d);\
-  } else control_freelist = d->next;
+  } else control_freelist = d->next;  \
+  } while (0);
 
 #endif
 
-static RDMA_REQUEST         *rdma_freelist = NULL;
+#if CMK_SMP
 
-#define FreeMediumControlMsg(d)       \
-  (d)->next = medium_control_freelist;\
-  medium_control_freelist = d;
+#define FreeAckMsg(d)      free(d);
+#define MallocAckMsg(d)    d = ((ACK_MSG*)malloc(sizeof(ACK_MSG)));
 
+#else
+
+static ACK_MSG        *ack_freelist=0;
+
+#define FreeAckMsg(d)       \
+  do { \
+  (d)->next = ack_freelist;\
+  ack_freelist = d; \
+  } while (0)
+
+#define MallocAckMsg(d) \
+  do { \
+  d = ack_freelist;\
+  if (d==0) {d = ((ACK_MSG*)malloc(sizeof(ACK_MSG)));\
+             _MEMCHECK(d);\
+  } else ack_freelist = d->next; \
+  } while (0)
+
+#endif
 
-#define MallocMediumControlMsg(d) \
-    d = medium_control_freelist;\
-    if (d==0) {d = ((MEDIUM_MSG_CONTROL*)malloc(sizeof(MEDIUM_MSG_CONTROL)));\
-    _MEMCHECK(d);\
-} else mediumcontrol_freelist = d->next;
 
 # if CMK_SMP
 #define FreeRdmaRequest(d)       free(d);
 #define MallocRdmaRequest(d)     d = ((RDMA_REQUEST*)malloc(sizeof(RDMA_REQUEST)));   
 #else
 
+static RDMA_REQUEST         *rdma_freelist = NULL;
+
 #define FreeRdmaRequest(d)       \
+  do {  \
   (d)->next = rdma_freelist;\
-  rdma_freelist = d;
+  rdma_freelist = d;    \
+  } while (0)
 
 #define MallocRdmaRequest(d) \
+  do {   \
   d = rdma_freelist;\
   if (d==0) {d = ((RDMA_REQUEST*)malloc(sizeof(RDMA_REQUEST)));\
              _MEMCHECK(d);\
-  } else rdma_freelist = d->next;
+  } else rdma_freelist = d->next; \
+  d->next =0;   \
+  } while (0)
 #endif
+
 /* reuse gni_post_descriptor_t */
 static gni_post_descriptor_t *post_freelist=0;
 
-#if !CMK_SMP
+#if  !CMK_SMP
 #define FreePostDesc(d)       \
     (d)->next_descr = post_freelist;\
     post_freelist = d;
@@ -384,7 +595,8 @@ static gni_post_descriptor_t *post_freelist=0;
   d = post_freelist;\
   if (d==0) { \
      d = ((gni_post_descriptor_t*)malloc(sizeof(gni_post_descriptor_t)));\
-     _MEMCHECK(d);\
+     _MEMCHECK(d); /* check the allocation before the first dereference */ \
+     d->next_descr = 0;\
   } else post_freelist = d->next_descr;
 #else
 
@@ -393,6 +605,7 @@ static gni_post_descriptor_t *post_freelist=0;
 
 #endif
 
+
 /* LrtsSent is called but message can not be sent by SMSGSend because of mailbox full or no credit */
 static int      buffered_smsg_counter = 0;
 
@@ -471,6 +684,7 @@ allgather(void *in,void *out, int len)
 
     free(tmp_buf);
 }
+
 static void
 allgather_2(void *in,void *out, int len)
 {
@@ -569,6 +783,41 @@ static uint32_t get_cookie(void)
     return cookie;
 }
 
+#if LARGEPAGE
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+// size must be _tlbpagesize aligned
+/* Map size bytes backed by a temporary file on the hugetlbfs mount that
+ * serves pages of _tlbpagesize bytes.
+ * The file is created under that mount, mapped MAP_PRIVATE, then closed and
+ * unlinked, so only the mapping keeps it alive.
+ * Returns the mapped address, or NULL if mmap fails. Aborts if no hugetlbfs
+ * mount is found or the backing file cannot be opened. */
+void *my_get_huge_pages(size_t size)
+{
+    char filename[512];
+    int fd;
+    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+    void *ptr = NULL;
+    const char *dir = hugetlbfs_find_path_for_size(_tlbpagesize);
+
+    /* a NULL path must not reach the %s conversion below */
+    if (dir == NULL) {
+        CmiAbort("my_get_huge_pages: no hugetlbfs mount found for the page size");
+    }
+    snprintf(filename, sizeof(filename), "%s/charm_mempool.%d.%d", dir, getpid(), rand());
+    fd = open(filename, O_RDWR | O_CREAT, mode);
+    if (fd == -1) {
+        CmiAbort("my_get_huge_pages: open failed");
+    }
+    ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (ptr == MAP_FAILED) ptr = NULL;
+//printf("[%d] my_get_huge_pages: %s %d %p\n", myrank, filename, size, ptr);
+    close(fd);
+    unlink(filename);
+    return ptr;
+}
+
+/* Unmap a region of size bytes at ptr with munmap(); aborts if munmap
+ * reports an error. */
+void my_free_huge_pages(void *ptr, int size)
+{
+//printf("[%d] my_free_huge_pages: %p %d\n", myrank, ptr, size);
+    int ret = munmap(ptr, size);
+    if (ret == -1) CmiAbort("munmap failed in my_free_huge_pages");
+}
+
+#endif
+
 /* =====Beginning of Definitions of Message-Corruption Related Macros=====*/
 /* TODO: add any that are related */
 /* =====End of Definitions of Message-Corruption Related Macros=====*/
@@ -586,12 +835,110 @@ void CmiMachineProgressImpl() {
 
 static void SendRdmaMsg();
 static void PumpNetworkSmsg();
-static void PumpLocalSmsgTransactions();
 static void PumpLocalRdmaTransactions();
-static int SendBufferMsg();
+static int SendBufferMsg(SMSG_QUEUE *queue);
+
+#if MACHINE_DEBUG_LOG
+FILE *debugLog = NULL;
+static CmiInt8 buffered_recv_msg = 0;
+int         lrts_smsg_success = 0;
+int         lrts_received_msg = 0;
+#endif
+/* Debug helper: walk the block list of mempool `mptr` and print one line per block to stdout. */
+static void sweep_mempool(mempool_type *mptr)
+{
+    int n = 0;   /* running line counter included in the output */
+    block_header *current = &(mptr->block_head);
+
+    printf("[n %d %d] sweep_mempool slot START.\n", myrank, n++);
+    while( current!= NULL) {
+        printf("[n %d %d] sweep_mempool slot %p size: %d (%d %d) %lld %lld.\n", myrank, n++, current, current->size, current->msgs_in_send, current->msgs_in_recv, current->mem_hndl.qword1, current->mem_hndl.qword2);
+        current = current->block_next?(block_header *)((char*)mptr+current->block_next):NULL;   /* block_next is a byte offset from mptr; 0 ends the walk */
+    }
+    printf("[n %d] sweep_mempool slot END.\n", myrank);
+}
+/* Deregister the first block at or after *from that has no messages in send/recv and a non-zero memory handle; *from is left on that block, or NULL if none. */
+inline
+static  gni_return_t deregisterMemory(mempool_type *mptr, block_header **from)
+{
+    block_header *current = *from;
+
+    //while(register_memory_size>= MAX_REG_MEM)
+    //{
+        while( current!= NULL && ((current->msgs_in_send+current->msgs_in_recv)>0 || IsMemHndlZero(current->mem_hndl) ))   /* skip blocks still in use or with a zero handle */
+            current = current->block_next?(block_header *)((char*)mptr+current->block_next):NULL;   /* block_next is a byte offset from mptr */
+
+        *from = current;   /* lets the caller resume the scan from here on its next call */
+        if(current == NULL) return GNI_RC_ERROR_RESOURCE;   /* no candidate block was found */
+        MEMORY_DEREGISTER(onesided_hnd, nic_hndl, &(GetMemHndlFromBlockHeader(current)) , &omdh, GetSizeFromBlockHeader(current));
+        SetMemHndlZero(GetMemHndlFromBlockHeader(current));   /* zero handle: later scans skip this block */
+    //}
+    return GNI_RC_SUCCESS;
+}
+/* Register `size` bytes at `blockaddr` into *memhndl, deregistering idle blocks of mempool `mptr` to make room; returns GNI_RC_SUCCESS or the failing status. */
+inline 
+static gni_return_t registerFromMempool(mempool_type *mptr, void *blockaddr, size_t size, gni_mem_handle_t  *memhndl)
+{
+    gni_return_t status = GNI_RC_SUCCESS;
+    //int size = GetMempoolsize(msg);
+    //void *blockaddr = GetMempoolBlockPtr(msg);
+    //gni_mem_handle_t  *memhndl =   &(GetMemHndl(msg));
+   
+    block_header *current = &(mptr->block_head);   /* scan cursor shared by both deregistration loops below */
+    while(register_memory_size>= MAX_REG_MEM)   /* presumably MEMORY_DEREGISTER lowers register_memory_size - TODO confirm */
+    {
+        status = deregisterMemory(mptr, &current);
+        if (status != GNI_RC_SUCCESS) break;   /* nothing more can be released from this pool */
+    }
+    if(register_memory_size>= MAX_REG_MEM) return status;   /* still at the limit: give up without registering */
+
+    MACHSTATE3(8, "mempool (%lld,%lld,%d) \n", buffered_send_msg, buffered_recv_msg, register_memory_size); 
+    while(1)   /* retry the registration, releasing one more block after each recoverable failure */
+    {
+        MEMORY_REGISTER(onesided_hnd, nic_hndl, blockaddr, size, memhndl, &omdh, status);
+        if(status == GNI_RC_SUCCESS)
+        {
+            break;
+        }
+        else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+        {
+            CmiAbort("Memory registor for mempool fails\n");   /* these two statuses are treated as fatal */
+        }
+        else
+        {
+            status = deregisterMemory(mptr, &current);
+            if (status != GNI_RC_SUCCESS) break;   /* out of blocks to release: report the failure */
+        }
+    }; 
+    return status;
+}
+/* Register `size` bytes at `msg` into *t, trying this PE's mempool first and, under CMK_SMP, the other ranks' mempools; GNI_RC_ERROR_RESOURCE if all fail. */
+inline 
+static gni_return_t registerMemory(void *msg, size_t size, gni_mem_handle_t *t)
+{
+    static int rank = -1;   /* persists across calls, so each SMP scan continues after the last rank tried */
+    int i;
+    gni_return_t status;
+    mempool_type *mptr1 = CpvAccess(mempool);//mempool_type*)GetMempoolPtr(msg);
+    //mempool_type *mptr1 = (mempool_type*)GetMempoolPtr(msg);
+    mempool_type *mptr;
+
+    status = registerFromMempool(mptr1, msg, size, t);
+    if (status == GNI_RC_SUCCESS) return status;
+#if CMK_SMP 
+    for (i=0; i<CmiMyNodeSize()+1; i++) {   /* node size + 1 ranks; presumably the extra one is the comm thread - TODO confirm */
+      rank = (rank+1)%(CmiMyNodeSize()+1);
+      mptr = CpvAccessOther(mempool, rank);
+      if (mptr == mptr1) continue;   /* the local pool was already tried above */
+      status = registerFromMempool(mptr, msg, size, t);
+      if (status == GNI_RC_SUCCESS) return status;
+    }
+#endif
+    return  GNI_RC_ERROR_RESOURCE;
+}
 
 inline
-static void buffer_small_msgs(void *msg, int size, int destNode, uint8_t tag)
+static void buffer_small_msgs(SMSG_QUEUE *queue, void *msg, int size, int destNode, uint8_t tag)
 {
     MSG_LIST        *msg_tmp;
     MallocMsgList(msg_tmp);
@@ -599,14 +946,34 @@ static void buffer_small_msgs(void *msg, int size, int destNode, uint8_t tag)
     msg_tmp->size   = size;
     msg_tmp->msg    = msg;
     msg_tmp->tag    = tag;
-    //msg_tmp->next   = 0;
+
 #if !CMK_SMP
-    if (PCQueueEmpty(smsg_msglist_index[destNode].sendSmsgBuf) ) {
-        smsg_msglist_index[destNode].next = smsg_head_index;
-        smsg_head_index = destNode;
+    if (queue->smsg_msglist_index[destNode].sendSmsgBuf == 0 ) {
+        queue->smsg_msglist_index[destNode].next = queue->smsg_head_index;
+        queue->smsg_head_index = destNode;
+        queue->smsg_msglist_index[destNode].tail = queue->smsg_msglist_index[destNode].sendSmsgBuf = msg_tmp;
+    }else
+    {
+        queue->smsg_msglist_index[destNode].tail->next = msg_tmp;
+        queue->smsg_msglist_index[destNode].tail = msg_tmp;
+    }
+#else
+#if ONE_SEND_QUEUE
+    PCQueuePush(queue->sendMsgBuf, (char*)msg_tmp);
+#else
+#if SMP_LOCKS
+    CmiLock(queue->smsg_msglist_index[destNode].lock);
+    if(queue->smsg_msglist_index[destNode].pushed == 0)
+    {
+        PCQueuePush(nonEmptyQueues, (char*)&(queue->smsg_msglist_index[destNode]));
     }
+    PCQueuePush(queue->smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
+    CmiUnlock(queue->smsg_msglist_index[destNode].lock);
+#else
+    PCQueuePush(queue->smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
+#endif
+#endif
 #endif
-    PCQueuePush(smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
 #if PRINT_SYH
     buffered_smsg_counter++;
 #endif
@@ -666,7 +1033,6 @@ static void setup_smsg_connection(int destNode)
     if(status == GNI_RC_ERROR_RESOURCE )
     {
         MallocRdmaRequest(rdma_request_msg);
-        rdma_request_msg->next = 0;
         rdma_request_msg->destNode = destNode;
         rdma_request_msg->pd = pd;
         /* buffer this request */
@@ -677,189 +1043,340 @@ static void setup_smsg_connection(int destNode)
     else
         printf("[%d=%d]OK send post FMA \n", myrank, destNode);
 #endif
-    //GNI_RC_CHECK("SMSG Dynamic link", status);
 }
 
+/* useDynamicSMSG */
 inline 
-static gni_return_t send_smsg_message(int destNode, void *header, int size_header, void *msg, int size, uint8_t tag, int inbuff )
+static void alloc_smsg_attr( gni_smsg_attr_t *local_smsg_attr)
 {
     gni_return_t status = GNI_RC_NOT_DONE;
-    gni_smsg_attr_t      *smsg_attr;
-    gni_post_descriptor_t *pd;
-#if useDynamicSMSG
-    //if(useDynamicSMSG == 1)
-    {
-        if(smsg_connected_flag[destNode] == 0)
-        {
-            //printf("[%d]Init smsg connection\n", CmiMyPe());
-            setup_smsg_connection(destNode);
-            buffer_small_msgs(msg, size, destNode, tag);
-            smsg_connected_flag[destNode] =10;
-            return status;
-        }
-        else  if(smsg_connected_flag[destNode] <20)
-        {
-            if(inbuff == 0)
-                buffer_small_msgs(msg, size, destNode, tag);
-            return status;
-        }
-    }
-#endif
-    //printf("[%d] reach send\n", myrank);
-    if(PCQueueEmpty(smsg_msglist_index[destNode].sendSmsgBuf) || inbuff==1)
+
+    if(mailbox_list->offset == mailbox_list->size)
     {
-        status = GNI_SmsgSendWTag(ep_hndl_array[destNode], header, size_header, msg, size, 0, tag);
-        if(status == GNI_RC_SUCCESS)
-        {
-#if PRINT_SYH
-            lrts_smsg_success++;
-            printf("[%d==>%d] send done%d (msgs=%d)\n", myrank, destNode, lrts_smsg_success, lrts_send_msg_id);
-#endif     
-            return status;
-        }
+        dynamic_smsg_mailbox_t *new_mailbox_entry;
+        new_mailbox_entry = (dynamic_smsg_mailbox_t*)malloc(sizeof(dynamic_smsg_mailbox_t));
+        new_mailbox_entry->size = smsg_memlen*avg_smsg_connection;
+        new_mailbox_entry->mailbox_base = malloc(new_mailbox_entry->size);
+        bzero(new_mailbox_entry->mailbox_base, new_mailbox_entry->size);
+        new_mailbox_entry->offset = 0;
+        
+        status = GNI_MemRegister(nic_hndl, (uint64_t)new_mailbox_entry->mailbox_base,
+            new_mailbox_entry->size, smsg_rx_cqh,
+            GNI_MEM_READWRITE,   
+            -1,
+            &(new_mailbox_entry->mem_hndl));
+
+        GNI_RC_CHECK("register", status);
+        new_mailbox_entry->next = mailbox_list;
+        mailbox_list = new_mailbox_entry;
     }
-    if(inbuff ==0)
-        buffer_small_msgs(msg, size, destNode, tag);
-    return status;
+    local_smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+    local_smsg_attr->mbox_maxcredit = SMSG_MAX_CREDIT;
+    local_smsg_attr->msg_maxsize = SMSG_MAX_MSG;
+    local_smsg_attr->mbox_offset = mailbox_list->offset;
+    mailbox_list->offset += smsg_memlen;
+    local_smsg_attr->buff_size = smsg_memlen;
+    local_smsg_attr->msg_buffer = mailbox_list->mailbox_base;
+    local_smsg_attr->mem_hndl = mailbox_list->mem_hndl;
 }
 
-// Get first 0 in DMA_tags starting from index
-static int get_first_avail_bit(uint64_t DMA_tags, int start_index)
+/* useDynamicSMSG: post this node's SMSG attributes to destNode as a datagram; returns 1 once posted, 0 if the post hit GNI_RC_ERROR_RESOURCE. */
+inline 
+static int connect_to(int destNode)
 {
+    gni_return_t status = GNI_RC_NOT_DONE;
+    CmiAssert(smsg_connected_flag[destNode] == 0);
+    CmiAssert (smsg_attr_vector_local[destNode] == NULL);
+    smsg_attr_vector_local[destNode] = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
+    alloc_smsg_attr(smsg_attr_vector_local[destNode]);   /* fills the attributes and advances mailbox_list->offset by smsg_memlen */
+    smsg_attr_vector_remote[destNode] = (gni_smsg_attr_t*) malloc (sizeof(gni_smsg_attr_t));
+    
+    CMI_GNI_LOCK
+    status = GNI_EpPostDataWId (ep_hndl_array[destNode], smsg_attr_vector_local[destNode], sizeof(gni_smsg_attr_t),smsg_attr_vector_remote[destNode] ,sizeof(gni_smsg_attr_t), destNode+mysize);   /* datagram id is destNode+mysize */
+    CMI_GNI_UNLOCK
+    if (status == GNI_RC_ERROR_RESOURCE) {
+      /* possibly destNode is making connection at the same time */
+      free(smsg_attr_vector_local[destNode]);
+      smsg_attr_vector_local[destNode] = NULL;
+      free(smsg_attr_vector_remote[destNode]);
+      smsg_attr_vector_remote[destNode] = NULL;
+      mailbox_list->offset -= smsg_memlen;   /* give back the mailbox slot taken by alloc_smsg_attr */
+      return 0;
+    }
+    GNI_RC_CHECK("GNI_Post", status);
+    smsg_connected_flag[destNode] = 1;   /* 1 = connection pending (see the switch in send_smsg_message) */
+    return 1;
+}
 
-    uint64_t         mask = 0x1;
-    register    int     i=0;
-    while((DMA_tags & mask) && i<DMA_slots) {mask << 1; i++;}
-
+#if PIGGYBACK_ACK
+static void * piggyback_ack(int destNode, int msgsize, int *count)   /* pack queued ACKs for destNode into a CmiTmpAlloc'ed uint64_t buffer, or return NULL */
+{
+    int i;
+    if (PCQueueEmpty(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf)) return NULL;   /* no ACK is waiting for destNode */
+    int len = PCQueueLength(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf);
+    int piggycount = (SMSG_MAX_MSG - msgsize)/sizeof(uint64_t);   /* 64-bit words that fit beside a msgsize-byte message */
+    if (piggycount > len+1) piggycount = len + 1;   /* one word per queued ACK plus the leading count word */
+    if (piggycount <= 5) return NULL;   /* fewer than five ACKs would be carried: not piggybacked; *count is left untouched */
+    uint64_t * buf = (uint64_t*)CmiTmpAlloc(piggycount * sizeof(uint64_t));
+    CmiAssert(buf != NULL);
+    buf[0] = piggycount-1;   /* word 0 holds the number of ACK addresses that follow */
+//printf("[%d] piggyback_ack: %d\n", myrank, piggycount);
+    for (i=0; i<piggycount-1; i++) {
+        MSG_LIST *ptr = (MSG_LIST*)PCQueuePop(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf);
+        CmiAssert(ptr != NULL);
+        ACK_MSG *msg = ptr->msg;
+        buf[i+1] = msg->source_addr;   /* only the address is kept; the ACK message and its list node are freed */
+        FreeAckMsg(msg);
+        FreeMsgList(ptr);
+    }
+    *count = piggycount;   /* total words in buf, including the count word */
+    return buf;
 }
 
-static int send_medium_messages(int destNode, int size, char *msg)
+
+static void piggyback_ack_done(int destNode, uint64_t *buf, int done)
 {
-#if 0
-    gni_return_t status = GNI_RC_SUCCESS;
-    int first_avail_bit=0;
-    uint64_t mask = 0x1;
-    MEDIUM_MSG_CONTROL  *medium_msg_control_tmp;
-    MEDIUM_MSG_LIST        *msg_tmp;
-    int blocksize, remain_size, pos;
-    int sub_id = 0;
-    remain_size = size;
-    pos = 0;  //offset before which data are sent
-    /* copy blocks of the message to DMA preallocated buffer and send SMSG */
-    //Check whether there is any available DMA buffer
-    
-    do{
-        while((DMA_avail_tag & mask) && first_avail_bit<DMA_slots) {mask << 1; first_avail_bit++;}
-        if(first_avail_bit == DMA_slots) //No available DMA, buffer this message
-        {
-            MallocMediumMsgList(msg_tmp);
+    if (!done)
+    {
+        int i;
+        for (i=0; i<buf[0]; i++) {
+            MSG_LIST *msg_tmp;
+            MallocMsgList(msg_tmp);
+            ACK_MSG  *ack_msg;
+            MallocAckMsg(ack_msg);
+            ack_msg->source_addr = buf[i+1];
+            msg_tmp->size = ACK_MSG_SIZE;
+            msg_tmp->msg = ack_msg;
+            msg_tmp->tag = ACK_TAG;
             msg_tmp->destNode = destNode;
-            msg_tmp->msg_id   = lrts_send_msg_id;
-            msg_tmp->msg_subid   = sub_id;
-            msg_tmp->size   = remain_size;
-            msg_tmp->msg    = msg+pos;
-            msg_tmp->next   = NULL;
-            break;
-        }else
-        {
-            //copy this part of the message into this DMA buffer
-            //TODO optimize here, some data can go with this SMSG
-            blocksize = (remain_size>DMA_SIZE_PER_SLOT)?DMA_SIZE_PER_SLOT: remain_size;
-            memcpy(DMA_buffer_base_mdh_addr.addr[first_avail_bit], msg+pos, blocksize);
-            pos += blocksize;
-            remain_size -= blocksize;
-            SET_BITS(DMA_avail_tag, first_avail_bit);
-           
-            MallocMediumControlMsg(medium_msg_control_tmp);
-            medium_msg_control_tmp->msg_id = lrts_send_msg_id;
-            medium_msg_control_tmp->msg_subid = sub_id;
-            if(status == GNI_RC_SUCCESS)
-            {
-                if(sub_id==0)
-                    status = GNI_SmsgSendWTag(ep_hndl_array[destNode], NULL, 0, medium_msg_tmp, sizeof(MEDIUM_MSG_CONTROL), 0, MEDIUM_HEAD_TAG);
-                else
-                    status = GNI_SmsgSendWTag(ep_hndl_array[destNode], NULL, 0, medium_msg_tmp, sizeof(MEDIUM_MSG_CONTROL), 0, MEDIUM_DATA_TAG);
-            }
-            //buffer this smsg
-            if(status != GNI_RC_SUCCESS)
-            {
-                buffer_small_msgs(medium_msg_tmp, sizeof(MEDIUM_MSG_CONTROL), destNode, MEDIUM_HEAD_TAG);
-            }
-            sub_id++;
-        }while(remain_size > 0 );
-
+            PCQueuePush(smsg_ack_queue.smsg_msglist_index[destNode].sendSmsgBuf, (char*)msg_tmp);
         }
     }
-#endif
+    CmiTmpFree(buf);
 }
+#endif
 
-inline static CONTROL_MSG* construct_control_msg(int size, char *msg)
+/* Try to send one small message to destNode with GNI_SmsgSendWTag.  With useDynamicSMSG a connection
+ * is requested first and the message is only buffered until it exists.  inbuff==1 means msg already
+ * sits in `queue`: the send is attempted even if the queue is non-empty and msg is never re-buffered.
+ * Returns GNI_RC_SUCCESS if sent, GNI_RC_NOT_DONE while the connection is pending, and
+ * GNI_RC_ERROR_RESOURCE otherwise (msg is then appended to `queue` when inbuff==0). */
+inline 
+static gni_return_t send_smsg_message(SMSG_QUEUE *queue, int destNode, void *msg, int size, uint8_t tag, int inbuff )
+{
+    gni_return_t          status = GNI_RC_ERROR_RESOURCE;
+    char                  *real_data = NULL;   /* message whose trace ids are touched; only used when tracing */
+
+    if (useDynamicSMSG) {
+        switch (smsg_connected_flag[destNode]) {
+        case 0: 
+            connect_to(destNode);         /* continue to case 1 */
+        case 1:                           /* pending connection, do nothing */
+            status = GNI_RC_NOT_DONE;
+            if(inbuff ==0)
+                buffer_small_msgs(queue, msg, size, destNode, tag);
+            return status;
+        }
+    }
+#if CMK_SMP
+#if ! ONE_SEND_QUEUE
+    if(PCQueueEmpty(queue->smsg_msglist_index[destNode].sendSmsgBuf) || inbuff==1)
+#endif
+    {
+#else
+    if(queue->smsg_msglist_index[destNode].sendSmsgBuf == 0 || inbuff==1)
+    {
+#endif
+        uint64_t *buf = NULL;   /* optional piggybacked ACK words, passed as the SMSG header */
+        int bufsize = 0;
+#if PIGGYBACK_ACK
+        if (tag == SMALL_DATA_TAG) {
+            int nack = 0;
+            buf = piggyback_ack(destNode, size, &nack);
+            if (buf) {
+                tag = SMALL_DATA_ACK_TAG;
+                bufsize = nack * sizeof(uint64_t);
+            }
+        }
+#endif
+        CMI_GNI_LOCK
+#if CMK_SMP_TRACE_COMMTHREAD
+        int oldpe = -1;
+        int oldeventid = -1;
+        if(tag == SMALL_DATA_TAG || tag == SMALL_DATA_ACK_TAG || tag == LMSG_INIT_TAG)   /* same tag set as the creation trace below */
+        { 
+            START_EVENT();
+            if ( tag == LMSG_INIT_TAG)
+                real_data = (char*)(((CONTROL_MSG*)msg)->source_addr);   /* control message: trace the payload it points at */
+            else 
+                real_data = (char*)msg;   /* small data, with or without piggybacked ACKs: msg is the payload */
+            TRACE_COMM_GET_MSGID(real_data, &oldpe, &oldeventid);
+            TRACE_COMM_SET_COMM_MSGID(real_data);
+        }
+#endif
+        status = GNI_SmsgSendWTag(ep_hndl_array[destNode], buf, bufsize, msg, size, 0, tag);
+#if CMK_SMP_TRACE_COMMTHREAD
+        if (oldpe != -1)  TRACE_COMM_SET_MSGID(real_data, oldpe, oldeventid);
+#endif
+        CMI_GNI_UNLOCK
+        if(status == GNI_RC_SUCCESS)
+        {
+#if CMK_SMP_TRACE_COMMTHREAD
+            if(tag == SMALL_DATA_TAG || tag == LMSG_INIT_TAG || tag == SMALL_DATA_ACK_TAG)
+            { 
+                TRACE_COMM_CREATION(CpvAccess(projTraceStart), real_data);
+            }
+#endif
+            smsg_send_count ++;
+        }else
+            status = GNI_RC_ERROR_RESOURCE;   /* every send failure is reported as a resource error */
+#if PIGGYBACK_ACK
+        if (buf) {
+            piggyback_ack_done(destNode, buf, status==GNI_RC_SUCCESS);   /* re-queues the ACKs if the send failed, then frees buf */
+            tag = SMALL_DATA_TAG;   /* restore the tag so a buffered copy is queued as plain small data */
+        }
+#endif
+    }
+    if(status != GNI_RC_SUCCESS && inbuff ==0)
+        buffer_small_msgs(queue, msg, size, destNode, tag);
+    return status;
+}
+
+inline 
+static CONTROL_MSG* construct_control_msg(int size, char *msg, uint8_t seqno)
 {
     /* construct a control message and send */
     CONTROL_MSG         *control_msg_tmp;
     MallocControlMsg(control_msg_tmp);
-    control_msg_tmp->source_addr    = (uint64_t)msg;
-    control_msg_tmp->source         = myrank;
-    control_msg_tmp->length         =ALIGN64(size); //for GET 4 bytes aligned 
+    control_msg_tmp->source_addr = (uint64_t)msg;
+    control_msg_tmp->seq_id    = seqno;
+    control_msg_tmp->total_length = control_msg_tmp->length = ALIGN64(size); //for GET 4 bytes aligned 
 #if     USE_LRTS_MEMPOOL
     if(size < BIG_MSG)
+    {
         control_msg_tmp->source_mem_hndl = GetMemHndl(msg);
+    }
     else
     {
-        control_msg_tmp->source_mem_hndl.qword1 = 0;
-        control_msg_tmp->source_mem_hndl.qword2 = 0;
+        SetMemHndlZero(control_msg_tmp->source_mem_hndl);
+        control_msg_tmp->length = size - (seqno-1)*ONE_SEG;
+        if (control_msg_tmp->length > ONE_SEG) control_msg_tmp->length = ONE_SEG;
     }
 #else
-    control_msg_tmp->source_mem_hndl.qword1 = 0;
-    control_msg_tmp->source_mem_hndl.qword2 = 0;
+    SetMemHndlZero(control_msg_tmp->source_mem_hndl);
 #endif
     return control_msg_tmp;
 }
 
-// Large message, send control to receiver, receiver register memory and do a GET 
+#define BLOCKING_SEND_CONTROL    0
+
+// Large message, send control to receiver, receiver register memory and do a GET, 
+// returns GNI_RC_SUCCESS once the control message has been sent; any other status means it was not sent
 inline
-static void send_large_messages(int destNode, CONTROL_MSG  *control_msg_tmp)
+static gni_return_t send_large_messages(SMSG_QUEUE *queue, int destNode, CONTROL_MSG  *control_msg_tmp, int inbuff)
 {
-    gni_return_t        status  =   GNI_RC_SUCCESS;
+    gni_return_t        status  =  GNI_RC_ERROR_NOMEM;
     uint32_t            vmdh_index  = -1;
     int                 size;
-
-    size    =   control_msg_tmp->length;
-#if     USE_LRTS_MEMPOOL
-    if( control_msg_tmp ->seq_id == 0 ){
-        status = send_smsg_message( destNode, 0, 0, control_msg_tmp, sizeof(CONTROL_MSG), LMSG_INIT_TAG, 0);  
-        if(status == GNI_RC_SUCCESS)
-        {
-            FreeControlMsg(control_msg_tmp);
+    int                 offset = 0;
+    uint64_t            source_addr;
+    int                 register_size; 
+    void                *msg;
+
+    size    =   control_msg_tmp->total_length;
+    source_addr = control_msg_tmp->source_addr;
+    register_size = control_msg_tmp->length;
+
+#if  USE_LRTS_MEMPOOL
+    if( control_msg_tmp->seq_id == 0 ){
+#if BLOCKING_SEND_CONTROL
+        if (inbuff == 0 && IsMemHndlZero(GetMemHndl(source_addr))) {
+            while (IsMemHndlZero(GetMemHndl(source_addr)) && buffered_send_msg + GetMempoolsize((void*)source_addr) >= MAX_BUFF_SEND)
+                LrtsAdvanceCommunication(0);
         }
-    }else
-    {
-        if( control_msg_tmp->seq_id == 1)
-            size = size>ONE_SEG?ONE_SEG:size;
-
-        status = MEMORY_REGISTER(onesided_hnd, nic_hndl, control_msg_tmp->source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh);
-        if(status == GNI_RC_SUCCESS)
+#endif
+        if(IsMemHndlZero(GetMemHndl(source_addr))) //it is in mempool, it is possible to be de-registered by others
         {
-            status = send_smsg_message( destNode, 0, 0, control_msg_tmp, sizeof(CONTROL_MSG), LMSG_INIT_TAG, 0);  
+            msg = (void*)source_addr;
+            if(buffered_send_msg + GetMempoolsize(msg) >= MAX_BUFF_SEND)
+            {
+                if(!inbuff)
+                    buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
+                return GNI_RC_ERROR_NOMEM;
+            }
+            //register the corresponding mempool
+            status = registerMemory(GetMempoolBlockPtr(msg), GetMempoolsize(msg), &(GetMemHndl(msg)));
             if(status == GNI_RC_SUCCESS)
             {
-                FreeControlMsg(control_msg_tmp);
+                control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
             }
-        } else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+        }else
         {
-            CmiAbort("Memory registor for large msg\n");
-        }else 
+            control_msg_tmp->source_mem_hndl = GetMemHndl(source_addr);
+            status = GNI_RC_SUCCESS;
+        }
+        if(NoMsgInSend( control_msg_tmp->source_addr))
+            register_size = GetMempoolsize((void*)(control_msg_tmp->source_addr));
+        else
+            register_size = 0;
+    }else if(control_msg_tmp->seq_id >0)    // BIG_MSG
+    {
+        int offset = ONE_SEG*(control_msg_tmp->seq_id-1);
+        source_addr += offset;
+        size = control_msg_tmp->length;
+#if BLOCKING_SEND_CONTROL
+        if (inbuff == 0 && IsMemHndlZero(control_msg_tmp->source_mem_hndl)) {
+            while (IsMemHndlZero(control_msg_tmp->source_mem_hndl) && buffered_send_msg + size >= MAX_BUFF_SEND)
+                LrtsAdvanceCommunication(0);
+        }
+#endif
+        if (IsMemHndlZero(control_msg_tmp->source_mem_hndl)) {
+            if(buffered_send_msg + size >= MAX_BUFF_SEND)
+            {
+                if(!inbuff)
+                    buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
+                return GNI_RC_ERROR_NOMEM;
+            }
+            status = registerMemory((void*)source_addr, ALIGN64(size), &(control_msg_tmp->source_mem_hndl));
+            if(status == GNI_RC_SUCCESS) buffered_send_msg += ALIGN64(size);
+        }
+        else
         {
-            buffer_small_msgs(control_msg_tmp, sizeof(CONTROL_MSG), destNode, LMSG_INIT_TAG);
+            status = GNI_RC_SUCCESS;
         }
+        register_size = 0;  
+    }
 
+    if(status == GNI_RC_SUCCESS)
+    {
+        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, inbuff);  
+        if(status == GNI_RC_SUCCESS)
+        {
+            buffered_send_msg += register_size;
+            if(control_msg_tmp->seq_id == 0)
+            {
+                IncreaseMsgInSend(source_addr);
+            }
+            FreeControlMsg(control_msg_tmp);
+            MACHSTATE5(8, "GO SMSG LARGE to %d (%d,%d,%d) tag=%d\n", destNode, buffered_send_msg, buffered_recv_msg, register_memory_size, LMSG_INIT_TAG); 
+        }else
+            status = GNI_RC_ERROR_RESOURCE;
+
+    } else if (status == GNI_RC_INVALID_PARAM || status == GNI_RC_PERMISSION_ERROR)
+    {
+        CmiAbort("Memory registor for large msg\n");
+    }else 
+    {
+        status = GNI_RC_ERROR_NOMEM; 
+        if(!inbuff)
+            buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
     }
+    return status;
 #else
-    status = MEMORY_REGISTER(onesided_hnd, nic_hndl,msg, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh);
+    MEMORY_REGISTER(onesided_hnd, nic_hndl,msg, ALIGN64(size), &(control_msg_tmp->source_mem_hndl), &omdh, status)
     if(status == GNI_RC_SUCCESS)
     {
-        status = send_smsg_message( destNode, 0, 0, control_msg_tmp, sizeof(CONTROL_MSG), LMSG_INIT_TAG, 0);  
+        status = send_smsg_message(queue, destNode, control_msg_tmp, CONTROL_MSG_SIZE, LMSG_INIT_TAG, 0);  
         if(status == GNI_RC_SUCCESS)
         {
             FreeControlMsg(control_msg_tmp);
@@ -869,8 +1386,9 @@ static void send_large_messages(int destNode, CONTROL_MSG  *control_msg_tmp)
         CmiAbort("Memory registor for large msg\n");
     }else 
     {
-        buffer_small_msgs(control_msg_tmp, sizeof(CONTROL_MSG), destNode, LMSG_INIT_TAG);
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
     }
+    return status;
 #endif
 }
 
@@ -881,90 +1399,266 @@ inline void LrtsPrepareEnvelope(char *msg, int size)
 
 CmiCommHandle LrtsSendFunc(int destNode, int size, char *msg, int mode)
 {
-
     gni_return_t        status  =   GNI_RC_SUCCESS;
     uint8_t tag;
     CONTROL_MSG         *control_msg_tmp;
+    int                 oob = ( mode & OUT_OF_BAND);
+    SMSG_QUEUE          *queue;
+
+    MACHSTATE5(8, "GO LrtsSendFn %d(%d) (%d,%d, %d) \n", destNode, size, buffered_send_msg, buffered_recv_msg, register_memory_size); 
+#if CMK_USE_OOB
+    queue = oob? &smsg_oob_queue : &smsg_queue;
+#else
+    queue = &smsg_queue;
+#endif
+
     LrtsPrepareEnvelope(msg, size);
-#if CMK_SMP
-#if COMM_THREAD_SEND
+
+#if PRINT_SYH
+    printf("LrtsSendFn %d==>%d, size=%d\n", myrank, destNode, size);
+#endif 
+#if CMK_SMP && COMM_THREAD_SEND
     if(size <= SMSG_MAX_MSG)
-        buffer_small_msgs(msg, size, destNode, SMALL_DATA_TAG);
-    else
-    {
-        control_msg_tmp =  construct_control_msg(size, msg);
-        if(size < BIG_MSG)
-            control_msg_tmp->seq_id = 0;
-        else
-        {
-            control_msg_tmp->seq_id = 1;
-        }
-        buffer_small_msgs(control_msg_tmp, sizeof(CONTROL_MSG), destNode, LMSG_INIT_TAG);
+        buffer_small_msgs(queue, msg, size, destNode, SMALL_DATA_TAG);
+    else if (size < BIG_MSG) {
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
     }
-#endif
-#else
+    else {
+          CmiSetMsgSeq(msg, 0);
+          control_msg_tmp =  construct_control_msg(size, msg, 1);
+          buffer_small_msgs(queue, control_msg_tmp, CONTROL_MSG_SIZE, destNode, LMSG_INIT_TAG);
+    }
+#else   //non-smp, smp(worker sending)
     if(size <= SMSG_MAX_MSG)
     {
-        status = send_smsg_message( destNode, 0, 0, msg, size, SMALL_DATA_TAG, 0);  
-        if(status == GNI_RC_SUCCESS)
-        {
+        if (GNI_RC_SUCCESS == send_smsg_message(queue, destNode,  msg, size, SMALL_DATA_TAG, 0))
             CmiFree(msg);
-        }
     }
-    else
-    {
-        control_msg_tmp =  construct_control_msg(size, msg);
+    else if (size < BIG_MSG) {
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0);
+    }
+    else {
 #if     USE_LRTS_MEMPOOL
-        if(size < BIG_MSG)
-            control_msg_tmp->seq_id = 0;
-        else
-        {
-            control_msg_tmp->seq_id = 1;
-        }
+        CmiSetMsgSeq(msg, 0);
+        control_msg_tmp =  construct_control_msg(size, msg, 1);
+        send_large_messages(queue, destNode, control_msg_tmp, 0);
 #else
-        control_msg_tmp->seq_id = 0;
+        control_msg_tmp =  construct_control_msg(size, msg, 0);
+        send_large_messages(queue, destNode, control_msg_tmp, 0);
 #endif
-        send_large_messages(destNode, control_msg_tmp);
     }
 #endif
     return 0;
 }
 
-/* Idle-state related functions: called in non-smp mode */
-void CmiNotifyIdleForGemini(void) {
-    AdvanceCommunication();
-    //LrtsAdvanceCommunication();
+static void    PumpDatagramConnection();
+static void registerUserTraceEvents() {   /* registers the machine layer's user trace events; empty unless the trace macros below are enabled */
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
+    traceRegisterUserEvent("setting up connections", 10);   /* second argument is the numeric event id */
+    traceRegisterUserEvent("Receiving small msgs", 20);
+    traceRegisterUserEvent("Release local transaction", 30);
+    traceRegisterUserEvent("Sending buffered small msgs", 40);
+    traceRegisterUserEvent("Sending buffered rdma msgs", 50);
+#endif
+}
+/* Confirm a suspected hang: allgather every rank's SMSG send+recv count; when the global sum stays unchanged (count reaches 2), rank 0 prints advice and aborts. */
+static void ProcessDeadlock()
+{
+    static CmiUInt8 *ptr = NULL;   /* allgather receive buffer, allocated on first use and kept */
+    static CmiUInt8  last = 0, mysum, sum;   /* `last` is the global sum seen by the previous call */
+    static int count = 0;   /* consecutive calls without a change in the global sum */
+    gni_return_t status;
+    int i;
+
+//printf("[%d] comm thread detected hang %d %d %d\n", CmiMyPe(), smsg_send_count, smsg_recv_count, count);
+//sweep_mempool(CpvAccess(mempool));
+    if (ptr == NULL) ptr = (CmiUInt8*)malloc(mysize * sizeof(CmiUInt8));
+    mysum = smsg_send_count + smsg_recv_count;
+    MACHSTATE5(9,"Before allgather Progress Deadlock (%d,%d)  (%d,%d)(%d)\n", buffered_send_msg, register_memory_size, last, sum, count); 
+    status = PMI_Allgather(&mysum,ptr,sizeof(CmiUInt8));   /* NOTE(review): a PMI return code is stored in a gni_return_t and checked with GNI_RC_CHECK */
+    GNI_RC_CHECK("PMI_Allgather", status);
+    sum = 0;
+    for (i=0; i<mysize; i++)  sum+= ptr[i];
+    if (last == 0 || sum == last)   /* the first call (last == 0) also counts as "no change" */
+        count++;
+    else
+        count = 0;
+    last = sum;
+    MACHSTATE5(9,"Progress Deadlock (%d,%d)  (%d,%d)(%d)\n", buffered_send_msg, register_memory_size, last, sum, count); 
+    if (count == 2) { 
+        /* detected twice, it is a real deadlock */
+        if (myrank == 0)  {   /* only rank 0 reports and aborts */
+            CmiPrintf("Charm++> Network progress engine appears to have stalled, possibly because registered memory limits have been exceeded or are too low.  Try adjusting environment variables CHARM_UGNI_MEMPOOL_MAX and CHARM_UGNI_SEND_MAX (current limits are %lld and %lld).\n", MAX_REG_MEM, MAX_BUFF_SEND);
+            CmiAbort("Fatal> Deadlock detected.");
+        }
+
+    }
+    _detected_hang = 0;   /* clear the flag raised by CheckProgress */
+}
+/* Compare the SMSG send/recv counters with the last recorded snapshot; if neither moved, raise _detected_hang (non-SMP builds run ProcessDeadlock at once). */
+static void CheckProgress()
+{
+    if (smsg_send_count == last_smsg_send_count &&
+        smsg_recv_count == last_smsg_recv_count ) 
+    {
+        _detected_hang = 1;
+#if !CMK_SMP
+        if (_detected_hang) ProcessDeadlock();   /* in SMP builds only the flag is set here */
+#endif
+
+    }
+    else {
+        //MACHSTATE5(9,"--Check Progress %d(%d, %d) (%d,%d)\n", mycount, buffered_send_msg, register_memory_size, smsg_send_count, smsg_recv_count); 
+        last_smsg_send_count = smsg_send_count;   /* traffic moved: record a new snapshot */
+        last_smsg_recv_count = smsg_recv_count;
+        _detected_hang = 0;
+    }
+}
+/* On rank 0 of each node, set MAX_REG_MEM to _totalmem divided by the processes on this physical node, and MAX_BUFF_SEND to half of that. */
+static void set_limit()
+{
+    //if (!user_set_flag && CmiMyRank() == 0) {
+    if (CmiMyRank() == 0) {
+        int mynode = CmiPhysicalNodeID(CmiMyPe());
+        int numpes = CmiNumPesOnPhysicalNode(mynode);
+        int numprocesses = numpes / CmiMyNodeSize();   /* NOTE(review): would be 0 if numpes < CmiMyNodeSize(), making the next line divide by zero */
+        MAX_REG_MEM  = _totalmem / numprocesses;
+        MAX_BUFF_SEND = MAX_REG_MEM / 2;
+        if (CmiMyPe() == 0)   /* only PE 0 prints the chosen limits */
+           printf("mem_max = %lld, send_max =%lld\n", MAX_REG_MEM, MAX_BUFF_SEND);
+    }
+}
 
 void LrtsPostCommonInit(int everReturn)
 {
+#if CMK_DIRECT
+    CmiDirectInit();
+#endif
+#if CMI_MPI_TRACE_USEREVENTS && CMK_TRACE_ENABLED && !CMK_TRACE_IN_CHARM
+    CpvInitialize(double, projTraceStart);
+    /* only PE 0 needs to care about registration (to generate sts file). */
+    if (CmiMyPe() == 0) {
+        registerMachineUserEventsFunction(&registerUserTraceEvents);
+    }
+#endif
+
 #if CMK_SMP
     CmiIdleState *s=CmiNotifyGetState();
     CcdCallOnConditionKeep(CcdPROCESSOR_BEGIN_IDLE,(CcdVoidFn)CmiNotifyBeginIdle,(void *)s);
     CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyStillIdle,(void *)s);
 #else
-    CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyIdleForGemini,NULL);
+    CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyStillIdle,NULL);
+    if (useDynamicSMSG)
+    CcdCallOnConditionKeep(CcdPERIODIC_10ms, (CcdVoidFn) PumpDatagramConnection, NULL);
 #endif
 
+    if (_checkProgress)
+#if CMK_SMP
+    if (CmiMyRank() == 0)
+#endif
+    CcdCallOnConditionKeep(CcdPERIODIC_2minute, (CcdVoidFn) CheckProgress, NULL);
+#if !LARGEPAGE
+    CcdCallOnCondition(CcdTOPOLOGY_AVAIL, (CcdVoidFn)set_limit, NULL);
+#endif
 }
 
 /* this is called by worker thread */
 void LrtsPostNonLocal(){
 #if CMK_SMP
-#if !COMM_THREAD_SEND
+#if MULTI_THREAD_SEND
     if(mysize == 1) return;
     PumpLocalRdmaTransactions();
-    SendBufferMsg();
+#if CMK_USE_OOB
+    if (SendBufferMsg(&smsg_oob_queue) == 1)
+#endif
+    SendBufferMsg(&smsg_queue);
+#if PIGGYBACK_ACK
+    SendBufferMsg(&smsg_ack_queue);
+#endif
     SendRdmaMsg();
 #endif
 #endif
 }
+
+/* useDynamicSMSG */
+static void    PumpDatagramConnection()
+{
+    uint32_t          remote_address;
+    uint32_t          remote_id;
+    gni_return_t status;
+    gni_post_state_t  post_state;
+    uint64_t          datagram_id;
+    int i;
+
+   while ((status = GNI_PostDataProbeById(nic_hndl, &datagram_id)) == GNI_RC_SUCCESS)
+   {
+       if (datagram_id >= mysize) {           /* bound endpoint */
+           int pe = datagram_id - mysize;
+           CMI_GNI_LOCK
+           status = GNI_EpPostDataTestById( ep_hndl_array[pe], datagram_id, &post_state, &remote_address, &remote_id);
+           CMI_GNI_UNLOCK
+           if(status == GNI_RC_SUCCESS && post_state == GNI_POST_COMPLETED)
+           {
+               CmiAssert(remote_id == pe);
+               status = GNI_SmsgInit(ep_hndl_array[pe], smsg_attr_vector_local[pe], smsg_attr_vector_remote[pe]);
+               GNI_RC_CHECK("Dynamic SMSG Init", status);
+#if PRINT_SYH
+               printf("++ Dynamic SMSG&n