Added Prof. Emmanuel Jeannot's TreeMatch load balancer.
author    Gengbin Zheng <gzheng@illinois.edu>
Mon, 28 Mar 2011 20:04:44 +0000 (15:04 -0500)
committer Gengbin Zheng <gzheng@illinois.edu>
Mon, 28 Mar 2011 20:04:44 +0000 (15:04 -0500)
17 files changed:
src/ck-ldb/CommonLBs.ci
src/ck-ldb/EveryLB.ci
src/ck-ldb/Make.lb
src/ck-ldb/Makefile_lb.sh
src/ck-ldb/TreeMatchLB.C [new file with mode: 0644]
src/ck-ldb/TreeMatchLB.ci [new file with mode: 0644]
src/ck-ldb/TreeMatchLB.h [new file with mode: 0644]
src/ck-ldb/treebucket.c [new file with mode: 0644]
src/ck-ldb/treebucket.h [new file with mode: 0644]
src/ck-ldb/treemapping.c [new file with mode: 0644]
src/ck-ldb/treemapping.h [new file with mode: 0644]
src/ck-ldb/treematch.c [new file with mode: 0644]
src/ck-ldb/treematch.h [new file with mode: 0644]
src/ck-ldb/treetimings.c [new file with mode: 0644]
src/ck-ldb/treetimings.h [new file with mode: 0644]
src/scripts/Make.cidepends
src/scripts/Make.depends

diff --git a/src/ck-ldb/CommonLBs.ci b/src/ck-ldb/CommonLBs.ci
index 238e5d7005bbf311fba8cab25f9af84b4ca8a61d..dc9bd6e7f86a66300967575e2b8098c43af14c17 100644 (file)
@@ -16,6 +16,7 @@ module CommonLBs {
   extern module RefineLB;
   extern module RefineCommLB;
   extern module RotateLB;
+  extern module TreeMatchLB;
 
   initnode void initCommonLBs(void);
 };
diff --git a/src/ck-ldb/EveryLB.ci b/src/ck-ldb/EveryLB.ci
index e7d91b26dc55fec96f731daa470e4d70c01aea56..5bb718348fed287f03a0418db732dbebb45f31f4 100644 (file)
@@ -16,6 +16,7 @@ module EveryLB {
   extern module RefineLB;
   extern module RefineCommLB;
   extern module RotateLB;
+  extern module TreeMatchLB;
   extern module ComboCentLB;
   extern module GraphPartLB;
   extern module GraphBFTLB;
diff --git a/src/ck-ldb/Make.lb b/src/ck-ldb/Make.lb
index a4b101a363c3db0d864def71695f998829000398..184c5ac2e4a0abbf0c19bd462e678d3f0fdcfe14 100644 (file)
@@ -15,6 +15,7 @@ ALL_LDBS=\
    $(L)/libmoduleRefineLB.a \
    $(L)/libmoduleRefineCommLB.a \
    $(L)/libmoduleRotateLB.a \
+   $(L)/libmoduleTreeMatchLB.a \
    $(L)/libmoduleComboCentLB.a \
    $(L)/libmoduleGraphPartLB.a \
    $(L)/libmoduleGraphBFTLB.a \
@@ -122,6 +123,12 @@ $(L)/libmoduleRotateLB.a: RotateLB.o
 LBHEADERS += RotateLB.h RotateLB.decl.h
 
 
+$(L)/libmoduleTreeMatchLB.a: TreeMatchLB.o treematch.o treebucket.o treetimings.o treemapping.o
+       $(CHARMC) -o $(L)/libmoduleTreeMatchLB.a TreeMatchLB.o treematch.o treebucket.o treetimings.o treemapping.o
+       
+LBHEADERS += TreeMatchLB.h TreeMatchLB.decl.h
+
+
 $(L)/libmoduleComboCentLB.a: ComboCentLB.o 
        $(CHARMC) -o $(L)/libmoduleComboCentLB.a ComboCentLB.o 
        
@@ -253,6 +260,7 @@ ALL_LB_OBJS=EveryLB.o \
     RefineLB.o \
     RefineCommLB.o \
     RotateLB.o \
+    TreeMatchLB.o \
     ComboCentLB.o \
     GraphPartLB.o \
     GraphBFTLB.o \
@@ -272,7 +280,11 @@ ALL_LB_OBJS=EveryLB.o \
     ScotchLB.o \
     TeamLB.o \
     WSLB.o \
-    manager.o
+    manager.o  \
+    treematch.o  \
+    treetimings.o  \
+    treebucket.o \
+    treemapping.o
 # EveryLB dependencies
 EVERYLB_DEPS=EveryLB.o \
     BlockLB.o \
@@ -290,6 +302,7 @@ EVERYLB_DEPS=EveryLB.o \
     RefineLB.o \
     RefineCommLB.o \
     RotateLB.o \
+    TreeMatchLB.o \
     ComboCentLB.o \
     GraphPartLB.o \
     GraphBFTLB.o \
@@ -304,7 +317,11 @@ EVERYLB_DEPS=EveryLB.o \
     RefineTopoLB.o \
     TopoCentLB.o \
     TopoLB.o \
-    manager.o
+    manager.o \
+    treematch.o  \
+    treetimings.o  \
+    treebucket.o \
+    treemapping.o
 # CommonLBs dependencies
 COMMONLBS_DEPS=CommonLBs.o \
     BlockLB.o \
@@ -322,7 +339,12 @@ COMMONLBS_DEPS=CommonLBs.o \
     RefineLB.o \
     RefineCommLB.o \
     RotateLB.o \
-    manager.o
+    TreeMatchLB.o \
+    manager.o \
+    treematch.o  \
+    treetimings.o  \
+    treebucket.o \
+    treemapping.o
 
 $(L)/libmoduleEveryLB.a: $(EVERYLB_DEPS)
        $(CHARMC) -o $(L)/libmoduleEveryLB.a $(EVERYLB_DEPS)
diff --git a/src/ck-ldb/Makefile_lb.sh b/src/ck-ldb/Makefile_lb.sh
index 97e94517a386d8c47356d0d2e5116a9f10786339..9dd9c7f17c417f87ac28e1ea9721b0d4f4d89f39 100755 (executable)
@@ -1,6 +1,6 @@
 #!/bin/sh
 UNCOMMON_LDBS="TempAwareGreedyLB MetisLB ScotchLB TeamLB WSLB"
-COMMON_LDBS="BlockLB CommLB DummyLB GreedyAgentLB GreedyCommLB GreedyLB NeighborCommLB NeighborLB OrbLB PhasebyArrayLB RandCentLB RecBipartLB RefineLB RefineCommLB RotateLB"
+COMMON_LDBS="BlockLB CommLB DummyLB GreedyAgentLB GreedyCommLB GreedyLB NeighborCommLB NeighborLB OrbLB PhasebyArrayLB RandCentLB RecBipartLB RefineLB RefineCommLB RotateLB TreeMatchLB"
 OTHER_LDBS="ComboCentLB GraphPartLB GraphBFTLB GridCommLB GridCommRefineLB GridHybridLB GridHybridSeedLB GridMetisLB HbmLB HybridLB RefineKLB RefineTopoLB TopoCentLB TopoLB"
 ALL_LDBS="$COMMON_LDBS $OTHER_LDBS"
 
@@ -25,6 +25,7 @@ do
         [ $bal = 'GridCommRefineLB' ] && manager="manager.o"
         [ $bal = 'GridHybridLB' ] && manager="manager.o"
         [ $bal = 'GridHybridSeedLB' ] && manager="manager.o"
+        [ $bal = 'TreeMatchLB' ] && manager="treematch.o treebucket.o treetimings.o treemapping.o"
        cat >> $out << EOB 
 
 \$(L)/libmodule$bal.a: $bal.o $manager
@@ -64,7 +65,11 @@ for bal in $ALL_LDBS $UNCOMMON_LDBS
 do
        echo "    $bal.o \\" >> $out
 done
-echo "    manager.o" >> $out
+echo "    manager.o  \\" >> $out
+echo "    treematch.o  \\" >> $out
+echo "    treetimings.o  \\" >> $out
+echo "    treebucket.o \\" >> $out
+echo "    treemapping.o" >> $out
 
 echo "# EveryLB dependecies" >> $out
 echo "EVERYLB_DEPS=EveryLB.o \\" >> $out
@@ -72,7 +77,11 @@ for bal in $ALL_LDBS
 do
        echo "    $bal.o \\" >> $out
 done
-echo "    manager.o" >> $out
+echo "    manager.o \\" >> $out
+echo "    treematch.o  \\" >> $out
+echo "    treetimings.o  \\" >> $out
+echo "    treebucket.o \\" >> $out
+echo "    treemapping.o" >> $out
 
 echo "# CommonLBs dependencies" >> $out
 echo "COMMONLBS_DEPS=CommonLBs.o \\" >> $out
@@ -80,7 +89,11 @@ for bal in $COMMON_LDBS
 do
        echo "    $bal.o \\" >> $out
 done
-echo "    manager.o" >> $out
+echo "    manager.o" \\>> $out
+echo "    treematch.o  \\" >> $out
+echo "    treetimings.o  \\" >> $out
+echo "    treebucket.o \\" >> $out
+echo "    treemapping.o" >> $out
 
 cat >> $out <<EOB
 
diff --git a/src/ck-ldb/TreeMatchLB.C b/src/ck-ldb/TreeMatchLB.C
new file mode 100644 (file)
index 0000000..4925dfc
--- /dev/null
@@ -0,0 +1,73 @@
+/*****************************************************************************
+ * $Source$
+ * $Author$
+ * $Date$
+ * $Revision$
+ *****************************************************************************/
+
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+#include <charm++.h>
+extern "C"{
+#include "treemapping.h"
+};
+#include "TreeMatchLB.h"
+
+CreateLBFunc_Def(TreeMatchLB, "TreeMatch load balancer: assigns objects to processors according to the object communication graph")
+
+#include "TreeMatchLB.def.h"
+
+TreeMatchLB::TreeMatchLB(const CkLBOptions &opt): CentralLB(opt)
+{
+  lbname = (char*)"TreeMatchLB";
+  if (CkMyPe() == 0)
+    CkPrintf("[%d] TreeMatchLB created\n",CkMyPe());
+}
+
+CmiBool TreeMatchLB::QueryBalanceNow(int _step)
+{
+  return CmiTrue;
+}
+
+
+void TreeMatchLB::work(BaseLB::LDStats* stats)
+{
+  int nb_obj,nb_proc;
+  double **comm_mat;
+  int i;
+  int *permut_vec;
+  int count = stats->nprocs();
+
+  nb_proc=count;
+  nb_obj=stats->n_objs;
+  
+  stats->makeCommHash();
+  // allocate communication matrix
+  comm_mat=(double**)malloc(sizeof(double*)*nb_obj);
+  for(i=0;i<nb_obj;i++){
+    comm_mat[i]=(double*)calloc(nb_obj,sizeof(double));
+  }
+
+  for(i=0;i<stats->n_comm;i++){
+    LDCommData &commData = stats->commData[i];
+    if((!commData.from_proc())&&(commData.recv_type()==LD_OBJ_MSG)){
+      int from = stats->getHash(commData.sender);
+      int to = stats->getHash(commData.receiver.get_destObj());
+      comm_mat[from][to]=commData.bytes;
+      comm_mat[to][from]=commData.bytes;
+    }
+  }
+  
+  TreeMatchMapping(nb_obj,nb_proc,comm_mat,stats->to_proc.getVec());
+
+  // free communication matrix;
+  for(i=0;i<nb_obj;i++){
+      free(comm_mat[i]);
+  }
+  free(comm_mat);
+}
+
+/*@}*/
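
For reference, the strategy above boils down to one call: build a symmetric object-to-object communication matrix from the LB stats and hand it to TreeMatchMapping(), which writes one processor index per object into the output vector. A minimal standalone sketch of that call outside Charm++, assuming only the TreeMatchMapping() signature declared in treemapping.h further down in this patch (link against treemapping.o, treematch.o, treebucket.o and treetimings.o):

#include <stdio.h>
#include <stdlib.h>
#include "treemapping.h"

int main(void){
  int nb_obj=4, nb_proc=2, i, j;
  int *sol=(int*)malloc(nb_obj*sizeof(int));
  /* symmetric communication volumes: objects 0-1 and 2-3 exchange the most data */
  double vol[4][4]={{0,100,1,1},{100,0,1,1},{1,1,0,100},{1,1,100,0}};
  double **comm=(double**)malloc(nb_obj*sizeof(double*));
  for(i=0;i<nb_obj;i++){
    comm[i]=(double*)malloc(nb_obj*sizeof(double));
    for(j=0;j<nb_obj;j++) comm[i][j]=vol[i][j];
  }
  TreeMatchMapping(nb_obj,nb_proc,comm,sol);
  for(i=0;i<nb_obj;i++)
    printf("object %d -> processor %d\n",i,sol[i]); /* heavily communicating pairs should end up together */
  for(i=0;i<nb_obj;i++) free(comm[i]);
  free(comm); free(sol);
  return 0;
}
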
diff --git a/src/ck-ldb/TreeMatchLB.ci b/src/ck-ldb/TreeMatchLB.ci
new file mode 100644 (file)
index 0000000..7b3bcf2
--- /dev/null
@@ -0,0 +1,9 @@
+module TreeMatchLB {
+
+extern module CentralLB;
+initnode void lbinit(void);
+group [migratable] TreeMatchLB : CentralLB {
+  entry void TreeMatchLB(const CkLBOptions &);  
+};
+
+};
diff --git a/src/ck-ldb/TreeMatchLB.h b/src/ck-ldb/TreeMatchLB.h
new file mode 100644 (file)
index 0000000..ac54b01
--- /dev/null
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * $Source$
+ * $Author$
+ * $Date$
+ * $Revision$
+ *****************************************************************************/
+
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+#ifndef _TREEMATCHLB_H_
+#define _TREEMATCHLB_H_
+
+#include "CentralLB.h"
+#include "TreeMatchLB.decl.h"
+
+void CreateTreeMatchLB();
+
+class TreeMatchLB : public CentralLB {
+public:
+  TreeMatchLB(const CkLBOptions &);
+  TreeMatchLB(CkMigrateMessage *m):CentralLB(m) {}
+  void work(BaseLB::LDStats* stats);
+private:
+  CmiBool QueryBalanceNow(int step);
+};
+
+#endif /* _TREEMATCHLB_H_ */
+
+/*@}*/
diff --git a/src/ck-ldb/treebucket.c b/src/ck-ldb/treebucket.c
new file mode 100644 (file)
index 0000000..4884422
--- /dev/null
@@ -0,0 +1,483 @@
+#include <stdio.h>
+#include <float.h>
+#include <math.h>
+#include <assert.h>
+#include "treematch.h"
+#include "treebucket.h"
+#include "treetimings.h" 
+
+#undef DEBUG
+bucket_list_t global_bl; 
+
+int tab_cmp(const void* x1,const void* x2){ 
+  int *e1,*e2,i1,i2,j1,j2;
+  double **tab;
+  bucket_list_t bl;
+
+  bl=global_bl;
+
+  e1=((int *)x1);
+  e2=((int *)x2);
+
+  
+  tab=bl->tab;
+
+  i1=e1[0];
+  j1=e1[1];
+  i2=e2[0];
+  j2=e2[1];
+
+  return tab[i1][j1]>tab[i2][j2]?-1:1;
+}
+
+
+int old_bucket_id(int i,int j,bucket_list_t bucket_list){
+  double *pivot,val;
+  int n,sup,inf,p;
+  pivot=bucket_list->pivot;
+  n=bucket_list->nb_buckets;  
+  val=bucket_list->tab[i][j];
+  
+  inf=-1;
+  sup=n;
+
+  while(sup-inf>1){
+    p=(sup+inf)/2;
+    //printf("%f [%d,%d,%d]=%f\n",val,inf,p,sup,pivot[p]);
+    if(val<pivot[p]){
+      inf=p;
+      if(inf==sup)
+       inf--;
+    }else{
+      sup=p;
+      if(sup==inf)
+       sup++;
+    }
+  }
+  //exit(-1);
+  return sup;
+}
+
+
+int bucket_id(int i,int j,bucket_list_t bucket_list){
+  double *pivot_tree,val;
+  int p,k;
+  pivot_tree=bucket_list->pivot_tree;
+  val=bucket_list->tab[i][j];
+
+
+
+  p=1;
+  for(k=0;k<bucket_list->max_depth;k++){
+    if(val>pivot_tree[p])
+      p=p*2;
+    else
+      p=p*2+1;
+  }
+
+  return  (int)pivot_tree[p];
+}
+  
+
+
+void  display_bucket(bucket_t *b){
+  printf("\tb.bucket=%p\n",b->bucket);
+  printf("\tb.bucket_len=%d\n",(int)b->bucket_len);
+  printf("\tb.nb_elem=%d\n",(int)b->nb_elem);
+
+}
+
+void check_bucket(bucket_t *b,double **tab,double inf, double sup,int N){
+  int k,i,j;
+  for(k=0;k<b->nb_elem;k++){
+    i=b->bucket[k].i;
+    j=b->bucket[k].j;
+    if((tab[i][j]<inf)||(tab[i][j]>sup)){
+      printf("[%d] (%d,%d):%f not in [%f,%f]\n",k,i,j,tab[i][j],inf,sup);
+      exit(-1);
+    }
+  }
+}
+
+void  display_bucket_list(bucket_list_t bucket_list){
+  int i;
+  double inf,sup;
+
+  for(i=0;i<bucket_list->nb_buckets-1;i++){
+    printf("pivot[%d]=%f\n",i,bucket_list->pivot[i]);
+  }
+  printf("\n");
+
+  for(i=0;i<bucket_list->nb_buckets;i++){
+    /* pivot[] has nb_buckets-1 entries: handle both ends without indexing out of bounds */
+    inf=(i==bucket_list->nb_buckets-1)?0:bucket_list->pivot[i];
+    sup=(i==0)?DBL_MAX:bucket_list->pivot[i-1];
+    printf("Bucket %d:\n",i);
+    display_bucket(bucket_list->bucket_tab[i]);
+    printf("\n");
+    check_bucket(bucket_list->bucket_tab[i],bucket_list->tab,inf,sup,bucket_list->N);
+  }
+  
+}
+
+void add_to_bucket(int id,int i,int j,bucket_list_t bucket_list){
+  bucket_t *bucket;
+  int N,n,size;
+
+  bucket=bucket_list->bucket_tab[id];
+  //display_bucket(bucket);
+  
+  if(bucket->bucket_len==bucket->nb_elem){
+    N=bucket_list->N;
+    n=bucket_list->nb_buckets;  
+    size=N*N/n;
+    //display_bucket(bucket);
+    bucket->bucket=(coord*)realloc(bucket->bucket,sizeof(coord)*(size+bucket->bucket_len));
+    bucket->bucket_len+=size;
+#ifdef DEBUG
+    printf("malloc/realloc: %d\n",id);
+    printf("(%d,%d)\n",i,j);
+    display_bucket(bucket);
+    printf("\n");
+#endif
+  }
+  
+ bucket->bucket[bucket->nb_elem].i=i;
+ bucket->bucket[bucket->nb_elem].j=j;
+ bucket->nb_elem++;
+
+  //printf("\n");
+  //exit(-1);
+}
+
+void dfs(int i,int inf,int sup,double *pivot,double *pivot_tree,int depth,int max_depth){
+  int p;
+  if(depth==max_depth)
+    return;
+
+  p=(inf+sup)/2;
+  pivot_tree[i]=pivot[p-1];
+
+  dfs(2*i,inf,p-1,pivot,pivot_tree,depth+1,max_depth);
+  dfs(2*i+1,p+1,sup,pivot,pivot_tree,depth+1,max_depth);
+}
+
+void  built_pivot_tree(bucket_list_t bucket_list){
+  double *pivot_tree,*pivot;
+  int n,i,k;
+  pivot=bucket_list->pivot;
+  n=bucket_list->nb_buckets;
+  pivot_tree=(double*)malloc(sizeof(double)*2*n);
+  bucket_list->max_depth=(int)log2(n);
+
+  dfs(1,1,n-1,pivot,pivot_tree,0,bucket_list->max_depth);
+
+  k=0;
+  for(i=n;i<2*n;i++)
+    pivot_tree[i]=k++;
+
+  bucket_list->pivot_tree=pivot_tree;  
+  /*
+  for(i=0;i<2*n;i++)
+    printf("%d:%f\t",i,pivot_tree[i]);
+  printf("\n");
+  */
+}
+
+void fill_buckets(bucket_list_t bucket_list){
+  int N,i,j,id;
+
+  N=bucket_list->N;
+
+  for(i=0;i<N;i++){
+    for(j=i+1;j<N;j++){
+      id=bucket_id(i,j,bucket_list);
+      add_to_bucket(id,i,j,bucket_list);
+    }
+  }
+  
+  
+}
+
+int is_power_of_2(int val){
+  int n=1;
+  do{
+    if(n==val)
+      return 1;
+    n<<=1;
+  }while(n>0);
+  return 0;
+}
+
+
+void partial_sort(bucket_list_t *bl,double **tab,int N,int nb_buckets){
+  int *sample;
+  int i,j,k,n;
+  double *pivot;
+  bucket_list_t bucket_list;
+
+
+  if(!is_power_of_2(nb_buckets)){
+    fprintf(stderr,"Error! Paramater nb_buckets is: %d and should be a power of 2\n",nb_buckets);
+    exit(-1);
+  }
+
+
+  bucket_list=(bucket_list_t)malloc(sizeof(_bucket_list_t));
+
+  bucket_list->tab=tab;
+  bucket_list->N=N;
+
+
+  n=pow(nb_buckets,2);
+  n=N;
+  sample=(int*)malloc(2*sizeof(int)*n);
+  
+  for(k=0;k<n;k++){
+    i=random()%(N-2)+1;
+    if(i==N-2)
+      j=N-1;
+    else
+      j=random()%(N-i-2)+i+1;
+    assert(i!=j);
+    assert(i<j);
+    assert(i<N);
+    assert(j<N);
+    sample[2*k]=i;
+    sample[2*k+1]=j;
+  }
+  
+  global_bl=bucket_list;
+  qsort(sample,n,2*sizeof(int),tab_cmp);
+
+  /*  for(k=0;k<N;k++){
+    i=sample[k]/N;
+    j=sample[k]%N;
+    printf("%f\n",tab[i][j]);
+  }
+  */
+  pivot=(double*)malloc(sizeof(double)*(nb_buckets-1));
+  int id=2;
+  for(k=1;k<nb_buckets;k++){
+    i=sample[2*(id-1)];
+    j=sample[2*(id-1)+1];
+    id*=2;
+    
+
+    /*    i=sample[k*N/nb_buckets]/N;
+         j=sample[k*N/nb_buckets]%N;*/
+    pivot[k-1]=tab[i][j];
+    //printf("pivot[%d]=%f\n",k-1,tab[i][j]);
+  }
+
+  bucket_list->pivot=pivot;
+  bucket_list->nb_buckets=nb_buckets;
+  built_pivot_tree(bucket_list);
+  
+  bucket_list->bucket_tab=(bucket_t**)malloc(nb_buckets*sizeof(bucket_t*));
+  for(i=0;i<nb_buckets;i++){
+    bucket_list->bucket_tab[i]=(bucket_t*)calloc(1,sizeof(bucket_t));
+  }
+
+  fill_buckets(bucket_list);
+  
+  //display_bucket_list(bucket_list);
+
+  bucket_list->cur_bucket=0;
+  bucket_list->bucket_indice=0;
+  
+  free(sample);
+
+  *bl=bucket_list;
+}
+
+void next_bucket_elem(bucket_list_t bucket_list,int *i,int *j){
+  int N;
+  bucket_t *bucket=bucket_list->bucket_tab[bucket_list->cur_bucket];
+
+
+  while(bucket->nb_elem<=bucket_list->bucket_indice){
+    bucket_list->bucket_indice=0;
+    bucket=bucket_list->bucket_tab[bucket_list->cur_bucket++];
+
+    printf("### From bucket %d to bucket %d\n",bucket_list->cur_bucket-1,bucket_list->cur_bucket);
+  }
+
+  if(!bucket->sorted){
+    global_bl=bucket_list;
+    qsort(bucket->bucket,bucket->nb_elem,2*sizeof(int),tab_cmp);
+    bucket->sorted=1;
+  }
+
+
+  
+  N=bucket_list->N;
+
+  *i=bucket->bucket[bucket_list->bucket_indice].i;
+  *j=bucket->bucket[bucket_list->bucket_indice].j;
+  bucket_list->bucket_indice++;
+}
+
+
+int add_edge_3(double **tab,tree_t *tab_node, tree_t *parent,int i,int j,int N,int *nb_groups){
+  //printf("%d <-> %d ?\n",tab_node[i].id,tab_node[j].id);
+
+  if((!tab_node[i].parent) && (!tab_node[j].parent)){
+    if(parent){
+      parent->child[0]=&tab_node[i];
+      parent->child[1]=&tab_node[j];
+      tab_node[i].parent=parent;
+      tab_node[j].parent=parent;
+#ifdef DEBUG
+      printf("%d: %d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id);
+#endif
+      return 1;
+    } 
+    return 0;
+  }
+  
+  if(tab_node[i].parent && (!tab_node[j].parent)){
+    parent=tab_node[i].parent;
+    if(!parent->child[2]){
+      parent->child[2]=&tab_node[j];
+      tab_node[j].parent=parent;
+#ifdef DEBUG
+      printf("%d: %d-%d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id,parent->child[2]->id);
+#endif
+      (*nb_groups)++;
+    }
+    return 0;
+  }
+
+  if(tab_node[j].parent && (!tab_node[i].parent)){
+    parent=tab_node[j].parent;
+    if(!parent->child[2]){
+      parent->child[2]=&tab_node[i];
+      tab_node[i].parent=parent;
+#ifdef DEBUG
+      printf("%d: %d-%d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id,parent->child[2]->id);
+#endif      
+      (*nb_groups)++;
+    }
+    return 0;
+  }
+
+  return 0;
+}
+
+int try_add_edge(double **tab,tree_t *tab_node, tree_t *parent,int arity,int i,int j,int N,int *nb_groups){
+
+  assert(i!=j);
+
+  
+  switch(arity){
+  case 2:
+    if(tab_node[i].parent)
+      return 0;
+    if(tab_node[j].parent)
+      return 0;
+
+    parent->child[0]=&tab_node[i];
+    parent->child[1]=&tab_node[j];
+    tab_node[i].parent=parent;
+    tab_node[j].parent=parent;
+    
+    (*nb_groups)++;
+
+    return 1;
+  case 3:
+    return add_edge_3(tab,tab_node,parent,i,j,N,nb_groups);
+  default:
+    fprintf(stderr,"Cannot handle arity %d\n",parent->arity);
+    exit(-1);
+  }
+}
+
+
+void free_bucket(bucket_t *bucket){
+  free(bucket->bucket);
+  free(bucket);
+
+}
+
+void free_tab_bucket(bucket_t **bucket_tab,int N){
+  int i;
+  for(i=0;i<N;i++){
+    free_bucket(bucket_tab[i]);
+  }
+  free(bucket_tab);
+}
+
+
+void free_bucket_list(bucket_list_t bucket_list){
+
+  // Do not free the tab field: it is used elsewhere
+
+  free_tab_bucket(bucket_list->bucket_tab,bucket_list->nb_buckets);
+  free(bucket_list->pivot);
+  free(bucket_list->pivot_tree);
+  free(bucket_list);
+}
+
+void bucket_grouping(double **tab,tree_t *tab_node, tree_t *new_tab_node, int arity,int N, int M,long int k){
+  bucket_list_t bucket_list;
+  double duration;
+  int l,i,j,nb_groups;
+  double val=0;
+  TIC;
+  partial_sort(&bucket_list,tab,N,8);
+  duration=TOC;
+  printf("Partial sorting=%fs\n",duration);  
+  TIC;
+  l=0;
+  i=0;
+  nb_groups=0;
+  while(l<M){
+    next_bucket_elem(bucket_list,&i,&j);
+    if(try_add_edge(tab,tab_node,&new_tab_node[l],arity,i,j,N,&nb_groups)){
+      l++;
+    }
+  }
+
+#ifdef DEBUG
+  printf("l=%d,nb_groups=%d\n",l,nb_groups);
+#endif
+
+  while(nb_groups<M){
+    next_bucket_elem(bucket_list,&i,&j);
+    try_add_edge(tab,tab_node,NULL,arity,i,j,N,&nb_groups);
+  }
+
+#ifdef DEBUG
+  printf("l=%d,nb_groups=%d\n",l,nb_groups);
+#endif
+
+  for(l=0;l<M;l++){
+    update_val(tab,&new_tab_node[l],N);      
+    val+=new_tab_node[l].val;
+  }
+
+
+      
+
+
+  duration=TOC;
+  printf("Grouping =%fs\n",duration);  
+
+  printf("Bucket: %d, indice:%d\n",bucket_list->cur_bucket,bucket_list->bucket_indice);
+
+  printf("val=%f\n",val);
+  free_bucket_list(bucket_list);
+
+  //  exit(-1);
+
+  //  display_grouping(new_tab_node,M,arity,val);
+  
+
+}
+
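
The point of partial_sort()/built_pivot_tree()/bucket_id() above is to classify the roughly N*N/2 matrix entries into a few buckets ordered by decreasing value without fully sorting them: sampled pivots are laid out as an implicit binary tree (root at index 1, children of p at 2p and 2p+1, leaves at indices nb_buckets..2*nb_buckets-1 holding the bucket ids), and each entry is routed by a max_depth-step descent. A self-contained sketch of that descent with hard-coded pivots, independent of the code above:

#include <stdio.h>

/* same descent as bucket_id(): larger values go left (2p), smaller go right (2p+1) */
static int which_bucket(double val,const double *pivot_tree,int max_depth){
  int p=1,k;
  for(k=0;k<max_depth;k++)
    p=(val>pivot_tree[p])?2*p:2*p+1;
  return (int)pivot_tree[p]; /* the leaves store the bucket index */
}

int main(void){
  /* 4 buckets built from the descending pivots 9,6,3, laid out as built_pivot_tree() would:
     index: 1    2    3    4  5  6  7
     value: 6.0  9.0  3.0  0  1  2  3   (indices 4..7 are the bucket ids) */
  double pivot_tree[8]={0,6.0,9.0,3.0,0,1,2,3};
  double samples[4]={10.0,7.0,5.0,1.0};
  int i;
  for(i=0;i<4;i++)
    printf("%.1f -> bucket %d\n",samples[i],which_bucket(samples[i],pivot_tree,2));
  /* prints buckets 0,1,2,3: bucket 0 collects the largest values, which the grouping loop consumes first */
  return 0;
}
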
diff --git a/src/ck-ldb/treebucket.h b/src/ck-ldb/treebucket.h
new file mode 100644 (file)
index 0000000..2239d38
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef __BUCKET_H__
+#define __BUCKET_H__
+
+typedef struct{
+  int i;
+  int j;
+}coord;
+
+typedef struct{
+  coord * bucket; /*store i,j*/
+  size_t bucket_len; //allocated size in the heap
+  size_t nb_elem; // number of useful elements (nb_elem should never exceed bucket_len)
+  int sorted;
+}bucket_t;
+
+typedef struct{
+  bucket_t **bucket_tab;
+  size_t nb_buckets;
+  double **tab;
+  int N;//length of tab
+  //For iterating over the buckets
+  int cur_bucket;
+  int bucket_indice;
+  double *pivot;
+  double *pivot_tree;
+  int max_depth;
+}_bucket_list_t;
+
+typedef _bucket_list_t *bucket_list_t;
+
+void bucket_grouping(double **tab,tree_t *tab_node, tree_t *new_tab_node, int arity,int N, int M,long int k);
+int try_add_edge(double **tab,tree_t *tab_node, tree_t *parent,int arity,int i,int j,int N,int *nb_groups);
+#endif
diff --git a/src/ck-ldb/treemapping.c b/src/ck-ldb/treemapping.c
new file mode 100644 (file)
index 0000000..746ae03
--- /dev/null
@@ -0,0 +1,682 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include "treetimings.h"
+#include "treematch.h"
+#include <ctype.h>
+#include <math.h>
+#include <assert.h> 
+#include "treemapping.h"
+#define TEST_ERROR(n) {if(n!=0){fprintf(stderr,"Error %d Line %d\n",n,__LINE__);exit(-1);}}
+#undef DEBUG
+
+#define LINE_SIZE 1000000
+
+typedef struct{
+  int  val;
+  long key;
+}hash_t;
+
+
+typedef struct{
+  double val;
+  int key1;
+  int key2;
+}hash2_t;
+
+
+int nb_nodes(tm_topology_t *topology){
+  return topology->nb_nodes[topology->nb_levels-1];
+}
+
+
+void free_topology(tm_topology_t *topology){
+   int i;
+  for(i=0;i<topology->nb_levels;i++)
+    free(topology->node_id[i]);
+  free(topology->node_id);
+  free(topology->nb_nodes);
+  free(topology->arity);
+  free(topology);
+}
+
+double print_sol(int N,int *Value,double **comm, double **arch){
+  double sol;
+  int i,j;
+  double a,c;
+
+  sol=0;
+  for (i=0;i<N;i++){
+    for (j=i+1;j<N;j++){
+      c=comm[i][j];
+      a=arch[Value[i]][Value[j]];
+      //printf("T_%d_%d %f/%f=%f\n",i,j,c,a,c/a);
+      sol+=c/a;
+    }
+  }
+  for (i = 0; i < N; i++) {
+    printf("%d", Value[i]);
+    if(i<N-1)
+      printf(",");
+      
+  }
+  printf(" : %g\n",sol);
+
+  return sol;
+}
+
+
+void print_1D_tab(int *tab,int N){
+  int i;
+  for (i = 0; i < N; i++) {
+    printf("%d", tab[i]);
+    if(i<N-1)
+      printf(",");
+  }
+  printf("\n");
+
+}
+
+int nb_lines(char *filename){
+  FILE *pf;
+  char  line[LINE_SIZE];
+  int N=0;
+
+  if(!(pf=fopen(filename,"r"))){
+      fprintf(stderr,"Cannot open %s\n",filename);
+      exit(-1);
+  }
+
+  while(fgets(line,LINE_SIZE,pf))
+    N++;
+
+  printf("N=%d\n",N);
+
+  fclose(pf);
+  return N;
+}
+
+void init_comm(char *filename,int N,double **comm){
+  int i,j;
+  FILE *pf;
+  char *ptr;
+  char  line[LINE_SIZE];
+
+  if(!(pf=fopen(filename,"r"))){
+    fprintf(stderr,"Cannot open %s\n",filename);
+    exit(-1);
+  }
+
+
+  j=-1;
+  i=0;
+  while(fgets(line,LINE_SIZE,pf)){
+
+  
+    char *l=line;
+    j=0;
+    //printf("%s|",line);
+    while((ptr=strsep(&l," \t"))){
+      if((ptr[0]!='\n')&&(!isspace(ptr[0]))&&(*ptr)){
+       comm[i][j]=atof(ptr);
+       //printf ("comm[%d][%d]=%f|%s|\n",i,j,comm[i][j],ptr);
+       j++;
+      }
+    }
+    if(j!=N){
+      fprintf(stderr,"Error at %d %d (%d!=%d)for %s\n",i,j,j,N,filename);
+      exit(-1);
+    }
+    i++;
+  }
+  if(i!=N){
+    fprintf(stderr,"Error at %d %d for %s\n",i,j,filename);
+    exit(-1);
+  }
+
+  /*  printf("%s:\n",filename);
+  for(i=0;i<N;i++){
+    for(j=0;j<N;j++){
+      printf("%6.1f ",comm[i][j]);
+    }
+    printf("\n");
+    } */
+
+}    
+
+int  build_comm(char *filename,double ***pcomm){
+  int i;
+  int N;
+  double **comm;
+  N=nb_lines(filename);
+  comm=(double**)malloc(N*sizeof(double*));
+  for(i=0;i<N;i++)
+    comm[i]=(double*)malloc(N*sizeof(double));
+  init_comm(filename,N,comm);
+  *pcomm=comm;
+  return N;
+}
+
+void map_Packed(tm_topology_t *topology,int N,int *Value){
+  int i,depth;
+
+  depth=topology->nb_levels-1;
+
+  for(i=0;i<N;i++){
+    //printf ("%d -> %d\n",objs[i]->os_index,i);
+    Value[i]=topology->node_id[depth][i];
+  }
+
+}
+
+void map_RR(int N,int *Value){
+  int i;
+
+  for(i=0;i<N;i++){
+    //printf ("%d -> %d\n",i,i);
+    Value[i]=i;
+  }
+
+}
+
+
+
+int hash_asc(const void* x1,const void* x2){ 
+
+  hash_t *e1,*e2;
+
+  e1=((hash_t*)x1);
+  e2=((hash_t*)x2);
+
+  
+  return e1->key<e2->key?-1:1;
+} 
+
+
+int *generate_random_sol(tm_topology_t *topology,int N,int level,int seed){
+  hash_t *hash_tab;
+  int *sol,i;
+  int *nodes_id;
+
+  nodes_id=topology->node_id[level];
+
+
+  hash_tab=(hash_t*)malloc(sizeof(hash_t)*N);
+  sol=(int*)malloc(sizeof(int)*N);
+  
+  srandom(seed);
+  
+  for(i=0;i<N;i++){
+    hash_tab[i].val=nodes_id[i];
+    hash_tab[i].key=random();
+  }
+  
+  qsort(hash_tab,N,sizeof(hash_t),hash_asc);
+  for(i=0;i<N;i++){
+    sol[i]=hash_tab[i].val;
+  }
+  free(hash_tab);
+  return sol;
+}
+
+
+double eval_sol(int *sol,int N,double **comm, double **arch){
+  double res;
+  int i,j;
+  double a,c;
+
+  res=0;
+  for (i=0;i<N;i++){
+    for (j=i+1;j<N;j++){
+      c=comm[i][j];
+      a=arch[sol[i]][sol[j]];
+      res+=c/a;
+    }
+  }
+  return res;
+}
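
eval_sol() above (and print_sol() before it) scores a placement as the sum over process pairs of the communication volume divided by the arch entry of the cores they are mapped to, so a higher arch value means a faster link and a lower cost. A small worked sketch of that cost model (not part of the commit; the matrices are invented for illustration):

#include <stdio.h>

/* same cost as eval_sol(): sum over i<j of comm[i][j]/arch[sol[i]][sol[j]] */
static double cost(int n,const int *sol,double comm[][3],double arch[][3]){
  double res=0; int i,j;
  for(i=0;i<n;i++)
    for(j=i+1;j<n;j++)
      res+=comm[i][j]/arch[sol[i]][sol[j]];
  return res;
}

int main(void){
  /* processes 0 and 1 exchange 10 units, all other pairs 1 unit */
  double comm[3][3]={{0,10,1},{10,0,1},{1,1,0}};
  /* cores 0 and 1 share a fast link (speed 10), core 2 is remote (speed 1) */
  double arch[3][3]={{0,10,1},{10,0,1},{1,1,0}};
  int good[3]={0,1,2}; /* heavy pair on the fast link: 10/10 + 1/1 + 1/1 = 3    */
  int bad[3] ={0,2,1}; /* heavy pair on the slow link: 10/1 + 1/10 + 1/1 = 11.1 */
  printf("good=%g bad=%g\n",cost(3,good,comm,arch),cost(3,bad,comm,arch));
  return 0;
}
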
+
+void exchange(int *sol,int i,int j){
+  int tmp;
+  tmp=sol[i];
+  sol[i]=sol[j];
+  sol[j]=tmp;
+}
+
+double gain_exchange(int *sol,int l,int m,double eval1,int N,double **comm, double **arch){
+  double eval2;
+  if(l==m)
+    return 0;
+  exchange(sol,l,m);
+  eval2=eval_sol(sol,N,comm,arch);
+  exchange(sol,l,m);
+  return eval1-eval2;
+}
+
+void select_max(int *l,int *m,double **gain,int N,int *state){
+  int i,j;
+  double max;
+  max=-DBL_MAX;
+  
+  for(i=0;i<N;i++){
+    if(!state[i]){
+      for(j=0;j<N;j++){
+         if((i!=j)&&(!state[j])){
+           if(gain[i][j]>max){
+             *l=i;*m=j;
+             max=gain[i][j];
+           }
+         }
+      }
+    }
+  }
+  
+}
+
+void compute_gain(int *sol,int N,double **gain,double **comm, double **arch){
+  int i,j;
+  double eval1;
+  eval1=eval_sol(sol,N,comm,arch);
+  for(i=0;i<N;i++){
+    for(j=0;j<=i;j++){
+      gain[i][j]=gain[j][i]=gain_exchange(sol,i,j,eval1,N,comm,arch);
+    }
+  } 
+}
+
+
+
+/* Randomized Algorithm of
+Hu Chen, Wenguang Chen, Jian Huang, Bob Robert, and H. Kuhn. MPIPP: an automatic profile-guided
+parallel process placement toolset for SMP clusters and multiclusters. In
+Gregory K. Egan and Yoichi Muraoka, editors, ICS, pages 353-360. ACM, 2006.
+ */
+
+void map_MPIPP(tm_topology_t *topology,int nb_seed,int N,int *Value,double **comm, double **arch){
+  int *sol;
+  int *state;
+  double **gain;
+  int **history;
+  double *temp;
+  int i,j,t,l=0,m=0,loop=0,seed=0;
+  double max,sum,best_eval,eval;
+
+  
+  gain=(double**)malloc(sizeof(double*)*N);
+  for(i=0;i<N;i++){
+    gain[i]=(double*)malloc(sizeof(double)*N);  
+    if(!gain[i]){
+    }
+  }
+  history=(int**)malloc(sizeof(int*)*N);
+  for(i=0;i<N;i++)
+    history[i]=(int*)malloc(sizeof(int)*3);
+
+  state=(int*)malloc(sizeof(int)*N);
+  temp=(double*)malloc(sizeof(double)*N);
+
+  sol=generate_random_sol(topology,N,topology->nb_levels-1,seed++);
+  for(i=0;i<N;i++)
+    Value[i]=sol[i];
+  
+  best_eval=DBL_MAX;
+  while(seed<=nb_seed){
+    loop=0;
+    do{
+
+      for(i=0;i<N;i++){
+       state[i]=0;
+       //printf("%d ",sol[i]);
+      }
+      //printf("\n");
+      compute_gain(sol,N,gain,comm,arch);
+  
+      //display_tab(gain,N);
+      //exit(-1);
+      for(i=0;i<N/2;i++){
+       select_max(&l,&m,gain,N,state);
+       //printf("%d: %d <=> %d : %f\n",i,l,m,gain[l][m]);
+       state[l]=1;state[m]=1;
+       exchange(sol,l,m);
+       history[i][1]=l;history[i][2]=m;
+       temp[i]=gain[l][m];
+       compute_gain(sol,N,gain,comm,arch);
+      }
+
+      t=-1;
+      max=0;
+      sum=0;
+      for(i=0;i<N/2;i++){
+       sum+=temp[i];
+       if(sum>max){
+         max=sum;
+         t=i;
+       }
+      }
+      /*for(j=0;j<=t;j++)
+       printf("exchanging: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); */
+      for(j=t+1;j<N/2;j++){
+       exchange(sol,history[j][1],history[j][2]);
+       //printf("Undoing: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); 
+      }
+      //printf("max=%f\n",max);
+
+      /*for(i=0;i<N;i++){
+       printf("%d ",sol[i]);
+       }
+       printf("\n");*/
+      eval=eval_sol(sol,N,comm,arch);
+      if(eval<best_eval){
+       best_eval=eval;
+       for(i=0;i<N;i++)
+         Value[i]=sol[i];
+       //print_sol(N);
+      }
+    
+
+    }while(max>0);
+    
+    sol=generate_random_sol(topology,N,topology->nb_levels-1,seed++);
+
+  }
+
+}
+  
+
+
+
+void map_tree(tree_t* t1,tree_t *t2){
+  /*  double x1,x2;
+  if((!t1->left)&&(!t1->right)){
+    printf ("%d -> %d\n",t1->id,t2->id);
+    Value[t2->id]=t1->id;
+   return;
+  }
+  x1=t2->right->val/t1->right->val+t2->left->val/t1->left->val;
+  x2=t2->left->val/t1->right->val+t2->right->val/t1->left->val;
+  if(x1<x2){
+    map_tree(t1->left,t2->left);
+    map_tree(t1->right,t2->right);
+  }else{
+    map_tree(t1->right,t2->left);
+    map_tree(t1->left,t2->right);
+    }*/
+}
+
+void depth_first(tree_t *comm_tree, int *proc_list,int *i){
+  int j;
+  if(!comm_tree->child){
+    proc_list[(*i)++]=comm_tree->id;
+    return;
+  }
+
+  for(j=0;j<comm_tree->arity;j++){
+    depth_first(comm_tree->child[j],proc_list,i);
+  }
+}
+
+int nb_leaves(tree_t *comm_tree){
+  int n=0,j;
+
+  if(!comm_tree->child){
+    return 1;
+  }
+
+  for(j=0;j<comm_tree->arity;j++){
+    n+=nb_leaves(comm_tree->child[j]);
+  }
+  return n;
+}
+
+
+
+
+/*Map topology to cores: 
+ sigma_i is such that process i is mapped on core sigma_i
+ k_i is such that core i executes process k_i
+
+ size of sigma is the number of processes
+ size of k is the number of cores/nodes
+
+ We must have number of processes <= number of cores
+
+ k_i =-1 if no process is mapped on core i
+*/
+void map_topology(tm_topology_t *topology,tree_t *comm_tree,int nb_proc,int level,
+                 int *sigma, int *k){
+  int *nodes_id;
+  int N;
+  int *proc_list,i,l;
+  int M;
+  int block_size;
+
+  M=nb_leaves(comm_tree);
+  printf("nb_leaves=%d\n",M);
+
+
+
+  nodes_id=topology->node_id[level];
+  N=topology->nb_nodes[level];
+  //printf("level=%d, nodes_id=%p, N=%d\n",level,nodes_id,N);
+
+
+  //printf("N=%d,nb_proc=%d\n",N,nb_proc);
+  /* The number of nodes at level "level" in the tree should be equal to the number of processors*/
+  assert(N==nb_proc);
+
+
+  proc_list=(int*)malloc(sizeof(int)*M);
+  i=0;
+  depth_first(comm_tree,proc_list,&i);
+
+  l=0;
+  for(i=0;i<M;i++){
+    //printf ("%d\n",proc_list[i]);
+  }
+
+
+  block_size=M/N;
+
+
+  if(k){/*if we need the k vector*/
+    printf("M=%d, N=%d, BS=%d\n",M,N,block_size);
+    for(i=0;i<nb_nodes(topology);i++){
+      k[i]=-1;
+    }
+    for(i=0;i<M;i++){
+      if(proc_list[i]!=-1){
+#ifdef DEBUG
+       printf ("%d->%d\n",proc_list[i],nodes_id[i/block_size]);
+#endif
+       sigma[proc_list[i]]=nodes_id[i/block_size];
+       k[nodes_id[i/block_size]]=proc_list[i];
+      }
+    }
+  }else{
+    printf("M=%d, N=%d, BS=%d\n",M,N,block_size);
+    for(i=0;i<M;i++){
+      if(proc_list[i]!=-1){
+#ifdef DEBUG
+       printf ("%d->%d\n",proc_list[i],nodes_id[i/block_size]);
+#endif
+       sigma[proc_list[i]]=nodes_id[i/block_size];
+      }
+    }
+
+  }
+  free(proc_list);
+
+}
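
map_topology() above performs a block assignment: the leaves of the matching tree are listed depth-first, and the process found at position i of that list goes to core nodes_id[i/block_size], with block_size = M/N. A tiny standalone sketch of that indexing (not part of the commit), for M = 8 leaves mapped onto N = 4 cores:

#include <stdio.h>

int main(void){
  int M=8, N=4, block_size, i;
  int nodes_id[4]={0,1,2,3};          /* core ids at the chosen topology level */
  int proc_list[8]={3,1,0,2,6,4,7,5}; /* leaves (process ids) in depth-first order, made up for the example */
  int sigma[8];
  block_size=M/N;
  for(i=0;i<M;i++)
    sigma[proc_list[i]]=nodes_id[i/block_size]; /* same indexing as in map_topology() */
  for(i=0;i<M;i++)
    printf("process %d -> core %d\n",i,sigma[i]);
  /* processes 3 and 1 share core 0, 0 and 2 share core 1, 6 and 4 core 2, 7 and 5 core 3 */
  return 0;
}
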
+
+void map_topology_simple(tm_topology_t *topology,tree_t *comm_tree, int *sigma,int *k){
+  map_topology(topology,comm_tree,topology->nb_nodes[topology->nb_levels-1],topology->nb_levels-1,sigma,k);
+}
+
+int int_cmp(const void* x1,const void* x2){ 
+
+  int *e1,*e2;
+
+  e1=((int *)x1);
+  e2=((int *)x2);
+
+  
+  return (*e1)>(*e2)?-1:1;
+} 
+
+
+int decompose(int n,int optimize,int *tab){
+  int primes[6]={2,3,5,7,11,0},i=0;
+  int flag=2;
+  int j=1;
+
+
+  while(primes[i]&&(n!=1)){
+    printf("[%d] before=%d\n",primes[i],n);
+    if(flag&&optimize&&(n%primes[i]!=0)){
+      n+=primes[i]-n%primes[i];
+      flag--;
+      i=0;
+      continue;
+    }
+    printf("after=%d\n",n);
+    if(n%primes[i]==0){
+      tab[j++]=primes[i];
+      n/=primes[i];
+    }else{
+      i++;
+      flag=1;
+    }
+  }
+  if(n!=1){
+    tab[j++]=n;
+  }
+
+  qsort(tab+1,j-1,sizeof(int),int_cmp);
+
+  for(i=0;i<j;i++)
+    printf("%d:",tab[i]);
+  printf("\n");
+
+  tab[j]=0;
+  
+  return j+1;
+}
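
decompose() above chooses the arities of the synthetic tree levels by factorizing the number of objects per processor into small primes (optionally bumping the value up so it factors better) and returning the factors in decreasing order, terminated by 0. A rough standalone check of the idea (a sketch, not the function itself): 12 objects per processor gives the level arities 3, 2, 2.

#include <stdio.h>

int main(void){
  int primes[6]={2,3,5,7,11,0};
  int n=12, i=0, j=0, factors[16];
  while(primes[i] && n!=1){
    if(n%primes[i]==0){ factors[j++]=primes[i]; n/=primes[i]; }
    else i++;
  }
  if(n!=1) factors[j++]=n;       /* leftover factor, as in decompose() */
  /* decompose() returns these sorted in decreasing order: 12 -> 3,2,2 */
  for(i=j-1;i>=0;i--) printf("%d ",factors[i]);
  printf("\n");
  return 0;
}
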
+
+
+tree_t *build_synthetic_topology(int *synt_tab,int id,int depth,int nb_levels){
+  tree_t *res,**child;
+  int arity=synt_tab[0];
+  int val,i; 
+
+  res=(tree_t*)malloc(sizeof(tree_t));
+  val=0;
+  if(depth>=nb_levels)
+    child=NULL;
+  else{
+    child=(tree_t**)malloc(sizeof(tree_t*)*arity);
+    for(i=0;i<arity;i++){
+      child[i]=build_synthetic_topology(synt_tab+1,i,depth+1,nb_levels);
+    child[i]->parent=res;
+    val+=child[i]->val;
+    }
+  }
+  set_node(res,child,arity,NULL,id,val+speed(depth),child?child[0]:NULL); /* child is NULL at the leaves */
+  return res;
+}
+
+
+void   build_synthetic_proc_id(tm_topology_t *topology){
+  int n=1,i,j;
+  topology->node_id=(int**)malloc(sizeof(int*)*topology->nb_levels);
+  topology->nb_nodes=(int*)malloc(sizeof(int)*topology->nb_levels);
+
+
+  for(i=0;i<topology->nb_levels;i++){
+    topology->nb_nodes[i]=n;
+    topology->node_id[i]=(int*)malloc(sizeof(int)*n);
+    for(j=0;j<n;j++)
+      topology->node_id[i][j]=j;
+    n*=topology->arity[i]; 
+  }
+
+}
+
+void TreeMatchMapping(int nb_obj, int nb_proc,double **comm_mat,  int *sol){
+  tree_t *comm_tree;
+  tm_topology_t *topology;
+
+  int i;
+  for(i=0;i<nb_obj;i++)
+    sol[i]=i;
+
+
+  //  return;
+
+  topology=(tm_topology_t*)malloc(sizeof(tm_topology_t));
+  topology->arity=(int*)malloc(sizeof(int)*MAX_LEVELS);
+  topology->arity[0]=nb_proc;
+  topology->nb_levels=decompose((int)ceil((1.0*nb_obj)/nb_proc),1,topology->arity);
+  printf("Topology nb levels=%d\n",topology->nb_levels);
+  build_synthetic_proc_id(topology);
+
+
+  //exit(-1);
+  //topology_to_arch(topology);
+
+  //display_tab(arch,hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PROC));
+  //display_tab(arch,96);
+  //exit(-1);
+  //int nb_core=topo_nb_proc(topology,1000);
+
+  //display_tab(comm_mat,N);
+
+  comm_tree=build_tree_from_topology(topology,comm_mat,nb_obj);
+  map_topology(topology,comm_tree,nb_proc,1,sol,NULL);
+
+
+  free_topology(topology);
+  free_tree(comm_tree);
+  printf("-------------- Mapping done!\n");
+}
+
+void display_other_heuristics(tm_topology_t *topology,int N,double **comm,double **arch){
+  CLOCK_T time1,time0;
+  double duration; 
+  int *sol;
+  sol=(int*)malloc(sizeof(int)*N);
+
+  map_Packed(topology,N,sol);
+  printf("Packed: "); 
+  print_sol(N,sol,comm,arch);
+
+
+
+  map_RR(N,sol);
+  printf("RR: "); 
+  print_sol(N,sol,comm,arch);
+
+  CLOCK(time0);
+  map_MPIPP(topology,1,N,sol,comm,arch);
+  CLOCK(time1);
+  duration=CLOCK_DIFF(time1,time0);
+  printf("MPIPP-1-D:%f\n",duration);
+  printf("MPIPP-1: ");
+  print_sol(N,sol,comm,arch);
+
+  CLOCK(time0);
+  map_MPIPP(topology,5,N,sol,comm,arch);
+  CLOCK(time1);
+  duration=CLOCK_DIFF(time1,time0);
+  printf("MPIPP-5-D:%f\n",duration);
+  printf("MPIPP-5: ");
+  print_sol(N,sol,comm,arch);
+
+  free(sol);
+}
+
+
diff --git a/src/ck-ldb/treemapping.h b/src/ck-ldb/treemapping.h
new file mode 100644 (file)
index 0000000..70f2e19
--- /dev/null
@@ -0,0 +1,23 @@
+#include "treematch.h"
+int  build_comm(char *filename,double ***pcomm);
+void TreeMatchMapping(int nb_obj, int nb_proc,double **comm_mat,  int *sol);
+double print_sol(int N,int *Value,double **comm, double **arch);
+
+/*Map topology to cores: 
+ sigma_i is such that process i is mapped on core sigma_i
+ k_i is such that core i executes process k_i
+
+ size of sigma is the number of processes (nb_objs)
+ size of k is the number of cores/nodes (nb_proc)
+
+ We must have number of processes <= number of cores
+
+ k_i =-1 if no process is mapped on core i
+*/
+void map_topology_simple(tm_topology_t *topology,tree_t *comm_tree, int *sigma,int *k);
+
+int nb_nodes(tm_topology_t *topology);
+void free_topology(tm_topology_t *topology);
+void display_other_heuristics(tm_topology_t *topology,int N,double **comm,double **arch);
+void print_1D_tab(int *tab,int N);
+void   build_synthetic_proc_id(tm_topology_t *topology);
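
To make the sigma/k convention in the comment above concrete, a small standalone sketch (not part of the commit): sigma maps each process to its core, k is the inverse map with -1 marking cores that received no process, so k[sigma[i]] == i for every process i.

#include <stdio.h>

int main(void){
  int nb_proc=3, nb_cores=4, i;
  int sigma[3]={2,0,3}; /* process i runs on core sigma[i] */
  int k[4];
  for(i=0;i<nb_cores;i++) k[i]=-1;       /* -1: no process mapped on this core */
  for(i=0;i<nb_proc;i++)  k[sigma[i]]=i; /* invert sigma */
  for(i=0;i<nb_cores;i++) printf("core %d runs process %d\n",i,k[i]);
  /* core 1 prints -1: it stays idle, exactly as described above */
  return 0;
}
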
diff --git a/src/ck-ldb/treematch.c b/src/ck-ldb/treematch.c
new file mode 100644 (file)
index 0000000..7b9df43
--- /dev/null
@@ -0,0 +1,1111 @@
+#include <float.h>
+#include "treematch.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "treetimings.h"
+#include "treebucket.h"
+#include <assert.h>
+
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#undef DEBUG
+
+
+void free_list_child(tree_t *tree){
+  int i;
+  if(tree){
+    for(i=0;i<tree->arity;i++){
+      free_list_child(tree->child[i]);
+    }
+    free(tree->child);
+    if(tree->dumb)
+      free(tree);
+  }
+}
+
+
+
+
+void free_tab_child(tree_t *tree){
+  if(tree){
+    free_tab_child(tree->tab_child);
+    free(tree->tab_child);
+  }
+}
+
+
+
+void free_tree(tree_t *tree){
+
+  free_list_child(tree);
+  free_tab_child(tree);
+  free(tree);
+}
+
+long int choose (long n,long k){
+  // compute C_n_k
+  double res=1;
+  int i;
+  for(i=0;i<k;i++)
+    res*=(double)(n-i)/(double)(k-i);
+  
+  return (long int)res;
+}
+
+void set_node(tree_t *node,tree_t ** child, int arity,tree_t *parent,int id,double val,tree_t *tab_child){
+  static int uniq=0;
+  node->child=child;
+  node->arity=arity;
+  node->tab_child=NULL;
+  node->parent=parent;
+  node->id=id;
+  node->val=val;
+  node->uniq=uniq++;
+  node->dumb=0;
+}
+
+void display_node(tree_t *node){
+  printf("child : %p\narity : %d\nparent : %p\nid : %d\nval : %f\nuniq : %d\n\n" ,
+        node->child,node->arity,node->parent,node->id,node->val,node->uniq);
+}
+void clone_tree(tree_t *new,tree_t *old){
+  int i;
+  new->child=old->child;
+  new->arity=old->arity;
+  new->parent=old->parent;
+  //new->deb_tab_child=old->deb_tab_child;
+  new->id=old->id;
+  new->val=old->val;
+  new->uniq=old->uniq;
+  new->dumb=old->dumb;
+  for(i=0;i<new->arity;i++)
+    new->child[i]->parent=new;
+}
+
+
+
+double **aggregate_tab(tree_t *new_tab_node,double **tab,int N,int M){
+  int i,j,i1,j1,id1,id2;
+  double **res;
+    
+  res=(double**)malloc(M*sizeof(double*));
+  for(i=0;i<M;i++)
+    res[i]=(double*)malloc(M*sizeof(double));
+  
+  for(i=0;i<M;i++){
+    for(j=0;j<M;j++){
+      if(i==j)
+       res[i][j]=0;
+      else{
+       res[i][j]=0;
+       for(i1=0;i1<new_tab_node[i].arity;i1++){
+         id1=new_tab_node[i].child[i1]->id;
+         for(j1=0;j1<new_tab_node[j].arity;j1++){
+           id2=new_tab_node[j].child[j1]->id;
+           res[i][j]+=tab[id1][id2];
+           //printf("res[%d][%d]+=tab[%d][%d]=%f\n",i,j,id1,id2,tab[id1][id2]);
+         }
+       }
+      }
+    }
+  }
+  
+  return res;
+
+}
+
+void free_tab_double(double**tab,int N){
+  int i;
+  for(i=0;i<N;i++){
+    free(tab[i]);
+  }
+  free(tab);
+}
+
+void free_tab_int(int**tab,int N){
+  int i;
+  for(i=0;i<N;i++){
+    free(tab[i]);
+  }
+  free(tab);
+}
+
+void display_tab(double **tab,int N){
+  int i,j;
+  double line,total=0;;
+  for(i=0;i<N;i++){
+    line=0;
+    for(j=0;j<N;j++){
+      printf("%8.2f ",tab[i][j]);
+      line+=tab[i][j];
+    }
+    total+=line;
+    //printf(": %.2f",line);
+    printf("\n");
+  }
+  printf("Total: %.2f\n",total);
+}
+
+
+double eval_grouping(double **tab,tree_t **cur_group,int arity,int N){
+  double res=0;
+  int i,k,j,id,id1,id2;
+  //display_tab(tab,N);
+
+
+  for(i=0;i<arity;i++){
+    id=cur_group[i]->id;
+    //printf("%d ",id);
+    for(k=0;k<N;k++){
+      //printf("res+=tab[%d][%d]+tab[%d][%d]=%f+%f \n",k,id,id,k,tab[k][id],tab[id][k]);
+      res+=tab[k][id];
+    }
+  }
+  
+  for(i=0;i<arity;i++){
+    id1=cur_group[i]->id;
+    for(j=0;j<arity;j++){
+      id2=cur_group[j]->id;
+      //printf("res-=tab[%d][%d]=%f\n",id1,id2,tab[id1][id2]);
+      res-=tab[id1][id2];
+    }
+  }
+  //printf(" = %f\n",res);
+  return res;
+}
+
+
+
+group_list_t *new_group_list(tree_t **tab,double val,group_list_t *next){
+  group_list_t *res;
+  res=(group_list_t *)malloc(sizeof(group_list_t));
+  res->tab=tab;
+  res->val=val;
+  res->next=next;
+  res->sum_neighbour=0;
+  return res;
+}
+
+
+void add_to_list(group_list_t *list,tree_t **cur_group, int arity, double val){
+
+  group_list_t *elem;
+  int i;
+  tree_t **tab;
+  
+  tab=(tree_t **)malloc(sizeof(tree_t *)*arity);
+
+  for(i=0;i<arity;i++){
+    tab[i]=cur_group[i];
+    //printf("%d ",cur_group[i]->id);
+  }
+
+  //printf("\n");
+  elem=new_group_list(tab,val,list->next);
+  list->next=elem;
+  list->val++;
+}
+
+
+
+void  list_all_possible_groups(double **tab,tree_t *tab_node,int id,int arity, int depth,
+                              tree_t **cur_group,int N,group_list_t *list){
+  double val;
+  int i;
+
+
+
+  if(depth==arity){
+    val=eval_grouping(tab,cur_group,arity,N);
+    add_to_list(list,cur_group,arity,val);
+    return;
+    }else if(N+depth>=arity+id){
+    //}else if(1){
+    for(i=id;i<N;i++){
+      if(tab_node[i].parent)
+       continue;
+      cur_group[depth]=&tab_node[i];
+      //printf("%d<-%d\n",depth,i);
+      list_all_possible_groups(tab,tab_node,i+1,arity,depth+1,cur_group,N,list);
+    }
+  }
+}
+
+
+void update_val(double **tab,tree_t *parent,int N){
+  int i;
+
+  parent->val=eval_grouping(tab,parent->child,parent->arity,N);
+  //printf("connecting: ");
+  for(i=0;i<parent->arity;i++){
+    //printf("%d ",parent->child[i]->id);
+    /*  if(parent->child[i]->parent!=parent){
+      parent->child[i]->parent=parent;
+    }else{
+      fprintf(stderr,"redundant operation!\n");
+      exit(-1);
+      }*/
+  }
+
+
+  //printf(": %f\n",parent->val);
+}
+
+
+int independent_groups(group_list_t **selection,int d,group_list_t *elem,int arity){
+  int i,j,k;
+
+  if(d==0)
+    return 1;
+
+  for(i=0;i<arity;i++){
+    for(j=0;j<d;j++){
+      for(k=0;k<arity;k++){
+       if(elem->tab[i]->id==selection[j]->tab[k]->id)
+         return 0;
+      }
+    }
+  }
+  
+  return 1;
+
+
+}
+
+void display_selection (group_list_t** selection,int M,int arity,double val){
+  int i,j;
+  for(i=0;i<M;i++){
+    for(j=0;j<arity;j++)
+      printf("%d ",selection[i]->tab[j]->id);
+    printf("-- ");
+  }
+  printf(":%f\n",val); 
+
+}
+
+void display_grouping (tree_t *father,int M,int arity,double val){
+  int i,j;
+  for(i=0;i<M;i++){
+    for(j=0;j<arity;j++)
+      printf("%d ",father[i].child[j]->id);
+    printf("-- ");
+  }
+  printf(":%f\n",val); 
+
+}
+
+
+int recurs_select_independent_groups(group_list_t **tab,int i,int n,int arity,int d,int M,double val,double *best_val,group_list_t **selection,group_list_t **best_selection){
+  group_list_t *elem;
+
+  //if(val>=*best_val)
+  // return 0;
+
+  if(d==M){
+    //display_selection(selection,M,arity,val);
+    if(val<*best_val){
+      *best_val=val;
+      for(i=0;i<M;i++)
+       best_selection[i]=selection[i];
+      return 1;
+    }
+    return 0;
+  }
+  
+  while(i<n){
+    elem=tab[i];
+    if(independent_groups(selection,d,elem,arity)){
+      //printf("%d: %d\n",d,i);
+      selection[d]=elem;
+      val+=elem->val;
+      return recurs_select_independent_groups(tab,i+1,n,arity,d+1,M,val,best_val,selection,best_selection);
+    }
+    i++;
+  }
+  return 0;
+}
+
+
+int test_independent_groups(group_list_t **tab,int i,int n,int arity,int d,int M,double val,double *best_val,group_list_t **selection,group_list_t **best_selection){
+  group_list_t *elem;
+
+  if(d==M){
+    //display_selection(selection,M,arity,val);
+    return 1;
+  }
+  
+  while(i<n){
+    elem=tab[i];
+    if(independent_groups(selection,d,elem,arity)){
+      //printf("%d: %d\n",d,i);
+      selection[d]=elem;
+      val+=elem->val;
+      return recurs_select_independent_groups(tab,i+1,n,arity,d+1,M,val,best_val,selection,best_selection);
+    }
+    i++;
+  }
+  return 0;
+}
+
+void  delete_group_list(group_list_t *list){
+
+  if(list){
+    delete_group_list(list->next);
+    free(list->tab);
+    free(list);
+  }
+}
+
+
+int group_list_id(const void* x1,const void* x2){ 
+
+  group_list_t *e1,*e2;
+
+  e1=*((group_list_t**)x1);
+  e2=*((group_list_t**)x2);
+
+  
+  return e1->tab[0]->id<e2->tab[0]->id?-1:1;
+} 
+
+int group_list_asc(const void* x1,const void* x2){ 
+
+  group_list_t *e1,*e2;
+
+  e1=*((group_list_t**)x1);
+  e2=*((group_list_t**)x2);
+
+  
+  return e1->val<e2->val?-1:1;
+} 
+
+
+int group_list_dsc(const void* x1,const void* x2){ 
+
+  group_list_t *e1,*e2;
+
+  e1=*((group_list_t**)x1);
+  e2=*((group_list_t**)x2);
+
+  
+  return e1->val>e2->val?-1:1;
+} 
+
+
+int weighted_degree_asc(const void* x1,const void* x2){ 
+
+  group_list_t *e1,*e2;
+
+  e1=*((group_list_t**)x1);
+  e2=*((group_list_t**)x2);
+
+  
+  return e1->wg>e2->wg?1:-1;
+} 
+
+
+int weighted_degree_dsc(const void* x1,const void* x2){ 
+
+  group_list_t *e1,*e2;
+
+  e1=*((group_list_t**)x1);
+  e2=*((group_list_t**)x2);
+
+  
+  return e1->wg>e2->wg?-1:1;
+} 
+
+int  select_independent_groups(group_list_t **tab_group,int n,int arity,int M,double *best_val,group_list_t **best_selection,int bound,double max_duration){
+  int i;
+  group_list_t **selection;
+  double val,duration;
+  CLOCK_T time1,time0;
+
+
+  selection=(group_list_t **)malloc(sizeof(group_list_t*)*M);
+  CLOCK(time0);
+  for(i=0;i<MIN(bound,n);i++){
+    selection[0]=tab_group[i];
+    val=tab_group[i]->val;
+    recurs_select_independent_groups(tab_group,i+1,n,arity,1,M,val,best_val,selection,best_selection);
+    if(i%5){
+      CLOCK(time1);
+      duration=CLOCK_DIFF(time1,time0);
+      if(duration>max_duration){
+       free(selection);
+       return 1;
+      }
+    }
+  }
+
+  free(selection);
+
+#ifdef DEBUG
+  display_selection(best_selection,M,arity,*best_val);
+#endif
+  return 0;
+}
+
+int  select_independent_groups_by_largest_index(group_list_t **tab_group,int n,int arity,int M,double *best_val,group_list_t **best_selection,int bound,double max_duration){
+  int i,nb_groups=0;
+  group_list_t **selection;
+  double val,duration;
+  int dec;
+  CLOCK_T time1,time0;
+
+  selection=(group_list_t **)malloc(sizeof(group_list_t*)*M);
+  CLOCK(time0);
+  
+  dec=MAX(n/10000,1);
+  for(i=n-1;i>=0;i-=dec*dec){
+    selection[0]=tab_group[i];
+    val=tab_group[i]->val;
+    nb_groups+=test_independent_groups(tab_group,i+1,n,arity,1,M,val,best_val,selection,best_selection);
+    //printf("%d:%d\n",i,nb_groups);
+    if(nb_groups>=bound){
+      free(selection);
+      return 0;
+    }
+    if(i%5){
+      CLOCK(time1);
+      duration=CLOCK_DIFF(time1,time0);
+      if(duration>max_duration){
+       free(selection);
+       return 1;
+      }
+    }
+  }
+
+  free(selection);
+  return 0;
+}
+
+void list_to_tab(group_list_t *list,group_list_t **tab,int n){
+  int i;
+  for(i=0;i<n;i++){
+    if(!list){
+      fprintf(stderr,"Error not enough elements. Only %d on %d\n",i,n);
+      exit(-1);
+    }
+    tab[n-i-1]=list;
+    list=list->next;
+  }
+  if(list){
+    fprintf(stderr,"Error too many elements\n");
+    exit(-1);
+  }
+}
+
+void display_tab_group(group_list_t **tab, int n,int arity){
+  int i,j;
+  for(i=0;i<n;i++){
+    for(j=0;j<arity;j++)
+      printf("%d ",tab[i]->tab[j]->id);
+    printf(": %.2f %.2f\n",tab[i]->val,tab[i]->wg);
+  }
+}
+
+int independent_tab(tree_t **tab1,tree_t **tab2,int n){
+  int i,j;
+  i=0;j=0;
+  while((i<n)&&(j<n)){
+    if(tab1[i]->id==tab2[j]->id)
+      return 0;
+    else if(tab1[i]->id>tab2[j]->id)
+      j++;
+    else
+      i++;
+  }
+  return 1;
+}
+
+    
+
+void compute_weighted_degree(group_list_t **tab, int n,int arity){
+  int i,j;
+  for(i=0;i<n;i++)
+    tab[i]->sum_neighbour=0;
+  for(i=0;i<n;i++){
+    //printf("%d/%d=%f%%\n",i,n,(100.0*i)/n);
+    for(j=i+1;j<n;j++){
+      //if(!independent_groups(&tab[i],1,tab[j],arity)){
+       if(!independent_tab(tab[i]->tab,tab[j]->tab,arity)){
+       tab[i]->sum_neighbour+=tab[j]->val;
+       tab[j]->sum_neighbour+=tab[i]->val;
+     }
+    }
+
+    tab[i]->wg=tab[i]->sum_neighbour/tab[i]->val;
+    if(tab[i]->sum_neighbour==0)
+      tab[i]->wg=0;
+    //printf("%d:%f/%f=%f\n",i,tab[i]->sum_neighbour,tab[i]->val,tab[i]->wg);
+  }
+}
+
+
+/*
+  Very slow: explore all possibilities
+  tab: comm_matrix at the considered level (used to evaluate a grouping)
+  tab_node: array of the nodes to group
+  parent: node to which the computed group is attached
+  id: index of the node of tab_node currently considered
+  arity: number of children of parent (i.e. size of the group to compute)
+  best_val: current value of the grouping
+  cur_group: current grouping
+ */
+void  group(double **tab,tree_t *tab_node,tree_t *parent,int id,int arity, int n,double *best_val,tree_t **cur_group,int N){
+  double val;
+  int i;
+
+
+  // if we have found enough nodes for the group
+  if(n==arity){
+    // evaluate this group
+    val=eval_grouping(tab,cur_group,arity,N);
+    // If we improve compared to the previous grouping: update the children of parent accordingly
+    if(val<*best_val){
+      *best_val=val;
+       for(i=0;i<arity;i++){
+         parent->child[i]=cur_group[i];
+       }
+      parent->arity=arity;
+    } 
+    return;
+  }
+  
+
+
+  // If we need more nodes in the group,
+  // continue to explore the available nodes
+  for(i=id+1;i<N;i++){
+    // If this node is already in a group: skip it
+    if(tab_node[i].parent)
+      continue;
+    //Otherwise, add it to the group at place n
+    cur_group[n]=&tab_node[i];
+    //printf("%d<-%d\n",n,i);
+    //recursively add the next element to this group
+    group(tab,tab_node,parent,i,arity,n+1,best_val,cur_group,N);
+  }
+}
+
+/*
+  tab: comm_matrix at the considered level (used to evaluate a grouping)
+  tab_node: array of the nodes to group
+  parent: node to which the computed group is attached
+  id: index of the node of tab_node currently considered
+  arity: number of children of parent (i.e. size of the group to compute)
+  best_val: current value of the grouping
+  cur_group: current grouping
+  N: size of tab and tab_node. i.e. number of nodes at the considered level
+ */
+void  fast_group(double **tab,tree_t *tab_node,tree_t *parent,int id,int arity, int n,
+                double *best_val,tree_t **cur_group,int N, int *nb_groups,int max_groups){
+  double val;
+  int i;
+
+  //printf("Max groups=%d\n",max_groups);
+
+  // if we have found enough nodes for the group
+  if(n==arity){
+    (*nb_groups)++;
+    // evaluate this group
+    val=eval_grouping(tab,cur_group,arity,N);
+    // If we improve compared to the previous grouping: update the children of parent accordingly
+    if(val<*best_val){
+      *best_val=val;
+       for(i=0;i<arity;i++){
+         parent->child[i]=cur_group[i];
+       }
+      parent->arity=arity;
+    } 
+    return;
+  }
+  
+
+
+  // If we need more nodes in the group,
+  // continue to explore the available nodes
+  for(i=id+1;i<N;i++){
+    // If this node is already in a group: skip it
+    if(tab_node[i].parent)
+      continue;
+    //Otherwise, add it to the group at place n
+    cur_group[n]=&tab_node[i];
+    //printf("%d<-%d %d/%d\n",n,i,*nb_groups,max_groups);
+    //exit(-1);
+    //recursively add the next element to this group
+    fast_group(tab,tab_node,parent,i,arity,n+1,best_val,cur_group,N,nb_groups,max_groups);
+    if(*nb_groups>max_groups)
+      return;
+  }
+}
+
+
+
+
+void fast_grouping(double **tab,tree_t *tab_node, tree_t *new_tab_node, int arity,int N, int M,long int k){
+  tree_t **cur_group;
+  int l,i;
+  double best_val,val=0;;
+  int nb_groups;
+
+  cur_group=(tree_t**)malloc(sizeof(tree_t*)*arity);
+  for(l=0;l<M;l++){
+    best_val=DBL_MAX;
+    nb_groups=0;
+    //printf("k%d/%d, k=%ld\n",l,M,k);
+    /* select the best greedy grouping among the 10 first one*/
+    //fast_group(tab,tab_node,&new_tab_node[l],-1,arity,0,&best_val,cur_group,N,&nb_groups,MAX(2,(int)(50-log2(k))-M/10));
+    fast_group(tab,tab_node,&new_tab_node[l],-1,arity,0,&best_val,cur_group,N,&nb_groups,MAX(1,(int)(50-log2(k))-M/10));
+    val+=best_val;
+    for(i=0;i<new_tab_node[l].arity;i++){
+      new_tab_node[l].child[i]->parent=&new_tab_node[l];
+    }
+    update_val(tab,&new_tab_node[l],N);
+  }
+
+  free(cur_group);  
+
+  printf("val=%f\n",val);
+  //exit(-1);
+
+#ifdef DEBUG
+  display_grouping(new_tab_node,M,arity,val);
+#endif
+}
+
+
+
+
+int adjacency_asc(const void* x1,const void* x2){ 
+
+  adjacency_t *e1,*e2;
+
+  e1=((adjacency_t*)x1);
+  e2=((adjacency_t*)x2);
+
+  
+  return e1->val<e2->val?-1:1;
+} 
+
+
+int adjacency_dsc(const void* x1,const void* x2){ 
+
+  adjacency_t *e1,*e2;
+
+  e1=((adjacency_t*)x1);
+  e2=((adjacency_t*)x2);
+
+  
+  return e1->val>e2->val?-1:1;
+} 
+void super_fast_grouping(double **tab,tree_t *tab_node, tree_t *new_tab_node, int arity,int N, int M,int k){
+  double val=0;
+  adjacency_t *graph;
+  int i,j,e,l,nb_groups;
+  double duration;
+
+  assert(arity==2);
+
+  TIC;
+  graph=(adjacency_t*)malloc(sizeof(adjacency_t)*((N*N-N)/2));
+  e=0;
+  for(i=0;i<N;i++){
+    for(j=i+1;j<N;j++){
+      graph[e].i=i;
+      graph[e].j=j;
+      graph[e].val=tab[i][j];
+      e++;
+    }
+  }
+  duration=TOC;
+  printf("linearization=%fs\n",duration);
+  
+
+  assert(e==(N*N-N)/2);
+  TIC;  
+  qsort(graph,e,sizeof(adjacency_t),adjacency_dsc);
+  duration=TOC;
+  
+  printf("sorting=%fs\n",duration);
+
+  TIC;
+
+TIC;
+  l=0;
+  nb_groups=0;
+  for(i=0;i<e&&l<M;i++){
+    if(try_add_edge(tab,tab_node,&new_tab_node[l],arity,graph[i].i,graph[i].j,N,&nb_groups)){
+      l++;
+    }
+  }
+
+  for(l=0;l<M;l++){
+    update_val(tab,&new_tab_node[l],N);      
+    val+=new_tab_node[l].val;
+  }
+
+  duration=TOC;
+  printf("Grouping=%fs\n",duration);
+
+  printf("val=%f\n",val);
+
+#ifdef DEBUG
+  display_grouping(new_tab_node,M,arity,val);
+#endif
+}
+
+
+
+
+/*
+  tab: comm_matrix at the considered level (used to evaluate a grouping)
+  tab_node: array of the nodes to group
+  new_tab_node: array of nodes at the next level (the parents of the nodes in tab_node once the grouping is done).
+  arity: number of children of parent (i.e. size of the group to compute)
+  N: size of tab and tab_node, i.e. number of nodes at the considered level
+  M: size of new_tab_node, i.e. the number of parents
+  Hence we have: M*arity=N
+ */void group_nodes(double **tab,tree_t *tab_node, tree_t *new_tab_node, int arity,int N, int M){
+  tree_t **cur_group;
+  int j,l,n;
+  group_list_t list,**best_selection,**tab_group;
+  double best_val,last_best;
+  int timeout;
+
+
+  long int k=choose(N,arity);
+  printf("Number of groups:%ld\n",k);
+  
+  if(k>30000){
+#ifdef DEBUG
+    printf("Fast Grouping...\n");
+#endif
+
+    double duration;
+    
+    TIC;
+    if(arity<=2){
+      bucket_grouping(tab,tab_node,new_tab_node,arity,N,M,k);
+    }else{
+      fast_grouping(tab,tab_node,new_tab_node,arity,N,M,k);
+    }
+    duration=TOC;
+
+    printf("Fast grouping duration=%f\n",duration);
+    
+    //exit(-1);
+  }else{
+#ifdef DEBUG
+    printf("Grouping nodes...\n");
+#endif
+    list.next=NULL;
+    list.val=0;//number of elements in the list
+    cur_group=(tree_t**)malloc(sizeof(tree_t*)*arity);
+    best_selection=(group_list_t **)malloc(sizeof(group_list_t*)*M);
+  
+    list_all_possible_groups(tab,tab_node,0,arity,0,cur_group,N,&list);
+    n=(int)list.val;
+    assert(n==k);
+    tab_group=(group_list_t**)malloc(sizeof(group_list_t*)*n);
+    list_to_tab(list.next,tab_group,n);
+#ifdef DEBUG
+    printf("List to tab done\n");
+#endif
+    best_val=DBL_MAX;
+  
+    // perform the packed mapping first
+    timeout=select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,1,0.1);
+#ifdef DEBUG
+    if(timeout){
+      printf("Packed mapping timeout!\n");
+    }
+#endif
+    // give this mapping extra credit (in general MPI applications are written such that neighbouring processes communicate more than distant ones)
+    best_val/=1.001;
+#ifdef DEBUG
+    printf("Packing computed\n");
+#endif
+    // perform a mapping trying to use group that cost less first
+    qsort(tab_group,n,sizeof(group_list_t*),group_list_asc);
+    last_best=best_val;
+    timeout=select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,10,0.1);
+#ifdef DEBUG
+    if(timeout){
+      printf("Cost less first timeout!\n");
+    }else if(last_best>best_val){
+      printf("Cost less first Impoved solution\n");
+    }
+    printf("----\n");
+#endif
+    // perform a mapping trying to minimize the use of groups that cost a lot 
+    qsort(tab_group,n,sizeof(group_list_t*),group_list_dsc);
+    last_best=best_val;
+    timeout=select_independent_groups_by_largest_index(tab_group,n,arity,M,&best_val,best_selection,10,0.1);
+#ifdef DEBUG
+    if(timeout){
+      printf("Cost most last timeout!\n");
+    }else if(last_best>best_val){
+      printf("Cost most last impoved solution\n");
+    }
+#endif
+    if(n<10000){
+      // perform a mapping in the weighted degree order 
+#ifdef DEBUG
+      printf("----WG----\n");
+#endif
+      compute_weighted_degree(tab_group,n,arity);
+#ifdef DEBUG
+      printf("Weigted degree computed\n");
+#endif
+      qsort(tab_group,n,sizeof(group_list_t*),weighted_degree_dsc);
+      //display_tab_group(tab_group,n,arity);
+      last_best=best_val;
+      timeout=select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,10,0.1);
+#ifdef DEBUG
+      if(timeout){
+       printf("WG timeout!\n");
+      }else if(last_best>best_val){
+       printf("WG impoved solution\n");
+      }
+#endif
+    }
+    
+    qsort(best_selection,M,sizeof(group_list_t*),group_list_id);
+
+    for(l=0;l<M;l++){
+      for(j=0;j<arity;j++){
+       new_tab_node[l].child[j]=best_selection[l]->tab[j];
+       new_tab_node[l].child[j]->parent=&new_tab_node[l];
+      }
+      new_tab_node[l].arity=arity;
+    
+      //printf("arity=%d\n",new_tab_node[l].arity);
+      update_val(tab,&new_tab_node[l],N);
+    }
+    
+    delete_group_list((&list)->next);
+    free(best_selection);
+    free(tab_group);
+    free(cur_group);  
+  }
+
+  printf("Grouping done!\n");
+}
+
+
+
+
+
+void complete_tab(double ***tab,int N, int K){
+  double **old_tab,**new_tab;
+  int M,i,j;
+  if(K==0)
+    return;
+
+  old_tab=*tab;
+  
+  M=N+K;
+  new_tab=(double**)malloc(M*sizeof(double*));
+  for(i=0;i<M;i++)
+    new_tab[i]=(double*)malloc(M*sizeof(double));
+  
+  *tab=new_tab;
+  for(i=0;i<M;i++){
+    for(j=0;j<M;j++){
+      if((i<N)&&(j<N)){
+       new_tab[i][j]=old_tab[i][j];
+      }else{
+       new_tab[i][j]=0;
+      }
+    }
+  }
+}
+
+void create_dumb_tree(tree_t *node,int depth,tm_topology_t *topology){
+  tree_t **list_child;
+  int arity,i;
+
+  
+  if(depth==topology->nb_levels-1){
+    set_node(node,NULL,0,NULL,-1,0,NULL);  
+    return;
+  }
+
+  arity=topology->arity[depth];
+  assert(arity>0);
+  list_child=(tree_t**)calloc(arity,sizeof(tree_t*));
+  for(i=0;i<arity;i++){
+    list_child[i]=(tree_t*)malloc(sizeof(tree_t));
+    create_dumb_tree(list_child[i],depth+1,topology);
+    list_child[i]->parent=node;
+    list_child[i]->dumb=1;
+  }
+
+  set_node(node,list_child,arity,NULL,-1,0,list_child[0]);
+  
+}
+
+void complete_tab_node(tree_t **tab,int N, int K,int depth,tm_topology_t *topology){
+  tree_t *old_tab,*new_tab;
+  int M,i;
+  if(K==0)
+    return;
+
+  old_tab=*tab;
+  
+  M=N+K;
+  new_tab=(tree_t*)malloc(M*sizeof(tree_t));
+  
+  *tab=new_tab;
+  for(i=0;i<M;i++){
+    if((i<N)){
+      clone_tree(&new_tab[i],&old_tab[i]);
+    }else{
+      create_dumb_tree(&new_tab[i],depth,topology);
+      new_tab[i].id=i;
+    }
+  }
+
+  //do not suppress tab if you are at the deepest level: it will be used at the mapping stage
+  free(old_tab);  
+}
+
+
+void set_deb_tab_child(tree_t *tree, tree_t *child,int depth){
+  //printf("depth=%d\t%p\t%p\n",depth,child,tree);
+  if(depth>0)
+    set_deb_tab_child(tree->tab_child,child,depth-1);
+  else
+    tree->tab_child=child;
+}
+
+
+
+/* 
+Build the tree of the matching. It is a bottom-up algorithm: it starts from the bottom of the tree and proceeds by decreasing the depth.
+It groups nodes of the matrix tab and links these groups to the nodes of the level below.
+Then it calls itself recursively to perform the grouping at the level above.
+
+tab_node: array of nodes of the level below.
+tab: local communication matrix
+N: number of nodes.
+arity: arity of the nodes of the level above.
+depth: current depth of the algorithm
+topology: description of the hardware topology.
+*/
+tree_t *build_level_topology(tree_t *tab_node,double **com_mat,int N,int arity,int depth,tm_topology_t *topology){
+  int M; /*N/arity: number of groups*/
+  int K=0,i;
+  tree_t *new_tab_node; /*array of nodes for this level (of size M): they will be linked to the nodes of tab_node*/
+  double **new_com_mat; /*new communication matrix (after grouping nodes together)*/
+  tree_t *res; /*resulting tree*/
+  int completed=0;
+
+  if(depth==0){
+    if(N==1)
+      return &tab_node[0];
+    else{
+      fprintf(stderr,"Error: matrix size: %d at depth %d (should be 1 when the root is reached)\n",N,depth);
+      exit(-1);
+    }
+  }
+
+  
+  /* If the arity does not divide the number of nodes, we add K virtual nodes  */
+  if(N%arity!=0){
+    K=arity*((N/arity)+1)-N;
+    //printf("****N=%d arity=%d K=%d\n",N,arity,K);  
+    //display_tab(tab,N);
+    /* add K rows and columns to tab*/
+    complete_tab(&com_mat,N,K);
+    //display_tab(tab,N+K);
+    /* add a dumb tree to the K new "virtual nodes"*/
+    complete_tab_node(&tab_node,N,K,depth,topology);
+    completed=1; /*flag this addition*/
+    N+=K; /*increase the number of nodes accordingly*/
+  } //display_tab(tab,N);
+
+
+  M=N/arity;
+  printf("Depth=%d\tnb_nodes=%d\tnb_groups=%d\tsize of groups(arity)=%d\n",depth,N,M,arity);
+
+  /*create the new nodes*/
+  new_tab_node=(tree_t*)malloc(sizeof(tree_t)*M);
+  /*initialize each node*/
+  for(i=0;i<M;i++){
+    tree_t **list_child;
+    list_child=(tree_t**)calloc(arity,sizeof(tree_t*));
+    set_node(&new_tab_node[i],list_child,arity,NULL,i,0,tab_node);
+  }
+
+  /*Core of the algorithm: perform the grouping*/
+  group_nodes(com_mat,tab_node,new_tab_node,arity,N,M);
+  /*based on that grouping, aggregate the communication matrix*/
+  new_com_mat=aggregate_tab(new_tab_node,com_mat,N,M);
+
+  /* set ID of virtual nodes to -1*/
+  for(i=N-K;i<N;i++)
+    tab_node[i].id=-1;
+
+  //for(i=0;i<N;i++)
+  //  display_node(&tab_node[i]);
+
+  //display_tab(new_com_mat,M);
+
+  /* decrease depth and compute the arity of the level above*/
+  depth--;
+  if(depth>0)
+    arity = topology->arity[depth-1];
+  else
+    arity=1;
+  // assume all objects have the same arity
+  res = build_level_topology(new_tab_node,new_com_mat,M,arity,depth,topology);  
+
+  set_deb_tab_child(res,tab_node,depth);
+
+
+  if(completed){
+    free_tab_double(com_mat,N);
+  }
+  free_tab_double(new_com_mat,M);
+
+
+  return res;
+}
+
+
+
+double speed(int depth){
+  // Bertha values
+  //double tab[5]={21,9,4.5,2.5,0.001};
+  //double tab[5]={1,1,1,1,1};
+  //double tab[6]={100000,10000,1000,500,100,10};
+  double tab[5]={100000,10000,1000,500,10};
+    
+  return 1.0/tab[depth];
+  //return 10*log(depth+2);
+  //return (depth+1);
+  //return (long int)pow(100,depth);
+}
+
+tree_t * build_tree_from_topology(tm_topology_t *topology,double **tab,int N){
+  int depth,i; 
+  tree_t *res,*tab_node;
+
+
+  tab_node=(tree_t*)malloc(sizeof(tree_t)*N);
+  for(i=0;i<N;i++)
+    set_node(&tab_node[i],NULL,0,NULL,i,0,NULL); 
+
+  depth = topology->nb_levels -1;
+  printf("nb_levels=%d\n",depth+1);
+  // assume all objects have the same arity
+  res = build_level_topology(tab_node,tab,N,topology->arity[depth-1],depth,topology);
+  printf("Build tree done!\n");
+  return res;
+}
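For illustration only (not part of this change), a minimal sketch of how the entry point above might be driven. The 3-level machine, its arities and the uniform 4x4 communication matrix are made-up assumptions; the tm_topology_t field names follow the declaration in treematch.h below.

    #include <stdlib.h>
    #include "treematch.h"

    int main(void){
      int N = 4, i, j;
      int arities[3]  = {2, 2, 0};   /* assumed machine: root -> 2 nodes -> 2 cores each; leaves have arity 0 */
      int nb_nodes[3] = {1, 2, 4};
      tm_topology_t topology;
      double **comm;
      tree_t *root;

      topology.nb_levels = 3;
      topology.arity     = arities;
      topology.nb_nodes  = nb_nodes;
      topology.node_id   = NULL;     /* assumed unused by the tree construction shown above */

      /* made-up uniform communication matrix between the 4 leaf objects */
      comm = (double**)malloc(N*sizeof(double*));
      for(i=0;i<N;i++){
        comm[i] = (double*)calloc(N,sizeof(double));
        for(j=0;j<N;j++)
          comm[i][j] = (i==j) ? 0.0 : 1.0;
      }

      root = build_tree_from_topology(&topology, comm, N);
      /* the resulting tree would then be passed on to the mapping stage */
      free_tab_double(comm, N);
      (void)root;
      return 0;
    }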
diff --git a/src/ck-ldb/treematch.h b/src/ck-ldb/treematch.h
new file mode 100644 (file)
index 0000000..259cec4
--- /dev/null
@@ -0,0 +1,56 @@
+#ifndef __TREE_H__
+#define __TREE_H__
+#include <stdlib.h>
+
+typedef struct _tree_t{
+  struct _tree_t **child;
+  struct _tree_t *parent;
+  struct _tree_t *tab_child; /*the pointer to be freed*/
+  double val;
+  int arity;
+  int depth;
+  int id;
+  int uniq;
+  int dumb; /* 1 if the node belongs to a dumb tree: hence has to be freed separately*/
+}tree_t;
+
+/* Maximum number of levels in the tree*/
+#define MAX_LEVELS 100
+
+typedef struct {
+  int *arity; /* arity of the nodes of each level*/
+  int nb_levels; /*number of levels of the tree*/ 
+  int *nb_nodes; /*nb of nodes of each level*/
+  int **node_id; /*ID of the nodes of the tree for each level*/
+}tm_topology_t;
+
+
+
+tree_t * build_tree(double **tab,int N);
+tree_t * build_tree_from_topology(tm_topology_t *topology,double **tab,int N);
+void map_tree(tree_t *,tree_t*);
+void display_tab(double **tab,int N);
+double speed(int depth);
+void set_node(tree_t *node,tree_t ** child, int arity,tree_t *parent,int id,double val,tree_t *deb_tab_child);
+void free_tree(tree_t *tree);
+void free_tab_double(double**tab,int N);
+void free_tab_int(int**tab,int N);
+void update_val(double **tab,tree_t *parent,int N);
+
+typedef struct _group_list_t{
+  struct _group_list_t *next;
+  tree_t **tab;
+  double val;
+  double sum_neighbour;
+  double wg;
+}group_list_t;
+
+
+typedef struct{
+  int i;
+  int j;
+  double val;
+}adjacency_t;
+
+#endif
+
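For illustration only (not part of this change), a hypothetical walk over the resulting tree using just the child, arity and id fields declared above; nodes with id -1 are the virtual padding nodes added when the arity does not divide the node count.

    #include <stdio.h>
    #include "treematch.h"

    /* print the leaf ids in the order produced by the matching */
    static void print_leaves(tree_t *node){
      int i;
      if(node->arity==0){          /* leaf object */
        if(node->id!=-1)           /* skip virtual padding nodes */
          printf("%d ",node->id);
        return;
      }
      for(i=0;i<node->arity;i++)
        print_leaves(node->child[i]);
    }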
diff --git a/src/ck-ldb/treetimings.c b/src/ck-ldb/treetimings.c
new file mode 100644 (file)
index 0000000..1a45354
--- /dev/null
@@ -0,0 +1,27 @@
+#include "treetimings.h"
+
+static CLOCK_T time_tab[MAX_CLOCK];
+static int clock_num=-1;
+
+void get_time(){
+  clock_num++;
+
+  if(clock_num>MAX_CLOCK-1)
+    return;
+
+  
+  CLOCK(time_tab[clock_num]);
+}
+
+double time_diff(CLOCK_T t1){
+  CLOCK_T t2;
+  
+  if(clock_num>MAX_CLOCK-1){
+    clock_num--;
+    return -1.0;
+  }
+
+  CLOCK(t2);
+
+  return CLOCK_DIFF(t2,time_tab[clock_num--]);
+}
diff --git a/src/ck-ldb/treetimings.h b/src/ck-ldb/treetimings.h
new file mode 100644 (file)
index 0000000..9952f9a
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef TIMINGS_H
+#define TIMINGS_H
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define MAX_CLOCK 100
+
+typedef struct timeval CLOCK_T;
+
+
+#define CLOCK(c) gettimeofday(&c,(struct timezone *)NULL)
+#define CLOCK_DIFF(c1,c2)  \
+((double)(c1.tv_sec-c2.tv_sec)+(double)(c1.tv_usec-c2.tv_usec)/1e+6)
+#define CLOCK_DISPLAY(c) fprintf(stderr,"%d.%d",(int)c.tv_sec,(int)c.tv_usec)
+
+double time_diff();
+void get_time();
+
+#define TIC get_time()
+#define TOC time_diff()
+
+#endif /*TIMINGS_H*/
+
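For illustration only (not part of this change), a hypothetical use of the macros above: TIC pushes a timestamp onto the internal stack kept in treetimings.c and TOC pops it and returns the elapsed time in seconds, so timed sections may be nested up to MAX_CLOCK levels deep.

    #include <stdio.h>
    #include "treetimings.h"

    void timed_section(void){
      double elapsed;
      TIC;                  /* record the start time            */
      /* ... work to be measured ... */
      elapsed = TOC;        /* seconds elapsed since the TIC     */
      printf("section took %f s\n", elapsed);
    }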
index 72e1e084ec6157f29504a010a51b7a867f39e975..d9122c7fe3f567d1a37d8b76bb9b33ede3f3cce4 100644 (file)
@@ -64,5 +64,6 @@ TraceSimple.decl.h TraceSimple.def.h: trace-simple.ci.stamp
 TraceSummary.decl.h TraceSummary.def.h: trace-summary.ci.stamp
 TraceTau.decl.h TraceTau.def.h: trace-Tau.ci.stamp
 TraceUtilization.decl.h TraceUtilization.def.h: trace-utilization.ci.stamp
+TreeMatchLB.decl.h TreeMatchLB.def.h: TreeMatchLB.ci.stamp
 waitqd.decl.h waitqd.def.h: waitqd.ci.stamp
 WSLB.decl.h WSLB.def.h: WSLB.ci.stamp
index 1f61c8817044b4a5a8c8387827a7736c11a13104..0455b9e78bb0135177f74780a00cc520e8b80245 100644 (file)
@@ -1421,11 +1421,12 @@ EveryLB.o: EveryLB.C LBDatabase.h lbdb.h converse.h conv-config.h \
  NeighborCommLB.decl.h NborBaseLB.decl.h NeighborLBMsg.h \
  NeighborLB.decl.h OrbLB.decl.h PhasebyArrayLB.decl.h RandCentLB.decl.h \
  RecBipartLB.decl.h RefineLB.decl.h RefineCommLB.decl.h RotateLB.decl.h \
- ComboCentLB.decl.h GraphPartLB.decl.h GraphBFTLB.decl.h \
- GridCommLB.decl.h GridCommRefineLB.decl.h GridHybridLB.decl.h \
- GridHybridSeedLB.decl.h GridMetisLB.decl.h HbmLB.decl.h HybridLBMsg.h \
- HybridLB.decl.h HybridBaseLB.decl.h RefineKLB.decl.h RefineTopoLB.decl.h \
- TopoCentLB.decl.h TopoLB.decl.h EveryLB.def.h
+ TreeMatchLB.decl.h ComboCentLB.decl.h GraphPartLB.decl.h \
+ GraphBFTLB.decl.h GridCommLB.decl.h GridCommRefineLB.decl.h \
+ GridHybridLB.decl.h GridHybridSeedLB.decl.h GridMetisLB.decl.h \
+ HbmLB.decl.h HybridLBMsg.h HybridLB.decl.h HybridBaseLB.decl.h \
+ RefineKLB.decl.h RefineTopoLB.decl.h TopoCentLB.decl.h TopoLB.decl.h \
+ EveryLB.def.h
        $(CHARMC) -c -I. EveryLB.C
 
 CommonLBs.o: CommonLBs.C LBDatabase.h lbdb.h converse.h conv-config.h \
@@ -1449,7 +1450,7 @@ CommonLBs.o: CommonLBs.C LBDatabase.h lbdb.h converse.h conv-config.h \
  NeighborCommLB.decl.h NborBaseLB.decl.h NeighborLBMsg.h \
  NeighborLB.decl.h OrbLB.decl.h PhasebyArrayLB.decl.h RandCentLB.decl.h \
  RecBipartLB.decl.h RefineLB.decl.h RefineCommLB.decl.h RotateLB.decl.h \
- CommonLBs.def.h
+ TreeMatchLB.decl.h CommonLBs.def.h
        $(CHARMC) -c -I. CommonLBs.C
 
 BlockLB.o: BlockLB.C BlockLB.decl.h charm++.h charm.h converse.h \
@@ -1737,6 +1738,26 @@ RotateLB.o: RotateLB.C RotateLB.decl.h charm++.h charm.h converse.h \
  RotateLB.def.h
        $(CHARMC) -c -I. RotateLB.C
 
+TreeMatchLB.o: TreeMatchLB.C charm++.h charm.h converse.h conv-config.h \
+ conv-autoconfig.h conv-common.h conv-mach.h conv-mach-opt.h pup_c.h \
+ queueing.h conv-cpm.h conv-cpath.h conv-qd.h conv-random.h conv-lists.h \
+ conv-trace.h persistent.h debug-conv.h pup.h middle.h middle-conv.h \
+ cklists.h ckbitvector.h ckstream.h init.h ckhashtable.h debug-charm.h \
+ simd.h CkMarshall.decl.h charm++.h cksection.h ckcallback.h conv-ccs.h \
+ sockRoutines.h ccs-server.h ckobjQ.h ckreduction.h CkReduction.decl.h \
+ cknodegroupreduction.h CkArrayReductionMgr.decl.h ckmemcheckpoint.h \
+ CkMemCheckpoint.decl.h readonly.h ckarray.h cklocation.h LBDatabase.h \
+ lbdb.h LBDBManager.h LBObj.h LBOM.h LBComm.h LBMachineUtil.h lbdb++.h \
+ LBDatabase.decl.h NullLB.decl.h BaseLB.decl.h CkLocation.decl.h \
+ CkArray.decl.h CkFutures.decl.h charisma.h charisma.decl.h tempo.h \
+ tempo.decl.h waitqd.h waitqd.decl.h sdag.h ckcheckpoint.h \
+ CkCheckpoint.decl.h ckevacuation.h ckarrayreductionmgr.h trace.h \
+ trace-bluegene.h envelope.h treemapping.h treematch.h TreeMatchLB.h \
+ CentralLB.h BaseLB.h LBDatabase.h CentralLB.decl.h charm++.h \
+ BaseLB.decl.h LBDatabase.decl.h CentralLBMsg.h TreeMatchLB.decl.h \
+ TreeMatchLB.def.h
+       $(CHARMC) -c -I. TreeMatchLB.C
+
 ComboCentLB.o: ComboCentLB.C ComboCentLB.h CentralLB.h BaseLB.h \
  LBDatabase.h lbdb.h converse.h conv-config.h conv-autoconfig.h \
  conv-common.h conv-mach.h conv-mach-opt.h pup_c.h queueing.h conv-cpm.h \
@@ -2120,6 +2141,18 @@ manager.o: manager.C manager.h CentralLB.h BaseLB.h LBDatabase.h lbdb.h \
  CentralLB.decl.h CentralLBMsg.h
        $(CHARMC) -c -I. manager.C
 
+treematch.o: treematch.c treematch.h treetimings.h treebucket.h
+       $(CHARMC) -c -I. treematch.c
+
+treetimings.o: treetimings.c treetimings.h
+       $(CHARMC) -c -I. treetimings.c
+
+treebucket.o: treebucket.c treematch.h treebucket.h treetimings.h
+       $(CHARMC) -c -I. treebucket.c
+
+treemapping.o: treemapping.c treetimings.h treematch.h treemapping.h
+       $(CHARMC) -c -I. treemapping.c
+
 blue.o: blue.C cklists.h pup.h converse.h conv-config.h conv-autoconfig.h \
  conv-common.h conv-mach.h conv-mach-opt.h pup_c.h queueing.h conv-cpm.h \
  conv-cpath.h conv-qd.h conv-random.h conv-lists.h conv-trace.h \