restructure gauss-seidel
[charm.git] / tests / charm++ / jacobi3d-gausssiedel / jacobi3d.cc
1 /*****************************************************************************
2  * $Source$
3  * $Author$
4  * $Date$
5  * $Revision$
6  *****************************************************************************/
7
8 /** \file jacobi3d Gauss-Seidel to measure openMP, ckLoop
9  *  Author: Yanhua Sun 
10  *  This does a topological placement for a 3d jacobi.
11  *
12  *      
13  *            *****************
14  *         *               *  *
15  *   ^  *****************     *
16  *   |  *               *     *
17  *   |  *               *     *
18  *   |  *               *     *
19  *   Y  *               *     *
20  *   |  *               *     *
21  *   |  *               *     *
22  *   |  *               *  * 
23  *   ~  *****************    Z
24  *      <------ X ------> 
25  *
26  *   X: left, right --> wrap_x
27  *   Y: top, bottom --> wrap_y
28  *   Z: front, back --> wrap_z
29  */
30
31 #include "TopoManager.h"
32
33 int myrand(int numpes) {
34   next = next * 1103515245 + 12345;
35   return((unsigned)(next/65536) % numpes);
36 }
37
38 // We want to wrap entries around, and because mod operator % 
39 // sometimes misbehaves on negative values. -1 maps to the highest value.
40 class ghostMsg: public CMessage_ghostMsg {
41 public:
42     int dir;
43     int height;
44     int width;
45     double* gh;
46
47     ghostMsg(int _d, int _h, int _w) : dir(_d), height(_h), width(_w) { }
48 };
49
50 extern "C" void doCalc(int first,int last, void *result, int paramNum, void * param)
51 {
52 #if JACOBI
53     int i;
54     double maxDiff=0;
55     double tmp, update=0;
56     maxDiff = 0;
57
58     Jacobi  *obj = (Jacobi*)param;
59     int x_c = arrayDimX/2/blockDimX;
60     int y_c = arrayDimY/2/blockDimY;
61     int z_c = arrayDimZ/2/blockDimZ;
62     int x_p = (arrayDimX/2)%blockDimX+1;
63     int y_p = (arrayDimY/2)%blockDimY+1;
64     int z_p = (arrayDimZ/2)%blockDimZ+1;
65     //CkPrintf(" ckloop on %d  first last %d %d \n", CkMyPe(), first, last);  
66     for(i=first; i<=last; ++i) {
67           for(int j=1; j<blockDimY+1; ++j) {
68               for(int k=1; k<blockDimZ+1; ++k) {
69                   if(obj->thisIndex_x == x_c && y_c == obj->thisIndex_y && z_c == obj->thisIndex_z && i==x_p && j==y_p && k==z_p)
70                       continue;
71                   update = 0;//temperature[index(i, j, k)];
72                   update += obj->temperature[index(i+1, j, k)];
73                   update += obj->temperature[index(i-1, j, k)];
74                   update += obj->temperature[index(i, j+1, k)];
75                   update += obj->temperature[index(i, j-1, k)];
76                   update += obj->temperature[index(i, j, k+1)];
77                   update += obj->temperature[index(i, j, k-1)];
78                   update /= 6;
79                  
80                   double diff = obj->temperature[index(i, j, k)] - update;
81                   if(diff<0) diff = -1*diff;
82                   if(diff>maxDiff) maxDiff = diff;
83                   obj->new_temperature[index(i, j, k)] = update;
84               }
85           }
86         }
87         *((double*)result) = maxDiff;
88 #else
89 #endif 
90 }
91 extern "C" void doUpdate(int first,int last, void *result, int paramNum, void *param){
92 #if JACOBI
93         int i;
94         Jacobi  *obj = (Jacobi*)param;
95         int x_c = arrayDimX/2/blockDimX;
96         int y_c = arrayDimY/2/blockDimY;
97         int z_c = arrayDimZ/2/blockDimZ;
98         int x_p = (arrayDimX/2)%blockDimX+1;
99         int y_p = (arrayDimY/2)%blockDimY+1;
100         int z_p = (arrayDimZ/2)%blockDimZ+1;
101         //CkPrintf("\n {%d:%d:%d  %d:%d:%d  %d:%d:%d }", obj->thisIndex_x, obj->thisIndex_y, obj->thisIndex_z, x_c, y_c, z_c, x_p, y_p, z_p);
102         for(i=first; i<=last; ++i) {
103             for(int j=1; j<blockDimY+1; ++j) {
104                 for(int k=1; k<blockDimZ+1; ++k) {
105                     if(!(obj->thisIndex_x == x_c && y_c == obj->thisIndex_y && z_c == obj->thisIndex_z && i==x_p && j==y_p && k==z_p))
106                         obj->temperature[index(i, j, k)] = obj->new_temperature[index(i, j, k)];
107                     //CkPrintf("[ %d:%d:%d  .%2f ]", i, j, k, obj->temperature[index(i, j, k)]); 
108                 }
109             }
110         }
111        //CkPrintf("\n\n");
112 #else
113 #endif
114 }
115
116
117 Jacobi::Jacobi() {
118       // This call is an anachronism - the need to call __sdag_init() has been
119       // removed. We still call it here to test backward compatibility.
120       __sdag_init();
121
122       int i, j, k;
123       // allocate a three dimensional array
124       temperature = new double[(blockDimX+2) * (blockDimY+2) * (blockDimZ+2)];
125 #if  JACOBI
126       new_temperature = new double[(blockDimX+2) * (blockDimY+2) * (blockDimZ+2)];
127 #endif
128       for(i=0; i<blockDimX+2; ++i) {
129           for(j=0; j<blockDimY+2; ++j) {
130               for(k=0; k<blockDimZ+2; ++k) {
131                   temperature[index(i, j, k)] = 1.0;
132 #if  JACOBI
133                   new_temperature[index(i, j, k)] = 1.0;
134 #endif
135               }
136           } 
137       }
138       thisIndex_x = thisIndex.x;
139       thisIndex_y = thisIndex.y;
140       thisIndex_z = thisIndex.z;
141       iterations = 0;
142       imsg = 0;
143       constrainBC();
144
145       usesAtSync = CmiTrue;
146       neighbors = 6;
147       if(thisIndex.x == 0) 
148           neighbors--;
149       if( thisIndex.x== num_chare_x-1)
150           neighbors--;
151       if(thisIndex.y == 0) 
152           neighbors--;
153       if( thisIndex.y== num_chare_y-1)
154           neighbors--;
155       if(thisIndex.z == 0) 
156           neighbors--;
157       if( thisIndex.z== num_chare_z-1)
158           neighbors--;
159        // CkPrintf("neighbor = %d \n", neighbors);
160     }
161 void Jacobi::pup(PUP::er &p){
162                 
163                 // calling parent's pup
164                 CBase_Jacobi::pup(p);
165                 __sdag_pup(p);
166                 // pupping properties of this class
167                 p | iterations;
168                 p | imsg;
169                 // if unpacking, allocate the memory space
170                 if(p.isUnpacking()){
171                         temperature = new double[(blockDimX+2) * (blockDimY+2) * (blockDimZ+2)];
172                 }
173                 // pupping the arrays
174                 p((char *)temperature, (blockDimX+2) * (blockDimY+2) * (blockDimZ+2) * sizeof(double));
175         }
176     // Send ghost faces to the six neighbors
177     void Jacobi::begin_iteration(void) {
178       iterations++;
179
180       // Copy different faces into messages
181        ghostMsg *leftMsg ;
182        ghostMsg *rightMsg; 
183        ghostMsg *topMsg ;
184        ghostMsg *bottomMsg;
185        ghostMsg *frontMsg ;
186        ghostMsg *backMsg ;
187      
188        
189       if(thisIndex.x-1 >= 0)
190       {
191           leftMsg = new (blockDimY*blockDimZ) ghostMsg(RIGHT, blockDimY, blockDimZ);
192           for(int j=0; j<blockDimY; ++j) 
193               for(int k=0; k<blockDimZ; ++k) {
194                   leftMsg->gh[k*blockDimY+j] = temperature[index(1, j+1, k+1)];
195               }
196           CkSetRefNum(leftMsg, iterations);
197           thisProxy(wrap_x(thisIndex.x-1), thisIndex.y, thisIndex.z).receiveGhosts(leftMsg);
198       }
199
200       if(thisIndex.x+1 <num_chare_x) 
201       {
202           rightMsg = new (blockDimY*blockDimZ) ghostMsg(LEFT, blockDimY, blockDimZ);
203           for(int j=0; j<blockDimY; ++j) 
204               for(int k=0; k<blockDimZ; ++k) {
205                   rightMsg->gh[k*blockDimY+j] = temperature[index(blockDimX, j+1, k+1)];}
206           CkSetRefNum(rightMsg, iterations);
207           thisProxy(wrap_x(thisIndex.x+1), thisIndex.y, thisIndex.z).receiveGhosts(rightMsg);
208       }
209
210       if(thisIndex.y-1 >= 0)
211       {
212           topMsg = new (blockDimX*blockDimZ) ghostMsg(BOTTOM, blockDimX, blockDimZ);
213           for(int i=0; i<blockDimX; ++i) 
214               for(int k=0; k<blockDimZ; ++k) {
215                   topMsg->gh[k*blockDimX+i] = temperature[index(i+1, 1, k+1)];
216               }
217
218           CkSetRefNum(topMsg, iterations);
219           thisProxy(thisIndex.x, wrap_y(thisIndex.y-1), thisIndex.z).receiveGhosts(topMsg);
220       }
221
222       if(thisIndex.y+1 <num_chare_y)
223       {
224           bottomMsg = new (blockDimX*blockDimZ) ghostMsg(TOP, blockDimX, blockDimZ);
225           for(int i=0; i<blockDimX; ++i) 
226           for(int k=0; k<blockDimZ; ++k) {
227               bottomMsg->gh[k*blockDimX+i] = temperature[index(i+1, blockDimY, k+1)];
228           }
229           CkSetRefNum(bottomMsg, iterations);
230           thisProxy(thisIndex.x, wrap_y(thisIndex.y+1), thisIndex.z).receiveGhosts(bottomMsg);
231       }
232
233       if(thisIndex.z-1 >= 0)
234       {
235           frontMsg = new (blockDimX*blockDimY) ghostMsg(BACK, blockDimX, blockDimY);
236           for(int i=0; i<blockDimX; ++i) 
237               for(int j=0; j<blockDimY; ++j) {
238                   frontMsg->gh[j*blockDimX+i] = temperature[index(i+1, j+1, 1)];
239               }
240           CkSetRefNum(frontMsg, iterations);
241           thisProxy(thisIndex.x, thisIndex.y, wrap_z(thisIndex.z-1)).receiveGhosts(frontMsg);
242       }
243       
244       if(thisIndex.z+1 <num_chare_z)
245       {
246           backMsg = new (blockDimX*blockDimY) ghostMsg(FRONT, blockDimX, blockDimY);
247           for(int i=0; i<blockDimX; ++i) 
248               for(int j=0; j<blockDimY; ++j) {
249                   backMsg->gh[j*blockDimX+i] = temperature[index(i+1, j+1, blockDimZ)];
250               }
251           CkSetRefNum(backMsg, iterations);
252           thisProxy(thisIndex.x, thisIndex.y, wrap_z(thisIndex.z+1)).receiveGhosts(backMsg);
253       }
254     }
255
256     void Jacobi::processGhosts(ghostMsg *gmsg) {
257       int height = gmsg->height;
258       int width = gmsg->width;
259       switch(gmsg->dir) {
260       case LEFT:
261           for(int j=0; j<height; ++j) 
262               for(int k=0; k<width; ++k) {
263                   temperature[index(0, j+1, k+1)] = gmsg->gh[k*height+j];
264               }
265           break;
266       case RIGHT:
267           for(int j=0; j<height; ++j) 
268               for(int k=0; k<width; ++k) {
269                   temperature[index(blockDimX+1, j+1, k+1)] = gmsg->gh[k*height+j];
270               }
271           break;
272       case TOP:
273           for(int i=0; i<height; ++i) 
274               for(int k=0; k<width; ++k) {
275                   temperature[index(i+1, 0, k+1)] = gmsg->gh[k*height+i];
276               }
277           break;
278       case BOTTOM:
279           for(int i=0; i<height; ++i) 
280               for(int k=0; k<width; ++k) {
281                   temperature[index(i+1, blockDimY+1, k+1)] = gmsg->gh[k*height+i];
282               }
283           break;
284       case FRONT:
285           for(int i=0; i<height; ++i) 
286               for(int j=0; j<width; ++j) {
287                   temperature[index(i+1, j+1,0)] = gmsg->gh[j*height+i];
288               }
289           break;
290       case BACK:
291           for(int i=0; i<height; ++i) 
292               for(int j=0; j<width; ++j) {
293                   temperature[index(i+1, j+1, blockDimZ+1)] = gmsg->gh[j*height+i];
294               }
295           break;
296       default:
297           CkAbort("ERROR\n");
298       }
299
300       delete gmsg;
301     }
302
303
304         void Jacobi::check_and_compute() {
305         double maxDiff;
306 #if CKLOOP
307         CkLoop_Parallelize(doCalc, 1,  (void*)this, 4*CkMyNodeSize(), 1, blockDimX, 1, &maxDiff, CKLOOP_DOUBLE_MAX);
308         CkLoop_Parallelize(doUpdate, 1,  (void*)this, CkMyNodeSize(), 1, blockDimX, 1);
309 #else
310                 maxDiff = compute_kernel();
311 #endif
312                 // calculate error
313                 // not being done right now since we are doing a fixed no. of iterations
314         //CkPrintf(" iteration %d [%d,%d,%d] max=%e\n", iterations, thisIndex.x, thisIndex.y, thisIndex.z, maxDiff);
315         if (iterations % CKP_FREQ == 0 || iterations > MAX_ITER){
316             CkCallback cb(CkReductionTarget(Main, converge), mainProxy);
317             contribute(sizeof(double), &maxDiff, CkReduction::max_double, cb);
318         } else {
319             CkCallback cb(CkReductionTarget(Main, converge), mainProxy);
320             contribute(sizeof(double), &maxDiff, CkReduction::max_double, cb);
321         }
322     }
323
324         void Jacobi::ResumeFromSync(){
325                 doStep();
326         }
327
328
329 #if JACOBI
330     double Jacobi::compute_kernel() {     //Gauss-Siedal compute
331         int i;
332
333         int x_c = arrayDimX/2/blockDimX;
334         int y_c = arrayDimY/2/blockDimY;
335         int z_c = arrayDimZ/2/blockDimZ;
336         int x_p = (arrayDimX/2)%blockDimX+1;
337         int y_p = (arrayDimY/2)%blockDimY+1;
338         int z_p = (arrayDimZ/2)%blockDimZ+1;
339 #ifdef JACOBI_OPENMP
340         double *maxDiffSub = new double[threadNums];
341         for(i=0; i<threadNums; i++)
342             maxDiffSub[i] = 0;
343 #endif
344         double maxDiff=0;
345   //#pragma omp parallel for shared(temperature, new_temperature, maxDiff) 
346   #pragma omp parallel  
347         {  
348         double t1 = CkWallTimer();
349         #pragma omp for 
350         for(i=1; i<blockDimX+1; ++i) {
351 #ifdef JACOBI_OPENMP
352             int tid = omp_get_thread_num();
353           //printf("[%d] did  %d iteration out of %d \n", omp_get_thread_num(), i, blockDimX+1); 
354 #endif
355           for(int j=1; j<blockDimY+1; ++j) {
356               for(int k=1; k<blockDimZ+1; ++k) {
357         
358                   if(thisIndex.x == x_c && y_c == thisIndex.y && z_c == thisIndex.z && i==x_p && j==y_p && k==z_p)
359                       continue;
360                   double update = 0;//temperature[index(i, j, k)];
361                   update += temperature[index(i+1, j, k)];
362                   update += temperature[index(i-1, j, k)];
363                   update += temperature[index(i, j+1, k)];
364                   update += temperature[index(i, j-1, k)];
365                   update += temperature[index(i, j, k+1)];
366                   update += temperature[index(i, j, k-1)];
367                   update /= 6;
368                  
369                   double diff = temperature[index(i, j, k)] - update;
370                   if(diff<0) diff = -1*diff;
371 #ifdef JACOBI_OPENMP
372                   if(diff>maxDiffSub[tid]) maxDiffSub[tid] = diff;
373 #else
374                   if(diff>maxDiff) maxDiff = diff;
375 #endif
376                   new_temperature[index(i, j, k)] = update;
377               }
378           }
379         }
380         //printf(" timecost [%d out of %d ]  %f\n", omp_get_thread_num(), omp_get_num_threads(), (CkWallTimer()-t1)*1e6);
381       }
382 #ifdef JACOBI_OPENMP
383         maxDiff= maxDiffSub[0];
384         for(i=1; i<threadNums; i++)
385             if(maxDiff < maxDiffSub[i]) maxDiff = maxDiffSub[i];
386 #endif
387   #pragma omp parallel  
388         {  
389         #pragma omp for 
390         for(i=1; i<blockDimX+1; ++i) {
391             for(int j=1; j<blockDimY+1; ++j) {
392                 for(int k=1; k<blockDimZ+1; ++k) {
393                   if(thisIndex.x == x_c && y_c == thisIndex.y && z_c == thisIndex.z && i==x_p && j==y_p && k==z_p)
394                       continue;
395                     temperature[index(i, j, k)] = new_temperature[index(i, j, k)];
396                 }
397             }
398         }
399       }
400 #if     PRINT_DEBUG
401         CkPrintf("\n {%d:%d:%d  %d:%d:%d  %d:%d:%d %f}", thisIndex_x, thisIndex_y, thisIndex_z, x_c, y_c, z_c, x_p, y_p, z_p, maxDiff);
402         for(i=1; i<blockDimX+1; ++i) {
403           for(int j=1; j<blockDimY+1; ++j) {
404               for(int k=1; k<blockDimZ+1; ++k) {
405                   CkPrintf("([%d:%d:%d]: %.2f )", i, j, k, temperature[index(i, j, k)]); 
406               }
407           }
408         }
409         CkPrintf("\n\n");
410 #endif
411         return maxDiff;
412     }
413
414 #else
415
416     // Check to see if we have received all neighbor values yet
417     // If all neighbor values have been received, we update our values and proceed
418     double Jacobi::compute_kernel() {     //Gauss-Siedal compute
419         int i=1;
420         double maxDiff=0;
421         double tmp, update=0;
422         int count=1;
423         int gg;
424
425         int x_c = arrayDimX/2/blockDimX;
426         int y_c = arrayDimY/2/blockDimY;
427         int z_c = arrayDimZ/2/blockDimZ;
428         int x_p = (arrayDimX/2)%blockDimX+1;
429         int y_p = (arrayDimY/2)%blockDimY+1;
430         int z_p = (arrayDimZ/2)%blockDimZ+1;
431             //CkPrintf("\n\n START %d [%d:%d:%d] \n", iterations, thisIndex.x, thisIndex.y, thisIndex.z);
432   #pragma omp parallel for schedule(dynamic) 
433         for(gg=0; gg<gaussIter; gg++){
434             maxDiff = 0;
435             for(i=1; i<blockDimX+1; ++i) {
436           //printf("[%d] did  %d iteration out of %d \n", omp_get_thread_num(), i, blockDimX+1); 
437           for(int j=1; j<blockDimY+1; ++j) {
438               for(int k=1; k<blockDimZ+1; ++k) {
439                   if(thisIndex.x == x_c && y_c == thisIndex.y && z_c == thisIndex.z && i==x_p && j==y_p && k==z_p)
440                       continue;
441                   update = 0;//temperature[index(i, j, k)];
442                   update += temperature[index(i+1, j, k)];
443                   update += temperature[index(i-1, j, k)];
444                   update += temperature[index(i, j+1, k)];
445                   update += temperature[index(i, j-1, k)];
446                   update += temperature[index(i, j, k+1)];
447                   update += temperature[index(i, j, k-1)];
448                   update /= 6;
449                  
450                   double diff = temperature[index(i, j, k)] - update;
451                   if(diff<0) diff = -1*diff;
452                   if(diff>maxDiff) maxDiff = diff;
453                   temperature[index(i, j, k)] = update;
454               }
455           }
456         }
457         }
458 #if     PRINT_DEBUG
459         CkPrintf("[%d:%d:%d] iteration=%d max:%.3f ==== ", thisIndex.x, thisIndex.y, thisIndex.z, iterations, maxDiff); 
460         for(i=0; i<blockDimX+2; ++i) {
461           for(int j=0; j<blockDimY+2; ++j) {
462               for(int k=0; k<blockDimZ+2; ++k) {
463                   CkPrintf("([%d:%d:%d]: %.2f )", i, j, k, temperature[index(i, j, k)]); 
464               }
465           }
466         }
467         CkPrintf("\n\n");
468 #endif
469         return maxDiff;
470     }
471 #endif
472     // Enforce some boundary conditions
473     void Jacobi::constrainBC() {
474       
475         // Heat right, left
476         if(thisIndex.x == 0 || thisIndex.x == num_chare_x-1)
477             for(int j=0; j<blockDimY+2; ++j)
478                 for(int k=0; k<blockDimZ+2; ++k)
479                 {   
480                     if(thisIndex.x == 0)
481                         temperature[index(0, j, k)] = LOW_VALUE;
482                     else
483                         temperature[index(blockDimX+1, j, k)] = LOW_VALUE;
484                 }
485         if(thisIndex.y == 0 || thisIndex.y == num_chare_y-1)
486             for(int j=0; j<blockDimX+2; ++j)
487                 for(int k=0; k<blockDimZ+2; ++k)
488                 {   
489                     if(thisIndex.y == 0)
490                         temperature[index(j,0, k)] = LOW_VALUE;
491                     else
492                         temperature[index(j, blockDimY+1, k)] = LOW_VALUE;
493                 }
494         if(thisIndex.z == 0 || thisIndex.z == num_chare_z-1)
495             for(int j=0; j<blockDimX+2; ++j)
496                 for(int k=0; k<blockDimY+2; ++k)
497                 {   
498                     if(thisIndex.z == 0)
499                         temperature[index(j, k, 0)] = LOW_VALUE;
500                     else
501                         temperature[index(j, k, blockDimZ+1)] = LOW_VALUE;
502                 }
503         int x_c = arrayDimX/2/blockDimX;
504         int y_c = arrayDimY/2/blockDimY;
505         int z_c = arrayDimZ/2/blockDimZ;
506         int x_p = (arrayDimX/2)%blockDimX+1;
507         int y_p = (arrayDimY/2)%blockDimY+1;
508         int z_p = (arrayDimZ/2)%blockDimZ+1;
509         if(thisIndex.x == x_c && y_c == thisIndex.y && z_c == thisIndex.z)
510         {
511             temperature[index(x_p, y_p, z_p)] = HIGH_VALUE;
512         }
513
514     }
515
516     void Jacobi::print()
517     {
518         CkPrintf(" print %d:%d:%d ", thisIndex.x, thisIndex.y, thisIndex.z); 
519           //printf("[%d] did  %d iteration out of %d \n", omp_get_thread_num(), i, blockDimX+1); 
520           for(int i=1; i<blockDimX+1;++i){CkPrintf("==\n");
521               for(int j=1;j<blockDimY+1;++j)
522               for(int k=1; k<blockDimZ+1; ++k) {
523                   CkPrintf(" %e ",  temperature[index(i, j, k)]) ;
524               }
525               CkPrintf("------\n-----\n");
526           }
527               //contribute(0, 0, CkReduction::concat, CkCallback(CkIndex_Main::report(), mainProxy));
528     }
529
530 class JacobiNodeMap : public CkArrayMap {
531
532 public:
533     JacobiNodeMap() {}
534     ~JacobiNodeMap() { 
535     }
536
537     int procNum(int, const CkArrayIndex &idx) {
538       int *index = (int *)idx.data();
539       return (CkMyNodeSize() * (index[0]*num_chare_x*num_chare_y + index[1]*num_chare_y + index[2]))%CkNumPes(); 
540     }
541
542 };
543
544 class JacobiMap : public CkArrayMap {
545   public:
546     int X, Y, Z;
547     int *mapping;
548
549     JacobiMap(int x, int y, int z) {
550       X = x; Y = y; Z = z;
551       mapping = new int[X*Y*Z];
552
553       // we are assuming that the no. of chares in each dimension is a 
554       // multiple of the torus dimension
555
556       TopoManager tmgr;
557       int dimNX, dimNY, dimNZ, dimNT;
558
559       dimNX = tmgr.getDimNX();
560       dimNY = tmgr.getDimNY();
561       dimNZ = tmgr.getDimNZ();
562       dimNT = tmgr.getDimNT();
563
564       // we are assuming that the no. of chares in each dimension is a 
565       // multiple of the torus dimension
566       int numCharesPerPe = X*Y*Z/CkNumPes();
567
568       int numCharesPerPeX = X / dimNX;
569       int numCharesPerPeY = Y / dimNY;
570       int numCharesPerPeZ = Z / dimNZ;
571       int pe = 0, pes = CkNumPes();
572
573 #if USE_BLOCK_RNDMAP
574       int used[pes];
575       for(int i=0; i<pes; i++)
576         used[i] = 0;
577 #endif
578
579       if(dimNT < 2) {   // one core per node
580           if(CkMyPe()==0) CkPrintf("%d %d %d %d : %d %d %d \n", dimNX, dimNY, dimNZ, dimNT, numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ); 
581           for(int i=0; i<dimNX; i++)
582               for(int j=0; j<dimNY; j++)
583                   for(int k=0; k<dimNZ; k++)
584                   {
585 #if USE_BLOCK_RNDMAP
586                       pe = myrand(pes); 
587                       while(used[pe]!=0) {
588                           pe = myrand(pes); 
589                       }
590                       used[pe] = 1;
591 #endif
592
593                       for(int ci=i*numCharesPerPeX; ci<(i+1)*numCharesPerPeX; ci++)
594                           for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
595                               for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
596 #if USE_TOPOMAP
597                                   mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k);
598 #elif USE_BLOCK_RNDMAP
599                                   mapping[ci*Y*Z + cj*Z + ck] = pe;
600 #endif
601                               }
602                   }
603       } else {          // multiple cores per node
604           // In this case, we split the chares in the X dimension among the
605           // // cores on the same node. The strange thing I figured out is that
606           // // doing this in the Z dimension is not as good.
607           // numCharesPerPeX /= dimNT;
608           if(CkMyPe()==0) CkPrintf("%d %d %d %d : %d %d %d \n", dimNX, dimNY, dimNZ, dimNT, numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ);
609
610           for(int i=0; i<dimNX; i++)
611               for(int j=0; j<dimNY; j++)
612                   for(int k=0; k<dimNZ; k++)
613                       for(int l=0; l<dimNT; l++)
614                           for(int ci=(dimNT*i+l)*numCharesPerPeX; ci<(dimNT*i+l+1)*numCharesPerPeX; ci++)
615                               for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
616                                   for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
617                                       mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k, l);
618                                   }
619       } // end of if
620
621       if(CkMyPe() == 0) CkPrintf("Map generated ... \n");
622     }
623
624     ~JacobiMap() { 
625         delete [] mapping;
626     }
627
628     int procNum(int, const CkArrayIndex &idx) {
629         int *index = (int *)idx.data();
630         return mapping[index[0]*Y*Z + index[1]*Z + index[2]]; 
631     }
632 };
633
634 class TraceControl : public Group 
635 {
636 public:
637     TraceControl() { omp_set_num_threads(threadNums);}
638
639     void startTrace() { traceBegin(); }
640
641     void endTrace() { traceEnd(); }
642 };
643