f6962b0b02974e45933155f0bf7eb6fc8281a6df
[charm.git] / examples / charm++ / ckdirect / stencil3d / stencil3d.C
1 #include "ckdirect.h" 
2
3 #if defined ARR_CHECK && !defined CMK_PARANOID
4 #define CMK_PARANOID
5 #endif
6
7 #include "stencil3d.decl.h"
8 #define OOB 9999999999.0
9 #define NBRS 6
10
11 CProxy_Main mainProxy;
12
13 int dimx, dimy, dimz;
14 int charesx, charesy, charesz;
15 int num_iterations;
16
17 class Main : public CBase_Main {
18   
19   double startSetup, start, end;
20   public:
21
22   Main(CkArgMsg *msg){
23     mainProxy = thisProxy;
24
25     if(msg->argc == 8){
26       dimx = atoi(msg->argv[1]);
27       dimy = atoi(msg->argv[2]);
28       dimz = atoi(msg->argv[3]);
29       charesx = atoi(msg->argv[4]);
30       charesy = atoi(msg->argv[5]);
31       charesz = atoi(msg->argv[6]);
32       num_iterations = atoi(msg->argv[7]);
33     }
34     else{
35       CkAbort("arguments: dimx dimy dimz charesx charesy charesz iter\n");
36     }
37     
38     delete msg;
39     CProxy_StencilPoint array = 
40         CProxy_StencilPoint::ckNew(charesx*charesy*charesz);
41     CkPrintf("main: dim: %d,%d,%d charedim: %d,%d,%d num_iterations: %d\n", dimx, dimy, dimz, charesx, charesy, charesz, num_iterations);
42     startSetup = CmiWallTimer();
43 #ifdef USE_CKDIRECT
44     array.setupChannels();
45 #else
46     array.sendData();
47 #endif
48   }
49
50   void doneSetup(){
51     start = CmiWallTimer();
52   }
53   
54   void done(CkReductionMsg *msg){
55     end = CmiWallTimer();
56     CkPrintf("Total time: %f sec\n", end-startSetup);
57     CkPrintf("Computation time per iteration: %f sec\n", (end-start)/(num_iterations-1));
58     CkPrintf("Total computations: %f\n", *(double *)msg->getData());
59     CkExit();
60   }
61 };
62
63 class StencilMsg : public CMessage_StencilMsg {
64   public:
65   float *arr;
66   int size;
67   int which;
68 };
69
70 class StencilPoint : public CBase_StencilPoint{
71   // this chare's place in the array of chares
72   int row, col, plane;
73   int whichLocal;
74
75   // this chare has a domain with dimensions 
76   int rows, cols, planes;
77   int payload[3];
78   
79   int iterations;
80   
81     /* Number of elements whose new values have been computed
82      */
83     double eltsComp;
84 //#ifdef USE_CKDIRECT
85   /* 0: left, 1: right, 2: top, 3: bottom*/
86   infiDirectUserHandle shandles[2][NBRS];
87   infiDirectUserHandle rhandles[NBRS];
88
89 //#endif
90 #ifdef ARR_CHECK
91 #ifdef USE_MESSAGES
92   float *recvBuf[NBRS];
93 #else
94   CkVec<float> recvBuf[NBRS];
95 #endif
96   CkVec<float> sendBuf[2][NBRS];
97   CkVec<float> localChunk[2];
98 #else
99   /* send buffers */
100   float *sendBuf[2][NBRS]; /* rectangle */
101   /* receive buffers */
102   float *recvBuf[NBRS]; /* rectangle */
103   float *localChunk[2]; /* cuboid */
104 #endif
105 #ifdef USE_MESSAGES
106   StencilMsg *recvMsgs[NBRS];
107 #endif
108
109   int remainingBufs;
110   int remainingChannels;
111   
112   public:
113   StencilPoint(CkMigrateMessage *){}
114   StencilPoint(){
115     remainingBufs = NBRS;
116     remainingChannels = NBRS;
117     whichLocal= 0;
118
119     iterations = 0;
120     eltsComp = 0;
121     
122     plane = thisIndex/(charesx*charesy);
123     row = (thisIndex/charesx)%charesy;
124     col = thisIndex%charesx;
125     
126     rows = dimy/charesy;
127     cols = dimx/charesx;
128     planes = dimz/charesz;
129
130     payload[0] = rows*planes; // x
131     payload[1] = cols*planes; // y
132     payload[2] = rows*cols;   // z
133     
134     if(thisIndex == 0){
135       CkPrintf("rows: %d cols: %d planes: %d\n", rows, cols, planes);
136     }
137     // allocate memory
138 #ifdef ARR_CHECK
139     sendBuf[0][0].resize(payload[0]);
140     sendBuf[1][0].resize(payload[0]);
141     sendBuf[0][1].resize(payload[0]);
142     sendBuf[1][1].resize(payload[0]);
143
144     sendBuf[0][2].resize(payload[1]);
145     sendBuf[1][2].resize(payload[1]);
146     sendBuf[0][3].resize(payload[1]);
147     sendBuf[1][3].resize(payload[1]);
148     
149     sendBuf[0][4].resize(payload[2]);
150     sendBuf[1][4].resize(payload[2]);
151     sendBuf[0][5].resize(payload[2]);
152     sendBuf[1][5].resize(payload[2]);
153 #else
154     sendBuf[0][0] = new float [payload[0]];
155     sendBuf[1][0] = new float [payload[0]];
156     sendBuf[0][1] = new float [payload[0]];
157     sendBuf[1][1] = new float [payload[0]];
158
159     sendBuf[0][2] = new float [payload[1]];
160     sendBuf[1][2] = new float [payload[1]];
161     sendBuf[0][3] = new float [payload[1]];
162     sendBuf[1][3] = new float [payload[1]];
163
164     sendBuf[0][4] = new float [payload[2]];
165     sendBuf[1][4] = new float [payload[2]];
166     sendBuf[0][5] = new float [payload[2]];
167     sendBuf[1][5] = new float [payload[2]];
168 #endif
169
170 #ifndef USE_MESSAGES
171 #ifdef ARR_CHECK
172     recvBuf[0].resize(payload[0]);
173     recvBuf[1].resize(payload[0]);
174     recvBuf[2].resize(payload[1]);
175     recvBuf[3].resize(payload[1]);
176     recvBuf[4].resize(payload[2]);
177     recvBuf[5].resize(payload[2]);
178 #else
179     recvBuf[0] = new float[payload[0]];
180     recvBuf[1] = new float[payload[0]];
181     recvBuf[2] = new float[payload[1]];
182     recvBuf[3] = new float[payload[1]];
183     recvBuf[4] = new float[payload[2]];
184     recvBuf[5] = new float[payload[2]];
185 #endif
186 #else
187     for(int i = 0; i < NBRS; i++)
188       recvBuf[i] = 0;
189 #endif
190     
191 #ifdef ARR_CHECK
192     localChunk[0].resize((rows-2)*(cols-2)*(planes-2));
193     localChunk[1].resize((rows-2)*(cols-2)*(planes-2));
194 #else
195     localChunk[0] = new float[(rows-2)*(cols-2)*(planes-2)];
196     localChunk[1] = new float[(rows-2)*(cols-2)*(planes-2)];
197 #endif
198     // initialize
199     if(plane == 0){// face is held at 0
200       for(int i = 0; i < rows*cols; i++)
201         // first index says which version of the double buffer is to be used
202         // second index, the face in question
203         sendBuf[0][5][i] = 1.0;
204     }
205     else{
206       for(int i = 0; i < rows*cols; i++)
207         // first index says which version of the double buffer is to be used
208         // second index, the face in question
209         sendBuf[0][5][i] = 0.0;
210     }
211
212     for(int i = 0; i < rows*cols; i++)
213       sendBuf[0][4][i] = 0.0;
214       
215     for(int i = 0; i < rows*planes; i++)
216       sendBuf[0][1][i] = 0.0;
217       
218     for(int i = 0; i < rows*planes; i++)
219       sendBuf[0][0][i] = 0.0;
220       
221     for(int i = 0; i < cols*planes; i++)
222       sendBuf[0][3][i] = 0.0;
223       
224     for(int i = 0; i < cols*planes; i++)
225       sendBuf[0][2][i] = 0.0;
226       
227     
228     // now the rest of the cube
229     for(int j = 0; j < (rows-2)*(cols-2)*(planes-2); j++){
230       localChunk[0][j] = 0.0; 
231     }
232   }
233   
234   ~StencilPoint(){
235     for(int i = 0; i < NBRS; i++){
236 #ifdef ARR_CHECK
237       sendBuf[0][i].free();
238       sendBuf[1][i].free();
239 #else
240       delete [] sendBuf[0][i];
241       delete [] sendBuf[1][i];
242 #endif
243 #ifndef USE_MESSAGES
244 #ifdef ARR_CHECK
245       recvBuf[i].free();
246 #else
247       delete [] recvBuf[i];
248 #endif
249 #endif
250     }
251
252 #ifdef ARR_CHECK
253     localChunk[0].free();
254     localChunk[1].free();
255 #else
256     delete [] localChunk[0];
257     delete [] localChunk[1];
258 #endif
259   }
260
261 #define lin(c,r,p) ((p)*charesx*charesy+(r)*charesx+(c))
262 //#ifdef USE_CKDIRECT
263   void setupChannels(){
264     int node = CkMyNode();
265 #ifdef STENCIL2D_VERBOSE
266     /*
267     CkPrintf("(%d,%d): notify (%d,%d):l, (%d,%d):r, (%d,%d):t, (%d,%d):b\n", 
268               row, col,
269               row, (col+1)%num_cols,
270               row, (col-1+num_cols)%num_cols,
271               (row+1)%num_rows, col,
272               (row-1+num_rows)%num_rows, col);
273     */
274 #endif
275
276     thisProxy[lin((col+1)%charesx, (row), (plane))].notify(node,0,thisIndex);
277     thisProxy[lin((col-1+charesx)%charesx, (row), (plane))].notify(node,1,thisIndex);
278     thisProxy[lin((col), (row+1)%charesy, (plane))].notify(node,2,thisIndex);
279     thisProxy[lin((col), (row-1+charesy)%charesy, (plane))].notify(node,3,thisIndex);
280     thisProxy[lin((col), (row), (plane+1)%charesz)].notify(node,4,thisIndex);
281     thisProxy[lin((col), (row), (plane-1+charesz)%charesz)].notify(node,5,thisIndex);
282   }
283
284   void notify(int node, int which, CkIndex1D whoSent){
285     // make handle
286 #ifdef STENCIL2D_VERBOSE
287     CkPrintf("(%d,%d,%d): (%d) is %d to me\n", row, col, plane, whoSent, which);
288 #endif
289 #if defined ARR_CHECK && !defined USE_MESSAGES
290     rhandles[which] = CkDirect_createHandle(node, recvBuf[which].getVec(), payload[which/2]*sizeof(float), StencilPoint::callbackWrapper, (void *)this, OOB);
291 #else
292     rhandles[which] = CkDirect_createHandle(node, recvBuf[which], payload[which/2]*sizeof(float), StencilPoint::callbackWrapper, (void *)this, OOB);
293 #endif
294     thisProxy[whoSent].recvHandle(rhandles[which], which);
295   }
296
297   void recvHandle(infiDirectUserHandle handle, int which){
298     // this causes an inversion of the buffer indices:
299     // left send buffer is 1, right 0, top 3, bottom 2
300     shandles[0][which] = handle;
301     shandles[1][which] = handle;
302     
303 #ifdef ARR_CHECK
304     CkDirect_assocLocalBuffer(&shandles[0][which], sendBuf[0][which].getVec(), payload[which/2]*sizeof(float));
305     CkDirect_assocLocalBuffer(&shandles[1][which], sendBuf[1][which].getVec(), payload[which/2]*sizeof(float));
306 #else
307     CkDirect_assocLocalBuffer(&shandles[0][which], sendBuf[0][which], payload[which/2]*sizeof(float));
308     CkDirect_assocLocalBuffer(&shandles[1][which], sendBuf[1][which], payload[which/2]*sizeof(float));
309 #endif
310     remainingChannels--;
311     if(remainingChannels == 0){
312       // start 
313 #ifdef STENCIL2D_VERBOSE
314       //CkPrintf("(%d,%d,%d): recvd all handles, start put\n", x,y,z);
315 #endif
316       sendData();
317     }
318   }
319
320   void recvBuffer(){
321     remainingBufs--;
322 #ifdef STENCIL2D_VERBOSE
323     //CkPrintf("(%d,%d,%d): remainingBufs: %d\n", x,y,z,remainingBufs);
324 #endif
325     if(remainingBufs == 0){
326 #ifdef STENCIL2D_VERBOSE
327       //CkPrintf("(%d,%d,%d): recvd all buffers, start compute(%d)\n", x,y,z, iterations);
328 #endif
329       remainingBufs = NBRS;
330       thisProxy[thisIndex].compute();
331     }
332   }
333   
334   static void callbackWrapper(void *arg){
335     StencilPoint *obj = (StencilPoint *)arg;
336     obj->recvBuffer();
337   }
338 //#else   // don't USE_CKDIRECT
339   void recvBuffer(float *array, int size, int whichBuf){
340     remainingBufs--;
341 #if defined ARR_CHECK && !defined USE_MESSAGES
342     memcpy(recvBuf[whichBuf].getVec(), array, size*sizeof(float));
343 #else
344     memcpy(recvBuf[whichBuf], array, size*sizeof(float));
345 #endif
346     /*
347     for(int i = 0; i < size; i++){
348       recvBuf[whichBuf][i] = array[i];
349     }
350     */
351     
352     if(remainingBufs == 0){
353       remainingBufs = NBRS;
354       compute();
355     }
356   }
357
358   void recvBufferMsg(StencilMsg *msg){
359 #ifdef USE_MESSAGES
360     remainingBufs--;
361     recvBuf[msg->which] = msg->arr;
362     recvMsgs[msg->which] = msg;
363     
364     if(remainingBufs == 0){
365       remainingBufs = NBRS;
366       compute();
367       for(int i = 0; i < NBRS; i++){
368         delete recvMsgs[i];
369         recvMsgs[i] = 0;
370       }
371     }
372  
373 #else
374       CmiAbort("Messages not used, don't call\n");
375 #endif
376       
377   }
378 //#endif // end ifdef USE_CKDIRECT
379   
380
381 // to access localChunk interior (i.e. interior of interior) 
382 #define small(r,c,p) (p)*(rows-2)*(cols-2)+(r)*(cols-2)+(c)
383 // to access sendBufs 
384 #define facex(r,c) (r)*planes+(c)
385 #define facey(r,c) (r)*planes+(c)
386 #define facez(r,c) (r)*cols+(c)
387 // indices into sendBufs, after inversion
388 #define xp 1
389 #define xn 0
390 #define yp 3
391 #define yn 2
392 #define zp 5
393 #define zn 4
394
395 // indices into recvBufs, without inversion
396 #define XP 0
397 #define XN 1
398 #define YP 2
399 #define YN 3
400 #define ZP 4
401 #define ZN 5
402
403   void compute(){
404     // 1. do actual work
405     int newLocal= 1-whichLocal;
406     
407     // interior of interior: uses only localChunk
408     for(int k = 1; k < planes-3; k++){
409       for(int i = 1; i < rows-3; i++){
410         for(int j = 1; j < cols-3; j++){
411           localChunk[newLocal][small(i,j,k)] = (
412                               localChunk[whichLocal][small(i,j,k)]+
413                               localChunk[whichLocal][small(i+1,j,k)]+
414                               localChunk[whichLocal][small(i-1,j,k)]+
415                               localChunk[whichLocal][small(i,j+1,k)]+
416                               localChunk[whichLocal][small(i,j-1,k)]+
417                               localChunk[whichLocal][small(i,j,k+1)]+
418                               localChunk[whichLocal][small(i,j,k-1)]
419                             )/7;
420           eltsComp+=1;
421         }
422       }
423     }
424
425     // 8 corners of interior: uses localChunk and sendBuf's
426     //0.
427     localChunk[newLocal][small(0,0,0)] = (
428                             localChunk[whichLocal][small(0,0,0)]+
429                             localChunk[whichLocal][small(1,0,0)]+
430                             localChunk[whichLocal][small(0,1,0)]+
431                             localChunk[whichLocal][small(0,0,1)]+
432                             sendBuf[whichLocal][zp][facez(1,1)]+
433                             sendBuf[whichLocal][xp][facex(1,1)]+
434                             sendBuf[whichLocal][yp][facey(1,1)]
435                             )/7;
436
437     //1.
438     localChunk[newLocal][small(0,cols-3,0)] = (
439                             localChunk[whichLocal][small(0,cols-3,0)]+
440                             localChunk[whichLocal][small(0,cols-4,0)]+
441                             localChunk[whichLocal][small(1,cols-3,0)]+
442                             localChunk[whichLocal][small(0,cols-3,1)]+
443                             sendBuf[whichLocal][zp][facez(1,cols-2)]+
444                             sendBuf[whichLocal][yp][facey(cols-2,1)]+
445                             sendBuf[whichLocal][xn][facex(1,1)]
446                             )/7;
447
448     //2.
449     localChunk[newLocal][small(rows-3,0,0)] = (
450                             localChunk[whichLocal][small(rows-3,0,0)]+
451                             localChunk[whichLocal][small(rows-4,0,0)]+
452                             localChunk[whichLocal][small(rows-3,1,0)]+
453                             localChunk[whichLocal][small(rows-3,0,1)]+
454                             sendBuf[whichLocal][zp][facez(rows-2,1)]+
455                             sendBuf[whichLocal][yn][facey(1,1)]+
456                             sendBuf[whichLocal][xp][facex(rows-2,1)]
457                             )/7;
458
459     //3.
460     localChunk[newLocal][small(rows-3,cols-3,0)] = (
461                             localChunk[whichLocal][small(rows-3,cols-3,0)]+
462                             localChunk[whichLocal][small(rows-3,cols-4,0)]+
463                             localChunk[whichLocal][small(rows-4,cols-3,0)]+
464                             localChunk[whichLocal][small(rows-3,cols-3,1)]+
465                             sendBuf[whichLocal][zp][facez(rows-2,cols-2)]+
466                             sendBuf[whichLocal][yn][facey(cols-2,1)]+
467                             sendBuf[whichLocal][xn][facex(rows-2,1)]
468                             )/7;
469     
470     //4.
471     localChunk[newLocal][small(0,0,planes-3)] = (
472                             localChunk[whichLocal][small(0,0,planes-3)]+
473                             localChunk[whichLocal][small(0,0,planes-4)]+
474                             localChunk[whichLocal][small(0,1,planes-3)]+
475                             localChunk[whichLocal][small(1,0,planes-3)]+
476                             sendBuf[whichLocal][zn][facez(1,1)]+
477                             sendBuf[whichLocal][yp][facey(1,planes-2)]+
478                             sendBuf[whichLocal][xp][facex(1,planes-2)]
479                             )/7;
480
481     //5.
482     localChunk[newLocal][small(0,cols-3,planes-3)] = (
483                             localChunk[whichLocal][small(0,cols-3,planes-3)]+
484                             localChunk[whichLocal][small(0,cols-4,planes-3)]+
485                             localChunk[whichLocal][small(1,cols-3,planes-3)]+
486                             localChunk[whichLocal][small(0,cols-3,planes-4)]+
487                             sendBuf[whichLocal][zn][facez(1,cols-2)]+
488                             sendBuf[whichLocal][yp][facey(cols-2,planes-2)]+
489                             sendBuf[whichLocal][xn][facex(1,planes-2)]
490                             )/7;
491     
492     //6.
493     localChunk[newLocal][small(rows-3,0,planes-3)] = (
494                             localChunk[whichLocal][small(rows-3,0,planes-3)]+
495                             localChunk[whichLocal][small(rows-4,0,planes-3)]+
496                             localChunk[whichLocal][small(rows-3,1,planes-3)]+
497                             localChunk[whichLocal][small(rows-3,0,planes-4)]+
498                             sendBuf[whichLocal][zn][facez(rows-2,1)]+
499                             sendBuf[whichLocal][yn][facey(1,planes-2)]+
500                             sendBuf[whichLocal][xp][facex(rows-2,planes-2)]
501                             )/7;
502
503     //7.
504     localChunk[newLocal][small(rows-3,cols-3,planes-3)] = (
505                             localChunk[whichLocal][small(rows-3,cols-3,planes-3)]+
506                             localChunk[whichLocal][small(rows-4,cols-3,planes-3)]+
507                             localChunk[whichLocal][small(rows-3,cols-4,planes-3)]+
508                             localChunk[whichLocal][small(rows-3,cols-3,planes-4)]+
509                             sendBuf[whichLocal][zn][facez(rows-2,cols-2)]+
510                             sendBuf[whichLocal][yn][facey(cols-2,planes-2)]+
511                             sendBuf[whichLocal][xn][facex(rows-2,planes-2)]
512                             )/7;
513
514    eltsComp += 8;
515
516     // 12 edges
517     //zp,yp
518     for(int i = 1; i < cols-3; i++){
519       localChunk[newLocal][small(0,i,0)] = (
520                 localChunk[whichLocal][small(0,i,0)]+
521                 localChunk[whichLocal][small(0,i-1,0)]+
522                 localChunk[whichLocal][small(0,i+1,0)]+
523                 localChunk[whichLocal][small(1,i,0)]+
524                 localChunk[whichLocal][small(0,i,1)]+
525                 sendBuf[whichLocal][yp][facey(i+1,1)]+
526                 sendBuf[whichLocal][zp][facez(1,i+1)]
527           )/7;
528        eltsComp+=1;
529     }
530
531     //xn,yp
532     for(int i = 1; i < planes-3; i++){
533       localChunk[newLocal][small(0,cols-3,i)] = (
534                 localChunk[whichLocal][small(0,cols-3,i)]+
535                 localChunk[whichLocal][small(0,cols-3,i-1)]+
536                 localChunk[whichLocal][small(0,cols-3,i+1)]+
537                 localChunk[whichLocal][small(1,cols-3,i)]+
538                 localChunk[whichLocal][small(0,cols-4,i)]+
539                 sendBuf[whichLocal][xn][facex(1,i+1)]+
540                 sendBuf[whichLocal][yp][facey(cols-2,i+1)]
541           )/7;
542        eltsComp+=1;
543     }
544
545     //zn,yp
546     for(int i = 1; i < cols-3; i++){
547       localChunk[newLocal][small(0,i,planes-3)] = (
548                 localChunk[whichLocal][small(0,i,planes-3)]+
549                 localChunk[whichLocal][small(0,i-1,planes-3)]+
550                 localChunk[whichLocal][small(0,i+1,planes-3)]+
551                 localChunk[whichLocal][small(0,i,planes-4)]+
552                 localChunk[whichLocal][small(1,i,planes-3)]+
553                 sendBuf[whichLocal][zn][facez(1,i+1)]+
554                 sendBuf[whichLocal][yp][facey(i+1,planes-2)]
555           )/7;
556        eltsComp+=1;
557     }
558
559     //yp,xp
560     for(int i = 1; i < planes-3; i++){
561       localChunk[newLocal][small(0,0,i)] = (
562                 localChunk[whichLocal][small(0,0,i)]+
563                 localChunk[whichLocal][small(0,0,i-1)]+
564                 localChunk[whichLocal][small(0,0,i+1)]+
565                 localChunk[whichLocal][small(0,1,i)]+
566                 localChunk[whichLocal][small(1,0,i)]+
567                 sendBuf[whichLocal][yp][facey(1,i+1)]+
568                 sendBuf[whichLocal][xp][facex(1,i+1)]
569           )/7;
570        eltsComp+=1;
571     }
572
573     //yn,zp
574     for(int i = 1; i < cols-3; i++){
575       localChunk[newLocal][small(rows-3,i,0)] = (
576                 localChunk[whichLocal][small(rows-3,i,0)]+
577                 localChunk[whichLocal][small(rows-3,i-1,0)]+
578                 localChunk[whichLocal][small(rows-3,i+1,0)]+
579                 localChunk[whichLocal][small(rows-4,i,0)]+
580                 localChunk[whichLocal][small(rows-3,i,1)]+
581                 sendBuf[whichLocal][yn][facey(i+1,1)]+
582                 sendBuf[whichLocal][zp][facez(rows-2,i+1)]
583           )/7;
584        eltsComp+=1;
585     }
586
587     //xn,yn
588     for(int i = 1; i < planes-3; i++){
589       localChunk[newLocal][small(rows-3,cols-3,i)] = (
590                 localChunk[whichLocal][small(rows-3,cols-3,i)]+
591                 localChunk[whichLocal][small(rows-3,cols-3,i-1)]+
592                 localChunk[whichLocal][small(rows-3,cols-3,i+1)]+
593                 localChunk[whichLocal][small(rows-4,cols-3,i)]+
594                 localChunk[whichLocal][small(rows-3,cols-4,i)]+
595                 sendBuf[whichLocal][xn][facex(rows-2,i+1)]+
596                 sendBuf[whichLocal][yn][facey(cols-2,i+1)]
597           )/7;
598        eltsComp+=1;
599     }
600
601     //yn,zn
602     for(int i = 1; i < cols-3; i++){
603       localChunk[newLocal][small(rows-3,i,planes-3)] = (
604                 localChunk[whichLocal][small(rows-3,i,planes-3)]+
605                 localChunk[whichLocal][small(rows-3,i-1,planes-3)]+
606                 localChunk[whichLocal][small(rows-3,i+1,planes-3)]+
607                 localChunk[whichLocal][small(rows-4,i,planes-3)]+
608                 localChunk[whichLocal][small(rows-3,i,planes-4)]+
609                 sendBuf[whichLocal][yn][facey(i+1,planes-2)]+
610                 sendBuf[whichLocal][zn][facez(rows-2,i+1)]
611           )/7;
612        eltsComp+=1;
613     }
614
615     //xp,yn
616     for(int i = 1; i < planes-3; i++){
617       localChunk[newLocal][small(rows-3,0,i)] = (
618                 localChunk[whichLocal][small(rows-3,0,i)]+
619                 localChunk[whichLocal][small(rows-3,0,i-1)]+
620                 localChunk[whichLocal][small(rows-3,0,i+1)]+
621                 localChunk[whichLocal][small(rows-3,1,i)]+
622                 localChunk[whichLocal][small(rows-4,0,i)]+
623                 sendBuf[whichLocal][xp][facex(rows-2,i+1)]+
624                 sendBuf[whichLocal][yn][facey(1,i+1)]
625           )/7;
626        eltsComp+=1;
627     }
628
629     //xp,zp
630     for(int i = 1; i < rows-3; i++){
631       localChunk[newLocal][small(i,0,0)] = (
632                 localChunk[whichLocal][small(i,0,0)]+
633                 localChunk[whichLocal][small(i-1,0,0)]+
634                 localChunk[whichLocal][small(i+1,0,0)]+
635                 localChunk[whichLocal][small(i,1,0)]+
636                 localChunk[whichLocal][small(i,0,1)]+
637                 sendBuf[whichLocal][xp][facex(i+1,1)]+
638                 sendBuf[whichLocal][zp][facez(i+1,1)]
639           )/7;
640        eltsComp+=1;
641     }
642
643     //xn,zp
644     for(int i = 1; i < rows-3; i++){
645       localChunk[newLocal][small(i,cols-3,0)] = (
646                 localChunk[whichLocal][small(i,cols-3,0)]+
647                 localChunk[whichLocal][small(i-1,cols-3,0)]+
648                 localChunk[whichLocal][small(i+1,cols-3,0)]+
649                 localChunk[whichLocal][small(i,cols-4,0)]+
650                 localChunk[whichLocal][small(i,cols-3,1)]+
651                 sendBuf[whichLocal][xn][facex(i+1,1)]+
652                 sendBuf[whichLocal][zp][facez(i+1,cols-2)]
653           )/7;
654        eltsComp+=1;
655     }
656
657     //xp,zn
658     for(int i = 1; i < rows-3; i++){
659       localChunk[newLocal][small(i,0,planes-3)] = (
660                 localChunk[whichLocal][small(i,0,planes-3)]+
661                 localChunk[whichLocal][small(i-1,0,planes-3)]+
662                 localChunk[whichLocal][small(i+1,0,planes-3)]+
663                 localChunk[whichLocal][small(i,1,planes-3)]+
664                 localChunk[whichLocal][small(i,0,planes-4)]+
665                 sendBuf[whichLocal][xp][facex(i+1,planes-2)]+
666                 sendBuf[whichLocal][zn][facez(i+1,1)]
667           )/7;
668        eltsComp+=1;
669     }
670
671     //xn,zn
672     for(int i = 1; i < rows-3; i++){
673       localChunk[newLocal][small(i,cols-3,planes-3)] = (
674                 localChunk[whichLocal][small(i,cols-3,planes-3)]+
675                 localChunk[whichLocal][small(i-1,cols-3,planes-3)]+
676                 localChunk[whichLocal][small(i+1,cols-3,planes-3)]+
677                 localChunk[whichLocal][small(i,cols-4,planes-3)]+
678                 localChunk[whichLocal][small(i,cols-3,planes-4)]+
679                 sendBuf[whichLocal][xn][facex(i+1,planes-2)]+
680                 sendBuf[whichLocal][zn][facez(i+1,cols-2)]
681           )/7;
682        eltsComp+=1;
683
684     }
685
686     // 6 more faces - use 6 (including self) from localChunk and 1 from one of the sendBufs
687     for(int i = 1; i < rows-3; i++){
688       for(int j = 1; j < cols-3; j++){
689         localChunk[newLocal][small(i,j,0)] = (
690             localChunk[whichLocal][small(i,j,0)]+
691             localChunk[whichLocal][small(i-1,j,0)]+
692             localChunk[whichLocal][small(i+1,j,0)]+
693             localChunk[whichLocal][small(i,j-1,0)]+
694             localChunk[whichLocal][small(i,j+1,0)]+
695             localChunk[whichLocal][small(i,j,1)]+
696             sendBuf[whichLocal][zp][facez(i+1,j+1)]
697         )/7;
698        eltsComp+=1;
699       }
700     }
701     for(int i = 1; i < rows-3; i++){
702       for(int j = 1; j < cols-3; j++){
703         localChunk[newLocal][small(i,j,planes-3)] = (
704             localChunk[whichLocal][small(i,j,planes-3)]+
705             localChunk[whichLocal][small(i-1,j,planes-3)]+
706             localChunk[whichLocal][small(i+1,j,planes-3)]+
707             localChunk[whichLocal][small(i,j-1,planes-3)]+
708             localChunk[whichLocal][small(i,j+1,planes-3)]+
709             localChunk[whichLocal][small(i,j,planes-4)]+
710             sendBuf[whichLocal][zn][facez(i+1,j+1)]
711         )/7;
712        eltsComp+=1;
713       }
714     }
715     
716     for(int j = 1; j < planes-3; j++){
717       for(int i = 1; i < rows-3; i++){
718         localChunk[newLocal][small(i,0,j)] = (
719             localChunk[whichLocal][small(i,0,j)]+
720             localChunk[whichLocal][small(i-1,0,j)]+
721             localChunk[whichLocal][small(i+1,0,j)]+
722             localChunk[whichLocal][small(i,0,j-1)]+
723             localChunk[whichLocal][small(i,0,j+1)]+
724             localChunk[whichLocal][small(i,1,j)]+
725             sendBuf[whichLocal][xp][facex(i+1,j+1)]
726         )/7;
727        eltsComp+=1;
728       }
729     }
730     for(int j = 1; j < planes-3; j++){
731       for(int i = 1; i < rows-3; i++){
732         localChunk[newLocal][small(i,cols-3,j)] = (
733             localChunk[whichLocal][small(i,cols-3,j)]+
734             localChunk[whichLocal][small(i-1,cols-3,j)]+
735             localChunk[whichLocal][small(i+1,cols-3,j)]+
736             localChunk[whichLocal][small(i,cols-3,j-1)]+
737             localChunk[whichLocal][small(i,cols-3,j+1)]+
738             localChunk[whichLocal][small(i,cols-4,j)]+
739             sendBuf[whichLocal][xn][facex(i+1,j+1)]
740         )/7;
741        eltsComp+=1;
742       }
743     }
744     
745     for(int j = 1; j < planes-3; j++){
746       for(int i = 1; i < cols-3; i++){
747         localChunk[newLocal][small(0,i,j)] = (
748             localChunk[whichLocal][small(0,i,j)]+
749             localChunk[whichLocal][small(0,i-1,j)]+
750             localChunk[whichLocal][small(0,i+1,j)]+
751             localChunk[whichLocal][small(0,i,j-1)]+
752             localChunk[whichLocal][small(0,i,j+1)]+
753             localChunk[whichLocal][small(1,i,j)]+
754             sendBuf[whichLocal][yp][facey(i+1,j+1)]
755         )/7;
756        eltsComp+=1;
757       }
758     }
759     for(int j = 1; j < planes-3; j++){
760       for(int i = 1; i < cols-3; i++){
761         localChunk[newLocal][small(rows-3,i,j)] = (
762             localChunk[whichLocal][small(rows-3,i,j)]+
763             localChunk[whichLocal][small(rows-3,i-1,j)]+
764             localChunk[whichLocal][small(rows-3,i+1,j)]+
765             localChunk[whichLocal][small(rows-3,i,j-1)]+
766             localChunk[whichLocal][small(rows-3,i,j+1)]+
767             localChunk[whichLocal][small(rows-4,i,j)]+
768             sendBuf[whichLocal][yn][facey(i+1,j+1)]
769         )/7;
770        eltsComp+=1;
771       }
772     }
773     /* Now we can compute the sendBufs. there are 6 faces.
774      Each element of sendbuf uses the ghost layer (recvbufs) in some way.
775      There are elements that :
776      a) form the interior of the faces. these use only one ghost element each
777      b) are the corners of the faces, that use 3 ghost elements each
778      c) are part of the edges, using 2 ghost elements each
779     */
780
781     // 1. zp face
782     // Interior points first
783     // enforce boundary conditions if plane = 0
784     if(plane == 0){
785       for(int i = 0; i < rows*cols; i++){
786         sendBuf[newLocal][zp][i] = 1.0;
787         eltsComp+=1;
788       }
789     }
790     else{
791     for(int i = 1; i < rows-1; i++){
792       for(int j = 1; j < cols-1; j++){
793         sendBuf[newLocal][zp][facez(i,j)] = (
794                 sendBuf[whichLocal][zp][facez(i,j)]+
795                 sendBuf[whichLocal][zp][facez(i-1,j)]+
796                 sendBuf[whichLocal][zp][facez(i+1,j)]+
797                 sendBuf[whichLocal][zp][facez(i,j-1)]+
798                 sendBuf[whichLocal][zp][facez(i,j+1)]+
799                 localChunk[whichLocal][small(i-1,j-1,0)]+
800                 recvBuf[ZP][facez(i,j)]
801         )/7;
802        eltsComp+=1;
803       }
804     }
805
806     // Corners next
807     // Each element uses 3 ghosts, 3 from its own sendBuf (including itself)  and 1 from a different sendBuf
808     sendBuf[newLocal][zp][facez(0,0)] = (
809           recvBuf[YP][facey(0,0)]+
810           recvBuf[XP][facex(0,0)]+
811           recvBuf[ZP][facez(0,0)]+
812           sendBuf[whichLocal][zp][facez(0,0)]+
813           sendBuf[whichLocal][zp][facez(0,1)]+
814           sendBuf[whichLocal][zp][facez(1,0)]+
815           sendBuf[whichLocal][xp][facex(0,1)]
816     )/7;
817
818     sendBuf[newLocal][zp][facez(0,cols-1)] = (
819           recvBuf[YP][facey(cols-1,0)]+
820           recvBuf[XN][facex(0,0)]+
821           recvBuf[ZP][facez(0,cols-1)]+
822           sendBuf[whichLocal][zp][facez(0,cols-1)]+
823           sendBuf[whichLocal][zp][facez(1,cols-1)]+
824           sendBuf[whichLocal][zp][facez(0,cols-2)]+
825           sendBuf[whichLocal][xn][facex(0,1)]
826     )/7;
827
828     sendBuf[newLocal][zp][facez(rows-1,0)] = (
829           recvBuf[ZP][facez(rows-1,0)]+
830           recvBuf[XP][facex(rows-1,0)]+
831           recvBuf[YN][facey(0,0)]+
832           sendBuf[whichLocal][zp][facez(rows-1,0)]+
833           sendBuf[whichLocal][zp][facez(rows-2,0)]+
834           sendBuf[whichLocal][zp][facez(rows-1,1)]+
835           sendBuf[whichLocal][xp][facex(rows-1,1)]
836     )/7;
837
838     sendBuf[newLocal][zp][facez(rows-1,cols-1)] = (
839           recvBuf[XN][facex(rows-1,0)]+
840           recvBuf[YN][facey(cols-1,0)]+
841           recvBuf[ZP][facez(rows-1,cols-1)]+
842           sendBuf[whichLocal][zp][facez(rows-1,cols-1)]+
843           sendBuf[whichLocal][zp][facez(rows-2,cols-1)]+
844           sendBuf[whichLocal][zp][facez(rows-1,cols-2)]+
845           sendBuf[whichLocal][xn][facex(rows-1,1)]
846     )/7;
847
848     eltsComp += 4;
849     // Finally, the edges: 
850     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
851     
852     for(int i = 1; i < cols-1; i++){
853       sendBuf[newLocal][zp][facez(0,i)] = (
854           recvBuf[ZP][facez(0,i)]+
855           recvBuf[YP][facey(i,0)]+
856           sendBuf[whichLocal][zp][facez(0,i)]+
857           sendBuf[whichLocal][zp][facez(0,i-1)]+
858           sendBuf[whichLocal][zp][facez(0,i+1)]+
859           sendBuf[whichLocal][zp][facez(1,i)]+
860           sendBuf[whichLocal][yp][facey(i,1)]
861       )/7; 
862       eltsComp+=1;
863     }
864     for(int i = 1; i < rows-1; i++){
865       sendBuf[newLocal][zp][facez(i,0)] = (
866           recvBuf[XP][facex(i,0)]+
867           recvBuf[ZP][facez(i,0)]+
868           sendBuf[whichLocal][zp][facez(i,0)]+
869           sendBuf[whichLocal][zp][facez(i-1,0)]+
870           sendBuf[whichLocal][zp][facez(i+1,0)]+
871           sendBuf[whichLocal][zp][facez(i,1)]+
872           sendBuf[whichLocal][xp][facex(i,1)]
873       )/7;
874       eltsComp+=1;
875     }
876     for(int i = 1; i < cols-1; i++){
877       sendBuf[newLocal][zp][facez(rows-1,i)] = (
878           recvBuf[YN][facey(i,0)]+
879           recvBuf[ZP][facez(rows-1,i)]+
880           sendBuf[whichLocal][zp][facez(rows-1,i)]+
881           sendBuf[whichLocal][zp][facez(rows-1,i-1)]+
882           sendBuf[whichLocal][zp][facez(rows-1,i+1)]+
883           sendBuf[whichLocal][zp][facez(rows-2,i)]+
884           sendBuf[whichLocal][yn][facey(i,1)]
885       )/7;
886       eltsComp+=1;
887     }
888     for(int i = 1; i < rows-1; i++){
889       sendBuf[newLocal][zp][facez(i,cols-1)] = (
890           recvBuf[XN][facex(i,0)]+
891           recvBuf[ZP][facez(i,cols-1)]+
892           sendBuf[whichLocal][zp][facez(i,cols-1)]+
893           sendBuf[whichLocal][zp][facez(i-1,cols-1)]+
894           sendBuf[whichLocal][zp][facez(i+1,cols-1)]+
895           sendBuf[whichLocal][zp][facez(i,cols-2)]+
896           sendBuf[whichLocal][xn][facex(i,1)]
897       )/7;
898       eltsComp+=1;
899     }
900     }
901     
902     // 2. zn face
903     // Interior points first
904     for(int i = 1; i < rows-1; i++){
905       for(int j = 1; j < cols-1; j++){
906         sendBuf[newLocal][zn][facez(i,j)] = (
907                 sendBuf[whichLocal][zn][facez(i,j)]+
908                 sendBuf[whichLocal][zn][facez(i-1,j)]+
909                 sendBuf[whichLocal][zn][facez(i+1,j)]+
910                 sendBuf[whichLocal][zn][facez(i,j-1)]+
911                 sendBuf[whichLocal][zn][facez(i,j+1)]+
912                 localChunk[whichLocal][small(i-1,j-1,planes-3)]+
913                 recvBuf[ZN][facez(i,j)]
914         )/7;
915       eltsComp+=1;
916       }
917     }
918
919     // Corners next
920     // Each element uses 3 ghosts, 3 from its own sendBuf (including itself)  and 1 from a different sendBuf
921     sendBuf[newLocal][zn][facez(0,0)] = (
922           recvBuf[YP][facey(0,planes-1)]+
923           recvBuf[XP][facex(0,planes-1)]+
924           recvBuf[ZN][facez(0,0)]+
925           sendBuf[whichLocal][zn][facez(0,0)]+
926           sendBuf[whichLocal][zn][facez(0,1)]+
927           sendBuf[whichLocal][zn][facez(1,0)]+
928           sendBuf[whichLocal][xp][facex(0,planes-2)]
929     )/7;
930
931     sendBuf[newLocal][zn][facez(0,cols-1)] = (
932           recvBuf[YP][facey(cols-1,planes-1)]+
933           recvBuf[XN][facex(0,planes-1)]+
934           recvBuf[ZN][facez(0,cols-1)]+
935           sendBuf[whichLocal][zn][facez(0,cols-1)]+
936           sendBuf[whichLocal][zn][facez(1,cols-1)]+
937           sendBuf[whichLocal][zn][facez(0,cols-2)]+
938           sendBuf[whichLocal][xn][facex(0,cols-2)]
939     )/7;
940
941     sendBuf[newLocal][zn][facez(rows-1,0)] = (
942           recvBuf[ZN][facez(rows-1,0)]+
943           recvBuf[XP][facex(rows-1,planes-1)]+
944           recvBuf[YN][facey(0,planes-1)]+
945           sendBuf[whichLocal][zn][facez(rows-1,0)]+
946           sendBuf[whichLocal][zn][facez(rows-2,0)]+
947           sendBuf[whichLocal][zn][facez(rows-1,1)]+
948           sendBuf[whichLocal][xp][facex(rows-1,planes-2)]
949     )/7;
950
951     sendBuf[newLocal][zn][facez(rows-1,cols-1)] = (
952           recvBuf[XN][facex(rows-1,planes-1)]+
953           recvBuf[YN][facey(cols-1,planes-1)]+
954           recvBuf[ZN][facez(rows-1,cols-1)]+
955           sendBuf[whichLocal][zn][facez(rows-1,cols-1)]+
956           sendBuf[whichLocal][zn][facez(rows-2,cols-1)]+
957           sendBuf[whichLocal][zn][facez(rows-1,cols-2)]+
958           sendBuf[whichLocal][xn][facex(rows-1,planes-2)]
959     )/7;
960
961       eltsComp+=4;
962     // Finally, the edges: 
963     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
964     
965     for(int i = 1; i < cols-1; i++){
966       sendBuf[newLocal][zn][facez(0,i)] = (
967           recvBuf[ZN][facez(0,i)]+
968           recvBuf[YP][facey(i,planes-1)]+
969           sendBuf[whichLocal][zn][facez(0,i)]+
970           sendBuf[whichLocal][zn][facez(0,i-1)]+
971           sendBuf[whichLocal][zn][facez(0,i+1)]+
972           sendBuf[whichLocal][zn][facez(1,i)]+
973           sendBuf[whichLocal][yp][facey(i,planes-2)]
974       )/7;
975       eltsComp+=1;
976     }
977     for(int i = 1; i < rows-1; i++){
978       sendBuf[newLocal][zn][facez(i,0)] = (
979           recvBuf[XP][facex(i,planes-1)]+
980           recvBuf[ZN][facez(i,0)]+
981           sendBuf[whichLocal][zn][facez(i,0)]+
982           sendBuf[whichLocal][zn][facez(i-1,0)]+
983           sendBuf[whichLocal][zn][facez(i+1,0)]+
984           sendBuf[whichLocal][zn][facez(i,1)]+
985           sendBuf[whichLocal][xp][facex(i,planes-2)]
986       )/7;
987       eltsComp+=1;
988     }
989     for(int i = 1; i < cols-1; i++){
990       sendBuf[newLocal][zn][facez(rows-1,i)] = (
991           recvBuf[YN][facey(i,planes-1)]+
992           recvBuf[ZN][facez(rows-1,i)]+
993           sendBuf[whichLocal][zn][facez(rows-1,i)]+
994           sendBuf[whichLocal][zn][facez(rows-1,i-1)]+
995           sendBuf[whichLocal][zn][facez(rows-1,i+1)]+
996           sendBuf[whichLocal][zn][facez(rows-2,i)]+
997           sendBuf[whichLocal][yn][facey(i,planes-2)]
998       )/7;
999       eltsComp+=1;
1000     }
1001     for(int i = 1; i < rows-1; i++){
1002       sendBuf[newLocal][zn][facez(i,cols-1)] = (
1003           recvBuf[XN][facex(i,planes-1)]+
1004           recvBuf[ZN][facez(i,cols-1)]+
1005           sendBuf[whichLocal][zn][facez(i,cols-1)]+
1006           sendBuf[whichLocal][zn][facez(i-1,cols-1)]+
1007           sendBuf[whichLocal][zn][facez(i+1,cols-1)]+
1008           sendBuf[whichLocal][zn][facez(i,cols-2)]+
1009           sendBuf[whichLocal][xn][facex(i,planes-2)]
1010       )/7;
1011       eltsComp+=1;
1012     }
1013     
1014     // 3. xp face
1015     // Interior points first
1016     for(int j = 1; j < planes-1; j++){
1017       for(int i = 1; i < rows-1; i++){
1018         sendBuf[newLocal][xp][facex(i,j)] = (
1019                 sendBuf[whichLocal][xp][facex(i,j)]+
1020                 sendBuf[whichLocal][xp][facex(i-1,j)]+
1021                 sendBuf[whichLocal][xp][facex(i+1,j)]+
1022                 sendBuf[whichLocal][xp][facex(i,j-1)]+
1023                 sendBuf[whichLocal][xp][facex(i,j+1)]+
1024                 localChunk[whichLocal][small(i-1,0,j-1)]+
1025                 recvBuf[XP][facex(i,j)]
1026         )/7;
1027       eltsComp+=1;
1028       }
1029     }
1030
1031     // Corners next
1032     sendBuf[newLocal][xp][facex(0,0)] = sendBuf[newLocal][zp][facez(0,0)];
1033     sendBuf[newLocal][xp][facex(0,planes-1)] = sendBuf[newLocal][zn][facez(0,0)];
1034     sendBuf[newLocal][xp][facex(rows-1,0)] = sendBuf[newLocal][zp][facez(rows-1,0)];
1035     sendBuf[newLocal][xp][facex(rows-1,planes-1)] = sendBuf[newLocal][zn][facez(rows-1,0)];
1036
1037     // Finally, the edges: 
1038     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
1039     
1040     for(int i = 1; i < planes-1; i++){
1041       sendBuf[newLocal][xp][facex(0,i)] = (
1042           recvBuf[XP][facex(0,i)]+
1043           recvBuf[YP][facey(0,i)]+
1044           sendBuf[whichLocal][xp][facex(0,i)]+
1045           sendBuf[whichLocal][xp][facex(0,i-1)]+
1046           sendBuf[whichLocal][xp][facex(0,i+1)]+
1047           sendBuf[whichLocal][xp][facex(1,i)]+
1048           sendBuf[whichLocal][yp][facey(1,i)]
1049       )/7;
1050       eltsComp+=1;
1051     }
1052     for(int i = 1; i < rows-1; i++){
1053       sendBuf[newLocal][xp][facex(i,0)] = sendBuf[newLocal][zp][facez(i,0)];
1054     }
1055     for(int i = 1; i < planes-1; i++){
1056       sendBuf[newLocal][xp][facex(rows-1,i)] = (
1057           recvBuf[YN][facey(0,i)]+
1058           recvBuf[XP][facex(rows-1,i)]+
1059           sendBuf[whichLocal][xp][facex(rows-1,i)]+
1060           sendBuf[whichLocal][xp][facex(rows-1,i-1)]+
1061           sendBuf[whichLocal][xp][facex(rows-1,i+1)]+
1062           sendBuf[whichLocal][xp][facex(rows-2,i)]+
1063           sendBuf[whichLocal][yn][facey(1,i)]
1064       )/7;
1065       eltsComp+=1;
1066     }
1067     for(int i = 1; i < rows-1; i++){
1068       sendBuf[newLocal][xp][facex(i,planes-1)] = sendBuf[newLocal][zn][facez(i,0)]; 
1069     }
1070    
1071     // 4. xn face
1072     // Interior points first
1073     for(int j = 1; j < planes-1; j++){
1074       for(int i = 1; i < rows-1; i++){
1075         sendBuf[newLocal][xn][facex(i,j)] = (
1076                 sendBuf[whichLocal][xn][facex(i,j)]+
1077                 sendBuf[whichLocal][xn][facex(i-1,j)]+
1078                 sendBuf[whichLocal][xn][facex(i+1,j)]+
1079                 sendBuf[whichLocal][xn][facex(i,j-1)]+
1080                 sendBuf[whichLocal][xn][facex(i,j+1)]+
1081                 localChunk[whichLocal][small(i-1,cols-3,j-1)]+
1082                 recvBuf[XN][facex(i,j)]
1083         )/7;
1084       eltsComp+=1;
1085       }
1086     }
1087
1088     // Corners next
1089     sendBuf[newLocal][xn][facex(0,0)] = sendBuf[newLocal][zp][facez(0,cols-1)];
1090     sendBuf[newLocal][xn][facex(0,planes-1)] = sendBuf[newLocal][zn][facez(0,cols-1)];
1091     sendBuf[newLocal][xn][facex(rows-1,0)] = sendBuf[newLocal][zp][facez(rows-1,cols-1)];
1092     sendBuf[newLocal][xn][facex(rows-1,planes-1)] = sendBuf[newLocal][zn][facez(rows-1,cols-1)];
1093     
1094     // Finally, the edges: 
1095     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
1096     
1097     for(int i = 1; i < planes-1; i++){
1098       sendBuf[newLocal][xn][facex(0,i)] = (
1099           recvBuf[XN][facex(0,i)]+
1100           recvBuf[YP][facey(cols-1,i)]+
1101           sendBuf[whichLocal][xn][facex(0,i)]+
1102           sendBuf[whichLocal][xn][facex(0,i-1)]+
1103           sendBuf[whichLocal][xn][facex(0,i+1)]+
1104           sendBuf[whichLocal][xn][facex(1,i)]+
1105           sendBuf[whichLocal][yp][facey(cols-2,i)]
1106       )/7;
1107       eltsComp+=1;
1108     }
1109     for(int i = 1; i < rows-1; i++){
1110       sendBuf[newLocal][xn][facex(i,0)] = sendBuf[newLocal][zp][facez(i,cols-1)]; 
1111     }
1112     for(int i = 1; i < planes-1; i++){
1113       sendBuf[newLocal][xn][facex(rows-1,i)] = (
1114           recvBuf[YN][facey(cols-1,i)]+
1115           recvBuf[XN][facex(rows-1,i)]+
1116           sendBuf[whichLocal][xn][facex(rows-1,i)]+
1117           sendBuf[whichLocal][xn][facex(rows-1,i-1)]+
1118           sendBuf[whichLocal][xn][facex(rows-1,i+1)]+
1119           sendBuf[whichLocal][xn][facex(rows-2,i)]+
1120           sendBuf[whichLocal][yn][facey(cols-2,i)]
1121       )/7;
1122       eltsComp+=1;
1123     }
1124     for(int i = 1; i < rows-1; i++){
1125       sendBuf[newLocal][xn][facex(i,planes-1)] = sendBuf[newLocal][zn][facez(i,cols-1)];
1126     }
1127     
1128  
1129     // 5. yp face
1130     // Interior points first
1131     for(int j = 1; j < planes-1; j++){
1132       for(int i = 1; i < cols-1; i++){
1133         sendBuf[newLocal][yp][facey(i,j)] = (
1134                 sendBuf[whichLocal][yp][facey(i,j)]+
1135                 sendBuf[whichLocal][yp][facey(i-1,j)]+
1136                 sendBuf[whichLocal][yp][facey(i+1,j)]+
1137                 sendBuf[whichLocal][yp][facey(i,j-1)]+
1138                 sendBuf[whichLocal][yp][facey(i,j+1)]+
1139                 localChunk[whichLocal][small(0,i-1,j-1)]+
1140                 recvBuf[YP][facey(i,j)]
1141         )/7;
1142       eltsComp+=1;
1143       }
1144     }
1145
1146     // Corners next
1147     // Each element uses 3 ghosts, 3 from its own sendBuf (including itself)  and 1 from a different sendBuf
1148     sendBuf[newLocal][yp][facey(0,0)] = sendBuf[newLocal][zp][facez(0,0)];
1149     sendBuf[newLocal][yp][facey(0,planes-1)] = sendBuf[newLocal][zn][facez(0,0)];
1150     sendBuf[newLocal][yp][facey(cols-1,0)] = sendBuf[newLocal][xn][facex(0,0)];
1151     sendBuf[newLocal][yp][facey(cols-1,planes-1)] = sendBuf[newLocal][zn][facez(0,cols-1)]; 
1152     
1153     // Finally, the edges: 
1154     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
1155     
1156     for(int i = 1; i < planes-1; i++){
1157       sendBuf[newLocal][yp][facey(0,i)] = sendBuf[newLocal][xp][facex(0,i)]; 
1158     }
1159     for(int i = 1; i < cols-1; i++){
1160       sendBuf[newLocal][yp][facey(i,0)] = sendBuf[newLocal][zp][facez(0,i)];
1161     }
1162     for(int i = 1; i < planes-1; i++){
1163       sendBuf[newLocal][yp][facey(cols-1,i)] = sendBuf[newLocal][xn][facex(0,i)];
1164     }
1165     for(int i = 1; i < cols-1; i++){
1166       sendBuf[newLocal][yp][facey(i,planes-1)] = sendBuf[newLocal][zn][facez(0,i)];
1167     }
1168
1169     // 6. yn face
1170     // Interior points first
1171     for(int j = 1; j < planes-1; j++){
1172       for(int i = 1; i < cols-1; i++){
1173         sendBuf[newLocal][yn][facey(i,j)] = (
1174                 sendBuf[whichLocal][yn][facey(i,j)]+
1175                 sendBuf[whichLocal][yn][facey(i-1,j)]+
1176                 sendBuf[whichLocal][yn][facey(i+1,j)]+
1177                 sendBuf[whichLocal][yn][facey(i,j-1)]+
1178                 sendBuf[whichLocal][yn][facey(i,j+1)]+
1179                 localChunk[whichLocal][small(rows-3,i-1,j-1)]+
1180                 recvBuf[YN][facey(i,j)]
1181         )/7;
1182       eltsComp+=1;
1183       }
1184     }
1185
1186     // Corners next
1187     // Each element uses 3 ghosts, 3 from its own sendBuf (including itself)  and 1 from a different sendBuf
1188     sendBuf[newLocal][yn][facey(0,0)] = sendBuf[newLocal][zp][facez(rows-1,0)];
1189     sendBuf[newLocal][yn][facey(0,planes-1)] = sendBuf[newLocal][zn][facez(rows-1,0)];
1190     sendBuf[newLocal][yn][facey(cols-1,0)] = sendBuf[newLocal][zp][facez(rows-1,cols-1)];
1191     sendBuf[newLocal][yn][facey(cols-1,planes-1)] = sendBuf[newLocal][zn][facez(rows-1,cols-1)]; 
1192     
1193     // Finally, the edges: 
1194     // Each element here uses 2 ghosts, 4 elts from its own sendBuf, 1 elt from a different sendBuf
1195     
1196     for(int i = 1; i < planes-1; i++){
1197       sendBuf[newLocal][yn][facey(0,i)] = sendBuf[newLocal][xp][facex(rows-1,i)]; 
1198     }
1199     for(int i = 1; i < cols-1; i++){
1200       sendBuf[newLocal][yn][facey(i,0)] = sendBuf[newLocal][zp][facez(rows-1,i)];
1201     }
1202     for(int i = 1; i < planes-1; i++){
1203       sendBuf[newLocal][yn][facey(cols-1,i)] = sendBuf[newLocal][xn][facex(rows-1,i)];
1204     }
1205     for(int i = 1; i < cols-1; i++){
1206       sendBuf[newLocal][yn][facey(i,planes-1)] = sendBuf[newLocal][zn][facez(rows-1,i)];
1207     }
1208
1209
1210     
1211     
1212     // exclude the time for setup and the first iteration
1213     iterations++;
1214     if(iterations == 1){
1215       contribute(0,0,CkReduction::concat, CkCallback(CkIndex_Main::doneSetup(), mainProxy));
1216     }
1217     
1218     // toggle between localChunks
1219     whichLocal = newLocal;
1220     if(iterations == num_iterations){
1221 #ifdef STENCIL2D_VERBOSE
1222       CkPrintf("(%d,%d): contributing to exit\n", row, col);
1223 #endif
1224       contribute(sizeof(double), &eltsComp, CkReduction::sum_double, CkCallback(CkIndex_Main::done(NULL), mainProxy));
1225     }
1226     else{
1227 #ifdef USE_CKDIRECT
1228       // 2. signal readiness to recv next round of data
1229       for(int i = 0; i < NBRS; i++){
1230         CkDirect_ready(&rhandles[i]);
1231       }
1232 #endif
1233       // contribute to barrier
1234 #ifdef STENCIL2D_VERBOSE
1235       CkPrintf("(%d,%d): contributing to allReady\n", row, col);
1236 #endif
1237       contribute(0, 0, CkReduction::concat, CkCallback(CkIndex_StencilPoint::allReadyCallback(NULL), thisProxy));
1238     }
1239     
1240     if(iterations > num_iterations){
1241       CkPrintf("******\n(%d,%d,%d):\n******\n", row, col,plane);
1242       CkAbort("death is inevitable; bugs needn't be.\n");
1243     }
1244   }
1245
1246   void allReadyCallback(CkReductionMsg *msg){
1247     delete msg;
1248 #ifdef STENCIL2D_VERBOSE
1249     CkPrintf("(%d,%d,%d): all ready, send data\n", row, col,plane);
1250 #endif
1251     sendData();
1252   }
1253
1254   inline void sendMsg(int col, int row, int plane, int which, float *buf){
1255     StencilMsg *msg = new (payload[which/2]) StencilMsg;
1256     memcpy(msg->arr,buf,payload[which/2]*sizeof(float));
1257     msg->which = which;
1258     msg->size = payload[which/2];
1259     thisProxy[lin(col,row,plane)].recvBufferMsg(msg);
1260   }
1261
1262   void sendData(){
1263     // 1. copy data into buffers from local chunk
1264     // top and bottom boundaries
1265 #ifdef STENCIL2D_VERBOSE
1266     //CkPrintf("(%d,%d,%d): sendData() called\n", x,y,z);
1267 #endif
1268 #ifdef USE_CKDIRECT
1269     // 2. put buffers
1270     for(int i = 0; i < NBRS; i++){
1271       CkDirect_put(&shandles[whichLocal][i]);
1272     }
1273 #else
1274 #ifdef USE_MESSAGES
1275 #ifdef ARR_CHECK
1276     sendMsg((col+1)%charesx,row,plane,XP,sendBuf[whichLocal][xn].getVec());
1277     sendMsg((col-1+charesx)%charesx,row,plane,XN,sendBuf[whichLocal][xp].getVec());
1278     sendMsg(col,(row+1)%charesy,plane,YP,sendBuf[whichLocal][yn].getVec());
1279     sendMsg(col,(row-1+charesy)%charesy,plane,YN,sendBuf[whichLocal][yp].getVec());
1280     sendMsg(col,row,(plane+1)%charesz,ZP,sendBuf[whichLocal][zn].getVec());
1281     sendMsg(col,row,(plane-1+charesz)%charesz,ZN,sendBuf[whichLocal][zp].getVec());
1282 #else
1283     sendMsg((col+1)%charesx,row,plane,XP,sendBuf[whichLocal][xn]);
1284     sendMsg((col-1+charesx)%charesx,row,plane,XN,sendBuf[whichLocal][xp]);
1285     sendMsg(col,(row+1)%charesy,plane,YP,sendBuf[whichLocal][yn]);
1286     sendMsg(col,(row-1+charesy)%charesy,plane,YN,sendBuf[whichLocal][yp]);
1287     sendMsg(col,row,(plane+1)%charesz,ZP,sendBuf[whichLocal][zn]);
1288     sendMsg(col,row,(plane-1+charesz)%charesz,ZN,sendBuf[whichLocal][zp]);
1289 #endif
1290 #else
1291 #ifdef ARR_CHECK
1292     // 2. send messages
1293     thisProxy[lin((col+1)%charesx,row,plane)].recvBuffer(sendBuf[whichLocal][xn].getVec(),payload[0],XP);
1294     thisProxy[lin((col-1+charesx)%charesx,row,plane)].recvBuffer(sendBuf[whichLocal][xp].getVec(),payload[0],XN);
1295     thisProxy[lin(col,(row+1)%charesy,plane)].recvBuffer(sendBuf[whichLocal][yn].getVec(),payload[1],YP);
1296     thisProxy[lin(col,(row-1+charesy)%charesy,plane)].recvBuffer(sendBuf[whichLocal][yp].getVec(),payload[1],YN);
1297     thisProxy[lin(col,row,(plane+1)%charesz)].recvBuffer(sendBuf[whichLocal][zn].getVec(),payload[2],ZP);
1298     thisProxy[lin(col,row,(plane-1+charesz)%charesz)].recvBuffer(sendBuf[whichLocal][zp].getVec(),payload[2],ZN);
1299 #else
1300     thisProxy[lin((col+1)%charesx,row,plane)].recvBuffer(sendBuf[whichLocal][xn],payload[0],XP);
1301     thisProxy[lin((col-1+charesx)%charesx,row,plane)].recvBuffer(sendBuf[whichLocal][xp],payload[0],XN);
1302     thisProxy[lin(col,(row+1)%charesy,plane)].recvBuffer(sendBuf[whichLocal][yn],payload[1],YP);
1303     thisProxy[lin(col,(row-1+charesy)%charesy,plane)].recvBuffer(sendBuf[whichLocal][yp],payload[1],YN);
1304     thisProxy[lin(col,row,(plane+1)%charesz)].recvBuffer(sendBuf[whichLocal][zn],payload[2],ZP);
1305     thisProxy[lin(col,row,(plane-1+charesz)%charesz)].recvBuffer(sendBuf[whichLocal][zp],payload[2],ZN);
1306 #endif
1307 #endif // end USE_MESSAGES
1308 #endif
1309   }
1310
1311 };
1312
1313 #include "stencil3d.def.h"