Merge branch 'charm' into amicableErrorMessages
[charm.git] / examples / multiphaseSharedArrays / matmul / t2d.C
1 // -*- mode: c++; tab-width: 4 -*-
2
3 // When running 1D, make NEPP = COL1
4 // When running 2D, same
5 // When running 3D, make NEPP = subset of COL1
6
7 #include "nepp.h"
8 #include "msa/msa.h"
9
10 #ifdef PUP_EVERY
11 typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_ROW_MAJOR> MSA2DRowMjr;
12 #ifdef OLD
13 typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_COL_MAJOR> MSA2DColMjr;
14 #else
15 typedef MSA2D<double, DefaultEntry<double,true>, NEPP, MSA_ROW_MAJOR> MSA2DColMjr;
16 #endif
17 typedef MSA2D<double, DefaultEntry<double,true>, NEPP_C, MSA_ROW_MAJOR> MSA2DRowMjrC;
18
19 #else
20 typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_ROW_MAJOR> MSA2DRowMjr;
21 #ifdef OLD
22 typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_COL_MAJOR> MSA2DColMjr;
23 #else
24 typedef MSA2D<double, DefaultEntry<double,false>, NEPP, MSA_ROW_MAJOR> MSA2DColMjr;
25 #endif
26 typedef MSA2D<double, DefaultEntry<double,false>, NEPP_C, MSA_ROW_MAJOR> MSA2DRowMjrC;
27 #endif
28
29 #include "t2d.decl.h"
30
31 #include <assert.h>
32 #include <math.h>
33 #include "params.h"
34
35 const double epsilon = 0.00000001;
36 inline int notequal(double v1, double v2)
37 {
38     return (fabs(v1 - v2) > epsilon);
39 }
40
41 static void perfCheck(int expected, int actual)
42 {
43   if (expected != actual)
44         ckout << "Arguments don't line up with compiled in defaults." << endl
45                   << expected << " != " << actual << endl
46                   << " Performance may suffer" << endl;
47 }
48
49 class t2d : public CBase_t2d
50 {
51 protected:
52     double start_time;
53     CProxy_TestArray workers;
54     int reallyDone;
55
56 public:
57     t2d(CkArgMsg* m)
58     {
59         // Usage: a.out number_of_worker_threads max_bytes ROWS1 ROWS2 COLS2 DECOMP-D TIMING-DETAIL?
60         if(m->argc >1 ) NUM_WORKERS=atoi(m->argv[1]);
61         if(m->argc >2 ) bytes=atoi(m->argv[2]);
62         if(m->argc >3 ) ROWS1=atoi(m->argv[3]);
63         if(m->argc >4 ) ROWS2=COLS1=atoi(m->argv[4]);
64         if(m->argc >5 ) COLS2=atoi(m->argv[5]);
65         if(m->argc >6 ) DECOMPOSITION=atoi(m->argv[6]); // 1D, 2D, 3D
66         if(m->argc >7 ) detailedTimings= ((atoi(m->argv[7])!=0)?true:false);
67         delete m;
68         reallyDone = 0;
69
70         MSA2DRowMjr arr1(ROWS1, COLS1, NUM_WORKERS, bytes);        // row major
71         MSA2DColMjr arr2(ROWS2, COLS2, NUM_WORKERS, bytes);        // column major
72         MSA2DRowMjrC prod(ROWS1, COLS2, NUM_WORKERS, bytes);        // product matrix
73
74         workers = CProxy_TestArray::ckNew(arr1, arr2, prod, NUM_WORKERS, NUM_WORKERS);
75         workers.ckSetReductionClient(new CkCallback(CkIndex_t2d::done(NULL), thisProxy));
76
77         start_time = CkWallTimer();
78         workers.Start();
79     }
80
81     // This method gets called twice, and should only terminate the
82     // second time.
83     void done(CkReductionMsg* m)
84     {
85         int *ip = (int*)m->getData();
86         bool prefetchWorked = (*ip==0);
87         delete m;
88
89         if (reallyDone == 0) {
90             workers.Kontinue();
91             reallyDone++;
92
93             double end_time = CkWallTimer();
94
95             const char TAB = '\t';
96
97             char hostname[100];
98             gethostname(hostname, 100);
99
100             ckout << CkNumPes() << TAB
101                                   << ROWS1 << TAB
102                   << COLS1 << TAB
103                   << ROWS2 << TAB
104                   << COLS2 << TAB
105                   << NUM_WORKERS << TAB
106                   << bytes << TAB
107                   << (runPrefetchVersion? (prefetchWorked?"Y":"N"): "U") << TAB
108                   << end_time - start_time << TAB
109                   << NEPP << TAB
110                   << DECOMPOSITION << TAB
111                   << hostname
112                   << endl;
113
114         } else {
115             CkExit();
116         }
117     }
118 };
119
120 // get the chunk for a given index
121 int GetChunkForIndex(int index, int maxIndex, int numWorkers)
122 {
123     int rangeSize = maxIndex / numWorkers;
124     int chunk;
125
126     // find which chare is going to process the current node
127     if(index <= (maxIndex % numWorkers) * (rangeSize + 1) - 1)
128         chunk = index/(rangeSize + 1);
129     else
130         chunk = maxIndex%numWorkers + (index - (maxIndex%numWorkers) * (rangeSize + 1))/rangeSize;
131
132     return chunk;
133 }
134
135 // Returns start and end
136 void GetMyIndices(unsigned int maxIndex, unsigned int myNum, unsigned int numWorkers,
137                   unsigned int& start, unsigned int& end)
138 {
139     int rangeSize = maxIndex / numWorkers;
140     if(myNum < maxIndex % numWorkers)
141     {
142         start = myNum * (rangeSize + 1);
143         end = start + rangeSize;
144     }
145     else
146     {
147         start = myNum * rangeSize + maxIndex % numWorkers;
148         end = start + rangeSize - 1;
149     }
150 }
151
152 // class MatmulHelper {
153 // public:
154 //     unsigned int iStart, iEnd, jStart, jEnd, kStart, kEnd;
155 //     MatmulHelper(unsigned int ROWS1_, unsigned int COLS1_, unsigned int COLS2_)
156 //         : iStart(0), iEnd(ROWS1_-1),  // A's rows
157 //           kStart(0), kEnd(COLS1_-1),  // inner
158 //           jStart(0), jEnd(COLS2_-1)   // B's cols
159 //     {}
160 // }
161
162 // class MatmulHelper1D {
163     
164 // }
165
166 class TestArray : public CBase_TestArray
167 {
168 private:
169     // prefetchWorked keeps track of whether the prefetches succeeded or not.
170     bool prefetchWorked;
171     CkVec<double> times;
172     CkVec<const char*> description;
173
174     // ================================================================
175     // 2D calculations
176
177     inline int numWorkers2D() {
178         static int n = 0;
179
180         if (n==0) {
181             n = (int)(sqrt(numWorkers));
182             CkAssert(n*n == numWorkers);
183         }
184
185         return n;
186     }
187
188     // Convert a 1D ChareArray index into a 2D x dimension index
189     inline unsigned int toX() {
190         return thisIndex/numWorkers2D();
191     }
192     // Convert a 1D ChareArray index into a 2D y dimension index
193     inline unsigned int toY() {
194         return thisIndex%numWorkers2D();
195     }
196
197     // ================================================================
198     // 3D calculations
199     inline int numWorkers3D() {
200         static int n = 0;
201
202         if (n==0) {
203             n = (int)(cbrt(numWorkers));
204             CkAssert(n*n*n == numWorkers);
205         }
206
207         return n;
208     }
209
210     // Convert a 1D ChareArray index into a 3D x dimension index
211     inline unsigned int toX3D() {
212         int b = (numWorkers3D()*numWorkers3D());
213         return thisIndex/b;
214     }
215     // Convert a 1D ChareArray index into a 3D y dimension index
216     inline unsigned int toY3D() {
217         int b = (numWorkers3D()*numWorkers3D());
218         return (thisIndex%b)/numWorkers3D();
219     }
220     // Convert a 1D ChareArray index into a 3D z dimension index
221     inline unsigned int toZ3D() {
222         int b = (numWorkers3D()*numWorkers3D());
223         return (thisIndex%b)%numWorkers3D();
224     }
225
226     // ================================================================
227
228 protected:
229     MSA2DRowMjr arr1;       // row major
230         MSA2DRowMjr::Read *h1;
231     MSA2DColMjr arr2;       // column major
232         MSA2DColMjr::Read *h2;
233     MSA2DRowMjrC prod;       // product matrix
234         MSA2DRowMjrC::Handle *hp;
235
236     unsigned int rows1, rows2, cols1, cols2, numWorkers;
237
238     void EnrollArrays()
239     {
240         arr1.enroll(numWorkers); // barrier
241         arr2.enroll(numWorkers); // barrier
242         prod.enroll(numWorkers); // barrier
243     }
244
245     void FillArrays(MSA2DRowMjr::Write &w1, MSA2DColMjr::Write &w2)
246     {
247         // fill in our portion of the array
248         unsigned int rowStart, rowEnd, colStart, colEnd;
249         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
250         GetMyIndices(cols2, thisIndex, numWorkers, colStart, colEnd);
251
252         // fill them in with 1
253         for(unsigned int r = rowStart; r <= rowEnd; r++)
254             for(unsigned int c = 0; c < cols1; c++)
255                 w1.set(r, c) = 1.0;
256
257         for(unsigned int c = colStart; c <= colEnd; c++)
258             for(unsigned int r = 0; r < rows2; r++)
259                 w2.set(r, c) = 1.0;
260
261     }
262
263     void FillArrays2D(MSA2DRowMjr::Write &w1, MSA2DColMjr::Write &w2)
264     {
265         unsigned int rowStart, rowEnd, colStart, colEnd;
266         unsigned int r, c;
267
268         // fill in our portion of the A matrix
269         GetMyIndices(rows1, toX(), numWorkers2D(), rowStart, rowEnd);
270         GetMyIndices(cols1, toY(), numWorkers2D(), colStart, colEnd);
271         // CkPrintf("p%dw%d: FillArray2D A = %d %d %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd, colStart, colEnd);
272
273         // fill them in with 1
274         for(r = rowStart; r <= rowEnd; r++)
275             for(c = colStart; c <= colEnd; c++)
276                 w1.set(r, c) = 1.0;
277
278         // fill in our portion of the B matrix
279         GetMyIndices(rows2, toX(), numWorkers2D(), rowStart, rowEnd);
280         GetMyIndices(cols2, toY(), numWorkers2D(), colStart, colEnd);
281         // CkPrintf("p%dw%d: FillArray2D B = %d %d %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd, colStart, colEnd);
282         // fill them in with 1
283         for(r = rowStart; r <= rowEnd; r++)
284             for(c = colStart; c <= colEnd; c++)
285                 w2.set(r, c) = 1.0;
286     }
287
288     void TestResults(MSA2DRowMjr::Read &r1, MSA2DColMjr::Read &r2, MSA2DRowMjrC::Read &rp,
289                                          bool prod_test=true)
290     {
291         int errors = 0;
292         bool ok=true;
293
294         // verify the results, print out first error only
295         ok=true;
296         for(unsigned int r = 0; ok && r < rows1; r++) {
297             for(unsigned int c = 0; ok && c < cols1; c++) {
298                 if(notequal(r1.get(r, c), 1.0)) {
299                     ckout << "[" << CkMyPe() << "," << thisIndex << "] arr1 -- Illegal element at (" << r << "," << c << ") " << r1.get(r,c) << endl;
300                     ok=false;
301                     errors++;
302                 }
303             }
304         }
305
306         ok=true;
307         for(unsigned int c = 0; ok && c < cols2; c++) {
308             for(unsigned int r = 0; ok && r < rows2; r++) {
309                 if(notequal(r2.get(r, c), 1.0)) {
310                     ckout << "[" << CkMyPe() << "," << thisIndex << "] arr2 -- Illegal element at (" << r << "," << c << ") " << r2.get(r,c) << endl;
311                     ok=false;
312                     errors++;
313                 }
314             }
315         }
316
317         //arr1.FreeMem();
318         //arr2.FreeMem();
319
320         if(prod_test)
321         {
322             ok = true;
323             for(unsigned int c = 0; ok && c < cols2; c++) {
324                 for(unsigned int r = 0; ok && r < rows1; r++) {
325                     if(notequal(rp.get(r,c), 1.0 * cols1)) {
326                         ckout << "[" << CkMyPe() << "] result  -- Illegal element at (" << r << "," << c << ") " << rp.get(r,c) << endl;
327                         ok=false;
328                         errors++;
329                     }
330                 }
331             }
332         }
333
334         if (errors!=0) CkAbort("Incorrect array elements detected!");
335     }
336
337     void Contribute()
338     {
339         int dummy = prefetchWorked?0:1;
340         contribute(sizeof(int), &dummy, CkReduction::sum_int);
341     }
342
343     // ============================= 1D ===================================
344
345     void FindProductNoPrefetch(MSA2DRowMjr::Read &r1,
346                                                            MSA2DColMjr::Read &r2,
347                                                            MSA2DRowMjrC::Write &wp)
348         {
349 #ifdef OLD
350         FindProductNoPrefetchNMK(r1, r2, wp);
351 #else
352         FindProductNoPrefetchMKN_RM(r1, r2, wp);
353 #endif
354     }
355
356     // new, but bad perf
357     // improved perf by taking the prod.accu out of the innermost loop, up 2
358     // further improved perf by taking the arr1.get out of the innermost loop, up 1.
359     void FindProductNoPrefetchMKN_RM(MSA2DRowMjr::Read &r1,
360                                                                          MSA2DColMjr::Read &r2,
361                                                                          MSA2DRowMjrC::Write &wp)
362     {
363         CkAssert(arr2.getArrayLayout() == MSA_ROW_MAJOR);
364 //         CkPrintf("reached\n");
365         unsigned int rowStart, rowEnd;
366         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
367
368         double *result = new double[cols2];
369         for(unsigned int r = rowStart; r <= rowEnd; r++) { // M
370             for(unsigned int c = 0; c < cols2; c++)
371                 result[c] = 0;
372             for(unsigned int k = 0; k < cols1; k++) { // K
373                 double a = r1.get(r,k);
374                 for(unsigned int c = 0; c < cols2; c++) { // N
375                     result[c] += a * r2.get(k,c);
376 //                     prod.set(r,c) = result; // @@ to see if accu is the delay
377 //                     prod.accumulate(prod.getIndex(r,c), result);
378                 }
379 //              assert(!notequal(result, 1.0*cols1));
380             }
381             for(unsigned int c = 0; c < cols2; c++) {
382                 wp.set(r,c) = result[c];
383             }
384         }
385         delete [] result;
386     }
387
388     // old
389     void FindProductNoPrefetchNMK(MSA2DRowMjr::Read &r1,
390                                                                   MSA2DColMjr::Read &r2,
391                                                                   MSA2DRowMjrC::Write &wp)
392     {
393         unsigned int rowStart, rowEnd;
394         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
395
396         for(unsigned int c = 0; c < cols2; c++) { // N
397             for(unsigned int r = rowStart; r <= rowEnd; r++) { // M
398
399                 double result = 0.0;
400                 for(unsigned int k = 0; k < cols1; k++) { // K
401                     double e1 = r1.get(r,k);
402                     double e2 = r2.get(k,c);
403                     result += e1 * e2;
404                 }
405 //              assert(!notequal(result, 1.0*cols1));
406
407                 wp.set(r,c) = result;
408             }
409         }
410     }
411
412     // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
413     void FindProductNoPrefetchStripMined(MSA2DRowMjrC::Write &wp)
414     {
415         FindProductNoPrefetchStripMinedMKN_ROWMJR(wp);
416     }
417
418     // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
419     void FindProductNoPrefetchStripMinedNMK(MSA2DRowMjrC::Write &wp)
420     {
421             perfCheck(NEPP, cols1);
422         unsigned int rowStart, rowEnd;
423         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
424         CkPrintf("p%dw%d: FPNP2DSM A = %d %d %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd, 0, cols2-1);
425
426         double time1 = CmiWallTimer();
427         for(unsigned int c = 0; c < cols2; c++) { // N
428             for(unsigned int r = rowStart; r <= rowEnd; r++) {  // M
429
430                 double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
431                 double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
432                 double result = 0.0;
433                 for(unsigned int k = 0; k < cols1; k++) { // K
434                     double e1 = a[k];  // no get
435                     double e2 = b[k];  // no get
436                     result += e1 * e2;
437                 }
438 //              assert(!notequal(result, 1.0*cols1));
439
440                 wp.set(r,c) = result;
441             }
442         }
443
444         double time2 = CmiWallTimer();
445         CkPrintf("timings %f \n", time2-time1);
446     }
447
448     // Assumes that the nepp equals the size of a row, i.e. NEPP == COLS1 == ROWS2
449     // Assumes CkAssert(NEPP_C == cols2);
450     void FindProductNoPrefetchStripMinedMKN_ROWMJR(MSA2DRowMjrC::Write &wp)
451     {
452             perfCheck(NEPP, cols1);
453             perfCheck(NEPP_C, cols2);
454         CkAssert(arr2.getArrayLayout() == MSA_ROW_MAJOR);
455         unsigned int rowStart, rowEnd;
456         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
457 //         CkPrintf("p%dw%d: FPNP1DSM_MKN_RM A = %d %d\n", CkMyPe(), thisIndex, rowStart, rowEnd);
458
459         for(unsigned int r = rowStart; r <= rowEnd; r++) {  // M
460             double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
461             for(unsigned int c = 0; c < cols2; c++) { // N
462                 wp.set(r,c);  // just mark it as updated, need a better way
463             }
464             double* cm = &(prod.getPageBottom(prod.getIndex(r,0),Write_Fault)); // ptr to row of C
465             for(unsigned int k = 0; k < cols1; k++) { // K
466                 double* b = &(arr2.getPageBottom(arr2.getIndex(k,0),Read_Fault)); // ptr to row of B
467                 for(unsigned int c = 0; c < cols2; c++) { // N
468                     cm[c] += a[k]*b[c];
469 //                     prod.accumulate(prod.getIndex(r,c), );
470                 }
471             }
472                         if (r%4==0) CthYield();
473         }
474     }
475
476     void FindProductWithPrefetch(MSA2DRowMjr::Read &r1,
477                                                                  MSA2DColMjr::Read &r2,
478                                                                  MSA2DRowMjrC::Write &wp)
479     {
480         // fill in our portion of the array
481         unsigned int rowStart, rowEnd;
482         GetMyIndices(rows1, thisIndex, numWorkers, rowStart, rowEnd);
483
484         arr1.Unlock(); arr2.Unlock();
485         prefetchWorked = false;
486
487         arr1.Prefetch(rowStart, rowEnd);
488         arr2.Prefetch(0, cols2);
489
490         /* if prefetch fails, switch to non-prefetching version */
491         if(arr1.WaitAll())
492         {
493             if(verbose) ckout << thisIndex << ": Out of buffer in prefetch 1" << endl;
494             FindProductNoPrefetch(r1, r2, wp);
495             return;
496         }
497
498         if(arr2.WaitAll())
499         {
500             if(verbose) ckout << thisIndex << ": Out of buffer in prefetch 2" << endl;
501             FindProductNoPrefetch(r1, r2, wp);
502             return;
503         }
504
505         prefetchWorked = true;
506
507         for(unsigned int c = 0; c < cols2; c++)
508         {
509             for(unsigned int r = rowStart; r <= rowEnd; r++)
510             {
511                 double result = 0.0;
512                 for(unsigned int k = 0; k < cols1; k++)
513                 {
514                     double e1 = r1.get2(r,k);
515                     double e2 = r2.get2(k,c);
516                     result += e1 * e2;
517                 }
518
519                 //ckout << "[" << r << "," << c << "] = " << result << endl;
520
521                 wp(r,c) = result;
522             }
523             //ckout << thisIndex << "." << endl;
524         }
525
526         //arr1.Unlock(); arr2.Unlock();
527     }
528
529     // ============================= 2D ===================================
530     void FindProductNoPrefetch2DStripMined(MSA2DRowMjrC::Write &wp)
531     {
532         perfCheck(NEPP, cols1);
533         unsigned int rowStart, rowEnd, colStart, colEnd;
534         // fill in our portion of the C matrix
535         GetMyIndices(rows1, toX(), numWorkers2D(), rowStart, rowEnd);
536         GetMyIndices(cols2, toY(), numWorkers2D(), colStart, colEnd);
537
538         for(unsigned int c = colStart; c <= colEnd; c++) {
539             for(unsigned int r = rowStart; r <= rowEnd; r++) {
540
541                 double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
542                 double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
543
544                 double result = 0.0;
545                 for(unsigned int k = 0; k < cols1; k++) {
546                     double e1 = a[k];
547                     double e2 = b[k];
548                     result += e1 * e2;
549                 }
550 //              assert(!notequal(result, 1.0*cols1));
551
552                 wp.set(r,c) = result;
553             }
554         }
555     }
556
557     void FindProductNoPrefetch2D(MSA2DRowMjr::Read &r1,
558                                                                  MSA2DColMjr::Read &r2,
559                                                                  MSA2DRowMjrC::Write &wp)
560     {
561         unsigned int rowStart, rowEnd, colStart, colEnd;
562         // fill in our portion of the C matrix
563         GetMyIndices(rows1, toX(), numWorkers2D(), rowStart, rowEnd);
564         GetMyIndices(cols2, toY(), numWorkers2D(), colStart, colEnd);
565
566         for(unsigned int c = colStart; c <= colEnd; c++) {
567             for(unsigned int r = rowStart; r <= rowEnd; r++) {
568
569                 double result = 0.0;
570                 for(unsigned int k = 0; k < cols1; k++) {
571                     double e1 = r1.get(r,k);
572                     double e2 = r2.get(k,c);
573                     result += e1 * e2;
574                 }
575 //              assert(!notequal(result, 1.0*cols1));
576
577                 wp.set(r,c) = result;
578             }
579         }
580     }
581
582     // ============================= 3D ===================================
583     void FindProductNoPrefetch3D(MSA2DRowMjr::Read &r1,
584                                                                  MSA2DColMjr::Read &r2,
585                                                                  MSA2DRowMjrC::Accum &ap)
586     {
587         unsigned int rowStart, rowEnd, colStart, colEnd, kStart, kEnd;
588         // fill in our portion of the C matrix
589         GetMyIndices(rows1, toX3D(), numWorkers3D(), rowStart, rowEnd);
590         GetMyIndices(cols2, toY3D(), numWorkers3D(), colStart, colEnd);
591         GetMyIndices(cols1, toZ3D(), numWorkers3D(), kStart, kEnd);
592
593         for(unsigned int c = colStart; c <= colEnd; c++) {
594             for(unsigned int r = rowStart; r <= rowEnd; r++) {
595
596                 double result = 0.0;
597                 for(unsigned int k = kStart; k <= kEnd; k++) {
598                     double e1 = r1.get(r,k);
599                     double e2 = r2.get(k,c);
600                     result += e1 * e2;
601                 }
602 //              assert(!notequal(result, 1.0*cols1));
603
604                 ap(r,c) += result;
605             }
606         }
607     }
608
609     void FindProductNoPrefetch3DStripMined(MSA2DRowMjrC::Accum &ap)
610     {
611         perfCheck(NEPP, cols1);
612                 unsigned int rowStart, rowEnd, colStart, colEnd, kStart, kEnd;
613         // fill in our portion of the C matrix
614         GetMyIndices(rows1, toX3D(), numWorkers3D(), rowStart, rowEnd);
615         GetMyIndices(cols2, toY3D(), numWorkers3D(), colStart, colEnd);
616         GetMyIndices(cols1, toZ3D(), numWorkers3D(), kStart, kEnd);
617
618         for(unsigned int c = colStart; c <= colEnd; c++) {
619             for(unsigned int r = rowStart; r <= rowEnd; r++) {
620
621                 double* a = &(arr1.getPageBottom(arr1.getIndex(r,0),Read_Fault)); // ptr to row of A
622                 double* b = &(arr2.getPageBottom(arr2.getIndex(0,c),Read_Fault)); // ptr to col of B
623                 double result = 0.0;
624                 for(unsigned int k = kStart; k <= kEnd; k++) {
625                     double e1 = a[k];  // no get
626                     double e2 = b[k];  // no get
627                     result += e1 * e2;
628                 }
629 //              assert(!notequal(result, 1.0*cols1));
630
631                 ap(r,c) += result;
632             }
633         }
634     }
635
636     // ================================================================
637
638 public:
639     TestArray(const MSA2DRowMjr &arr1_, const MSA2DColMjr &arr2_, MSA2DRowMjrC &prod_,
640               unsigned int numWorkers_)
641         : arr1(arr1_), arr2(arr2_), prod(prod_), numWorkers(numWorkers_), prefetchWorked(false),
642           rows1(arr1.getRows()), cols1(arr1.getCols()),
643           rows2(arr2.getRows()), cols2(arr2.getCols())
644     {
645         // ckout << "w" << thisIndex << ":" << rows1 << " " << cols1 << " " << cols2 << endl;
646         times.push_back(CkWallTimer()); // 1
647         description.push_back("constr");
648     }
649
650     TestArray(CkMigrateMessage* m) {}
651
652     ~TestArray()
653     {
654     }
655
656     void Start()
657     {
658         times.push_back(CkWallTimer()); // 2
659         description.push_back("   start");
660
661         EnrollArrays();
662         times.push_back(CkWallTimer()); // 3
663         description.push_back("   enroll");
664
665                 MSA2DRowMjr::Write &w1 = arr1.getInitialWrite();
666                 MSA2DColMjr::Write &w2 = arr2.getInitialWrite();
667
668         if(verbose) ckout << thisIndex << ": filling" << endl;
669         switch(DECOMPOSITION){
670         case 1:
671         case 3:
672         case 4:
673         case 6:
674             FillArrays(w1, w2);
675             break;
676         case 2:
677         case 5:
678             FillArrays2D(w1, w2);
679             break;
680         }
681         times.push_back(CkWallTimer()); // 4
682         description.push_back("  fill");
683
684         if(verbose) ckout << thisIndex << ": syncing" << endl;
685         times.push_back(CkWallTimer()); // 5
686         description.push_back("    sync");
687
688 //         if (do_test) TestResults(0);
689
690         if(verbose) ckout << thisIndex << ": product" << endl;
691
692                 MSA2DRowMjr::Read &r1 = arr1.syncToRead(w1);
693                 MSA2DColMjr::Read &r2 = arr2.syncToRead(w2);
694
695                 hp = &(prod.getInitialWrite());
696                 MSA2DRowMjrC::Write &wp = * (MSA2DRowMjrC::Write *) hp;
697                 MSA2DRowMjrC::Accum &ap = * (MSA2DRowMjrC::Accum *) hp;
698
699         switch(DECOMPOSITION) {
700         case 1:
701             if (runPrefetchVersion)
702                 FindProductWithPrefetch(r1, r2, wp);
703             else
704                 FindProductNoPrefetch(r1, r2, wp);
705             break;
706         case 2:
707             FindProductNoPrefetch2D(r1, r2, wp);
708             break;
709         case 3:
710             FindProductNoPrefetch3D(r1, r2, ap);
711             break;
712         case 4:
713             FindProductNoPrefetchStripMined(wp);
714             break;
715         case 5:
716             FindProductNoPrefetch2DStripMined(wp);
717             break;
718         case 6:
719             FindProductNoPrefetch3DStripMined(ap);
720             break;
721         }
722         times.push_back(CkWallTimer()); // 6
723         description.push_back("    work");
724
725                 h1 = &r1;
726                 h2 = &r2;
727                 hp = &(prod.syncToRead(*hp));
728
729         Contribute();
730     }
731
732     void Kontinue()
733     {
734 //         if (do_test) TestResults(0);
735         times.push_back(CkWallTimer()); // 6
736         description.push_back("    redn");
737
738         if(verbose) ckout << thisIndex << ": testing" << endl;
739         if (do_test) TestResults(*h1, *h2, * (MSA2DRowMjrC::Read *) hp);
740         times.push_back(CkWallTimer()); // 5
741         description.push_back("    test");
742         Contribute();
743
744         if (detailedTimings) {
745             if (thisIndex == 0) {
746                 for(int i=1; i<description.length(); i++)
747                     ckout << description[i] << " ";
748                 ckout << endl;
749             }
750             ckout << "w" << thisIndex << ":";
751             for(int i=1; i<times.length(); i++)
752                 ckout << times[i]-times[i-1] << " ";
753             ckout << endl;
754         }
755     }
756 };
757
758 #include "t2d.def.h"