More unused variable cleanup
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
// Printf-shaped sink that accepts a format string plus any variadic
// arguments and discards all of them. DEBCHK is mapped onto this function
// so that debug trace calls compile but produce no output.
void noopit(const char* /*format*/, ...) { }
19
// Debug trace hook used throughout this file with printf-style arguments.
// The active definition routes every DEBCHK(...) call to noopit(), which
// ignores its arguments. The commented-out line is the previous alternative
// definition, kept for reference.
//#define DEBCHK  // CkPrintf
#define DEBCHK noopit

// DEBUGC(x) expands to its argument unchanged; the commented-out alternative
// expands to nothing, which would compile the wrapped code out.
#define DEBUGC(x) x
//#define DEBUGC(x) 

// Group ID of the checkpoint manager group. Assigned in the CkCheckpointInit
// constructor and used by CkStartCheckpoint to build the manager proxy.
CkGroupID _sysChkptMgr;
27
28 typedef struct _GroupInfo{
29         CkGroupID gID;
30         int MigCtor, DefCtor;
31         char name[256];
32 } GroupInfo;
33 PUPbytes(GroupInfo)
34 PUPmarshall(GroupInfo)
35
// Restart bookkeeping, all written by CkRestartMain:
int _inrestart = 0;      // 1 while CkRestartMain is running; reset to 0 just before it calls _initDone()
int _restarted = 0;      // set to 1 when CkRestartMain is entered; never cleared in this file
int _oldNumPes = 0;      // PE count read back from RO.dat, i.e. the count recorded at checkpoint time
int _chareRestored = 0;  // set to 1 after Chares_<pe>.dat was restored (only done when the PE count is unchanged)
40
41 void CkCreateLocalChare(int epIdx, envelope *env);
42
43 // help class to find how many array elements
44 class ElementCounter : public CkLocIterator {
45 private:
46         int count;
47 public:
48         ElementCounter():count(0){};
49         void addLocation(CkLocation &loc)  { count++; }
50         int getCount() { return count; }
51 };
52
53 // helper class to pup all elements that belong to same ckLocMgr
54 class ElementCheckpointer : public CkLocIterator {
55 private:
56         CkLocMgr *locMgr;
57         PUP::er &p;
58 public:
59         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
60         void addLocation(CkLocation &loc) {
61                 CkArrayIndex idx=loc.getIndex();
62                 CkGroupID gID = locMgr->ckGetGroupID();
63                 p|gID;      // store loc mgr's GID as well for easier restore
64                 p|idx;
65                 p|loc;
66                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
67         }
68 };
69
70
71 extern void _initDone();
72
73 static void bdcastRO(void){
74         int i;
75         //Determine the size of the RODataMessage
76         PUP::sizer ps;
77         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
78
79         //Allocate and fill out the RODataMessage
80         envelope *env = _allocEnv(RODataMsg, ps.size());
81         PUP::toMem pp((char *)EnvToUsr(env));
82         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
83         
84         env->setCount(++_numInitMsgs);
85         env->setSrcPe(CkMyPe());
86         CmiSetHandler(env, _roRestartHandlerIdx);
87         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
88 }
89
90 // Print out an array index to this string as decimal fields
91 // separated by underscores.
92 void printIndex(const CkArrayIndex &idx,char *dest) {
93         const int *idxData=idx.data();
94         for (int i=0;i<idx.nInts;i++) {
95                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
96                 dest+=strlen(dest);
97         }
98 }
99
100 static void checkpointOne(const char* dirname, CkCallback& cb);
101
102 // broadcast
103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
104         chkptStartTimer = CmiWallTimer();
105         // every body make dir in case it is local directory
106         CmiMkdir(dirname);
107
108         if (CkMyPe() == 0) {
109           checkpointOne(dirname, cb);
110         }
111
112         char fileName[1024];
113
114 #ifndef CMK_CHARE_USE_PTR
115         // save groups into Chares.dat
116         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
117         FILE* fChares = CmiFopen(fileName,"wb");
118         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
119         PUP::toDisk pChares(fChares);
120         CkPupChareData(pChares);
121         CmiFclose(fChares);
122 #endif
123
124         // save groups into Groups.dat
125         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
126         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
127         FILE* fGroups = CmiFopen(fileName,"wb");
128         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
129         PUP::toDisk pGroups(fGroups);
130 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
131     CkPupGroupData(pGroups,CmiTrue);
132 #else
133     CkPupGroupData(pGroups);
134 #endif
135         CmiFclose(fGroups);
136
137         // save nodegroups into NodeGroups.dat
138         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
139         if (CkMyRank() == 0) {
140           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
141           FILE* fNodeGroups = CmiFopen(fileName,"wb");
142           if(!fNodeGroups) 
143             CkAbort("Failed to create checkpoint file for nodegroup table!");
144           PUP::toDisk pNodeGroups(fNodeGroups);
145 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
146       CkPupNodeGroupData(pNodeGroups,CmiTrue);
147 #else
148       CkPupNodeGroupData(pNodeGroups);
149 #endif
150           CmiFclose(fNodeGroups);
151         }
152
153         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
154         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
155         FILE *datFile=CmiFopen(fileName,"wb");
156         if (datFile==NULL) CkAbort("Could not create data file");
157         PUP::toDisk  p(datFile);
158         CkPupArrayElementsData(p);
159         CmiFclose(datFile);
160
161 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
162         system("sync");
163 #endif
164
165         restartCB = cb;
166         DEBCHK("[%d]restartCB installed\n",CkMyPe());
167         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
168         contribute(0,NULL,CkReduction::sum_int,localcb);
169 }
170
171 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
172         delete m; 
173         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
174         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
175         restartCB.send(); 
176 }
177
178 void CkPupROData(PUP::er &p)
179 {
180         int _numReadonlies;
181         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
182         p|_numReadonlies;
183         if (p.isUnpacking()) {
184           if (_numReadonlies != _readonlyTable.size())
185             CkAbort("You cannot add readonlies and restore from checkpoint...");
186         }
187         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
188 }
189
190 // handle main chare
191 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
192 {
193         int nMains=_mainTable.size();
194         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
195         for(int i=0;i<nMains;i++){  /* Create all mainchares */
196                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
197                 int entryMigCtor = entry->getMigCtor();
198                 if(entryMigCtor!=-1) {
199                         Chare* obj;
200                         if (p.isUnpacking()) {
201                                 int size = entry->size;
202                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
203                                 obj = (Chare*)malloc(size);
204                                 _MEMCHECK(obj);
205                                 _mainTable[i]->setObj(obj);
206                                 //void *m = CkAllocSysMsg();
207                                 _entryTable[entryMigCtor]->call(args, obj);
208                         }
209                         else 
210                                 obj = (Chare *)_mainTable[i]->getObj();
211                         obj->pup(p);
212                 }
213         }
214         // to update mainchare proxy
215         // only readonly variables of Chare Proxy is taken care of here;
216         // in general, if chare proxy is contained in some data structure
217         // for example CkCallback, it is user's responsibility to
218         // update them after restarting
219         if (p.isUnpacking() && CkMyPe()==0)
220                 bdcastRO();
221 }
222
223 #ifndef CMK_CHARE_USE_PTR
224
225 CkpvExtern(CkVec<void *>, chare_objs);
226 CkpvExtern(CkVec<int>, chare_types);
227 CkpvExtern(CkVec<VidBlock *>, vidblocks);
228
229 // handle plain non-migratable chare
230 void CkPupChareData(PUP::er &p)
231 {
232   int i, n;
233   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
234   p|n;
235   for (i=0; i<n; i++) {
236         int chare_type;
237         if (!p.isUnpacking()) {
238                 chare_type = CkpvAccess(chare_types)[i];
239         }
240         p | chare_type;
241         if (p.isUnpacking()) {
242                 int migCtor = _chareTable[chare_type]->migCtor;
243                 if(migCtor==-1) {
244                         char buf[512];
245                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
246                         CkAbort(buf);
247                 }
248                 void *m = CkAllocSysMsg();
249                 envelope* env = UsrToEnv((CkMessage *)m);
250                 CkCreateLocalChare(migCtor, env);
251                 CkFreeSysMsg(m);
252         }
253         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
254         obj->pup(p);
255   }
256
257   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
258   p|n;
259   for (i=0; i<n; i++) {
260         VidBlock *v;
261         if (p.isUnpacking()) {
262                 v = new VidBlock();
263                 CkpvAccess(vidblocks).push_back(v);
264         }
265         else
266                 v = CkpvAccess(vidblocks)[i];
267         v->pup(p);
268   }
269 }
270 #else
// Variant compiled when CMK_CHARE_USE_PTR is defined: plain chares are not
// saved or restored in that configuration, so the pupper is left untouched.
void CkPupChareData(PUP::er &p)
{
   // not implemented
}
275 #endif
276
277 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
278 // handle GroupTable and data
279 void CkPupGroupData(PUP::er &p, CmiBool create)
280 {
281         int numGroups, i;
282
283         if (!p.isUnpacking()) {
284           numGroups = CkpvAccess(_groupIDTable)->size();
285         }
286         p|numGroups;
287         if (p.isUnpacking()) {
288           if(CkMyPe()==0)  
289             CkpvAccess(_numGroups) = numGroups+1; 
290           else 
291             CkpvAccess(_numGroups) = 1;
292         }
293         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
294
295         GroupInfo *tmpInfo = new GroupInfo [numGroups];
296         if (!p.isUnpacking()) {
297           for(i=0;i<numGroups;i++) {
298                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
299                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
300                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
301                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
302                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
303                 //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
304
305                 if(tmpInfo[i].MigCtor==-1) {
306                         char buf[512];
307                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
308                         CkAbort(buf);
309                 }
310           }
311         }
312         for (i=0; i<numGroups; i++) p|tmpInfo[i];
313
314         for(i=0;i<numGroups;i++) 
315         {
316           CkGroupID gID = tmpInfo[i].gID;
317           if (p.isUnpacking()) {
318             //CkpvAccess(_groupIDTable)->push_back(gID);
319             int eIdx = tmpInfo[i].MigCtor;
320             // error checking
321             if (eIdx == -1) {
322               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
323             }
324             void *m = CkAllocSysMsg();
325             envelope* env = UsrToEnv((CkMessage *)m);
326                 if(create)
327                     CkCreateLocalGroup(gID, eIdx, env);
328           }   // end of unPacking
329           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
330           // if using migration constructor, you'd better have a pup
331                 if(!create)
332                         gobj->mlogData->teamRecoveryFlag = 1;
333           gobj->pup(p);
334          // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
335         }
336         delete [] tmpInfo;
337 }
338
339 // handle NodeGroupTable and data
340 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
341 {
342         int numNodeGroups, i;
343         if (!p.isUnpacking()) {
344           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
345         }
346         p|numNodeGroups;
347         if (p.isUnpacking()) {
348           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
349           else { CksvAccess(_numNodeGroups) = 1; }
350         }
351         if(CkMyPe() == 3)
352         CkPrintf("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
353
354         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
355         if (!p.isUnpacking()) {
356           for(i=0;i<numNodeGroups;i++) {
357                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
358                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
359                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
360                 if(tmpInfo[i].MigCtor==-1) {
361                         char buf[512];
362                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
363                                      declared as [migratable] in .ci to be able to checkpoint.",\
364                                      _chareTable[ent2.getcIdx()]->name);
365                         CkAbort(buf);
366                 }
367           }
368         }
369         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
370         for (i=0;i<numNodeGroups;i++) {
371                 CkGroupID gID = tmpInfo[i].gID;
372                 if (p.isUnpacking()) {
373                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
374                         int eIdx = tmpInfo[i].MigCtor;
375                         void *m = CkAllocSysMsg();
376                         envelope* env = UsrToEnv((CkMessage *)m);
377                         if(create){
378                                 CkCreateLocalNodeGroup(gID, eIdx, env);
379                         }
380                 }
381                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
382                 IrrGroup *obj = ent2.getObj();
383                 obj->pup(p);
384                 if(CkMyPe() == 3) CkPrintf("Nodegroup PUP'ed: gid = %d, name = %s\n",
385                         obj->ckGetGroupID().idx,
386                         _chareTable[ent2.getcIdx()]->name);
387         }
388         delete [] tmpInfo;
389 }
390 #else
391 // handle GroupTable and data
392 void CkPupGroupData(PUP::er &p)
393 {
394         int numGroups, i;
395
396         if (!p.isUnpacking()) {
397           numGroups = CkpvAccess(_groupIDTable)->size();
398         }
399         p|numGroups;
400         if (p.isUnpacking()) {
401           if(CkMyPe()==0)  
402             CkpvAccess(_numGroups) = numGroups+1; 
403           else 
404             CkpvAccess(_numGroups) = 1;
405         }
406         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
407
408         GroupInfo *tmpInfo = new GroupInfo [numGroups];
409         if (!p.isUnpacking()) {
410           for(i=0;i<numGroups;i++) {
411                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
412                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
413                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
414                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
415                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
416                 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
417                         CkMyPe(), p.typeString(), tmpInfo[i].name);
418
419                 if(tmpInfo[i].MigCtor==-1) {
420                         char buf[512];
421                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
422                         CkAbort(buf);
423                 }
424           }
425         }
426         for (i=0; i<numGroups; i++) p|tmpInfo[i];
427
428         for(i=0;i<numGroups;i++) 
429         {
430           CkGroupID gID = tmpInfo[i].gID;
431           if (p.isUnpacking()) {
432             //CkpvAccess(_groupIDTable)->push_back(gID);
433             int eIdx = tmpInfo[i].MigCtor;
434             // error checking
435             if (eIdx == -1) {
436               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
437             }
438             void *m = CkAllocSysMsg();
439             envelope* env = UsrToEnv((CkMessage *)m);
440             CkCreateLocalGroup(gID, eIdx, env);
441           }   // end of unPacking
442           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
443           // if using migration constructor, you'd better have a pup
444           gobj->pup(p);
445           DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
446                         gobj->ckGetGroupID().idx, tmpInfo[i].name);
447         }
448         delete [] tmpInfo;
449 }
450
451 // handle NodeGroupTable and data
452 void CkPupNodeGroupData(PUP::er &p)
453 {
454         int numNodeGroups, i;
455         if (!p.isUnpacking()) {
456           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
457         }
458         p|numNodeGroups;
459         if (p.isUnpacking()) {
460           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
461           else { CksvAccess(_numNodeGroups) = 1; }
462         }
463         DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
464
465         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
466         if (!p.isUnpacking()) {
467           for(i=0;i<numNodeGroups;i++) {
468                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
469                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
470                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
471                 if(tmpInfo[i].MigCtor==-1) {
472                         char buf[512];
473                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
474                                      declared as [migratable] in .ci to be able to checkpoint.",\
475                                      _chareTable[ent2.getcIdx()]->name);
476                         CkAbort(buf);
477                 }
478           }
479         }
480         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
481         for (i=0;i<numNodeGroups;i++) {
482                 CkGroupID gID = tmpInfo[i].gID;
483                 if (p.isUnpacking()) {
484                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
485                         int eIdx = tmpInfo[i].MigCtor;
486                         void *m = CkAllocSysMsg();
487                         envelope* env = UsrToEnv((CkMessage *)m);
488                         CkCreateLocalNodeGroup(gID, eIdx, env);
489                 }
490                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
491                 IrrGroup *obj = ent2.getObj();
492                 obj->pup(p);
493                 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
494                         obj->ckGetGroupID().idx,
495                         _chareTable[ent2.getcIdx()]->name);
496         }
497         delete [] tmpInfo;
498 }
499 #endif
500
501 // handle chare array elements for this processor
502 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
503 {
504         int i;
505         // safe in both packing/unpakcing at this stage
506         int numGroups = CkpvAccess(_groupIDTable)->size();
507
508         // number of array elements on this processor
509         int numElements;
510         if (!p.isUnpacking()) {
511           ElementCounter  counter;
512           CKLOCMGR_LOOP(mgr->iterate(counter););
513           numElements = counter.getCount();
514         }
515         p|numElements;
516
517         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
518
519         if (!p.isUnpacking())
520         {
521           // let CkLocMgr to iterate and store every array elements
522           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
523         }
524         else {
525           // loop and create all array elements ourselves
526           //CkPrintf("total chare array cnts: %d\n", numElements);
527           for (int i=0; i<numElements; i++) {
528                 CkGroupID gID;
529                 CkArrayIndex idx;
530                 p|gID;
531                 p|idx;
532                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
533                 if (notifyListeners){
534                   mgr->resume(idx,p,CmiTrue);
535                 }
536                 else{
537                   mgr->restore(idx,p);
538                 }
539           }
540         }
541         // finish up
542         if (notifyListeners)
543         for(i=0;i<numGroups;i++) {
544                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
545                 obj->ckJustMigrated();
546         }
547 }
548
549 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
550 int  CkCountArrayElements(){
551     int numGroups = CkpvAccess(_groupIDTable)->size();
552     int i;
553     ElementCounter  counter;
554     CKLOCMGR_LOOP(mgr->iterate(counter););
555   int numElements = counter.getCount();
556     return numElements;
557 }
558 #endif
559
560 void CkPupProcessorData(PUP::er &p)
561 {
562     // save readonlys, and callback BTW
563     if(CkMyRank()==0) {
564         CkPupROData(p);
565     }
566
567     // save mainchares into MainChares.dat
568     if(CkMyPe()==0) {
569       CkPupMainChareData(p, NULL);
570     }
571         
572     // save non-migratable chare
573     CkPupChareData(p);
574
575     // save groups 
576 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
577     CkPupGroupData(p,CmiTrue);
578 #else
579     CkPupGroupData(p);
580 #endif
581
582     // save nodegroups
583     if(CkMyRank()==0) {
584 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
585         CkPupNodeGroupData(p,CmiTrue);  
586 #else
587         CkPupNodeGroupData(p);
588 #endif
589     }
590
591     // pup array elements
592     CkPupArrayElementsData(p);
593 }
594
595 // called only on pe 0
596 static void checkpointOne(const char* dirname, CkCallback& cb){
597         CmiAssert(CkMyPe()==0);
598         char filename[1024];
599         
600         // save readonlys, and callback BTW
601         sprintf(filename,"%s/RO.dat",dirname);
602         FILE* fRO = CmiFopen(filename,"wb");
603         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
604         PUP::toDisk pRO(fRO);
605         int _numPes = CkNumPes();
606         pRO|_numPes;
607         CkPupROData(pRO);
608         pRO|cb;
609         CmiFclose(fRO);
610
611         // save mainchares into MainChares.dat
612         {
613                 sprintf(filename,"%s/MainChares.dat",dirname);
614                 FILE* fMain = CmiFopen(filename,"wb");
615                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
616                 PUP::toDisk pMain(fMain);
617                 CkPupMainChareData(pMain, NULL);
618                 CmiFclose(fMain);
619         }
620 }
621
622 void CkRemoveArrayElements()
623 {
624   int i;
625   int numGroups = CkpvAccess(_groupIDTable)->size();
626   CKLOCMGR_LOOP(mgr->flushAllRecs(););
627 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
628   for(i=0; i<numGroups; i++){
629     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
630     if(obj->isLocMgr()) {
631         CkLocMgr *mgr = (CkLocMgr *)obj;
632         mgr->flushAllRecs();
633     }
634   }*/
635 }
636
637 /*
638 void CkTestArrayElements()
639 {
640   int i;
641   int numGroups = CkpvAccess(_groupIDTable)->size();
642   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
643   GroupTable *gTbl = CkpvAccess(_groupTable);
644   for(i=0; i<numGroups; i++){
645     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
646     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
647   }
648 }
649 */
650
651 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
652 {
653         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
654         
655         // hand over to checkpoint managers for per-processor checkpointing
656         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
657 }
658
659 /**
660   * Restart: There's no such object as restart manager is created
661   *          because a group cannot restore itself anyway.
662   *          The mechanism exists as converse code and get invoked by
663   *          broadcast message.
664   **/
665
666 void CkRestartMain(const char* dirname, CkArgMsg *args){
667         int i;
668         char filename[1024];
669         CkCallback cb;
670         
671         _inrestart = 1;
672         _restarted = 1;
673
674         // restore readonlys
675         sprintf(filename,"%s/RO.dat",dirname);
676         FILE* fRO = CmiFopen(filename,"rb");
677         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
678         int _numPes = -1;
679         PUP::fromDisk pRO(fRO);
680         pRO|_numPes;
681         CkPupROData(pRO);
682         pRO|cb;
683         CmiFclose(fRO);
684         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
685         _oldNumPes = _numPes;
686
687         CmiNodeBarrier();
688
689         // restore mainchares
690         sprintf(filename,"%s/MainChares.dat",dirname);
691         FILE* fMain = CmiFopen(filename,"rb");
692         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
693                 PUP::fromDisk pMain(fMain);
694                 CkPupMainChareData(pMain, args);
695                 CmiFclose(fMain);
696                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
697                 //bdcastRO(); // moved to CkPupMainChareData()
698         }
699         
700 #ifndef CMK_CHARE_USE_PTR
701         // restore chares only when number of pes is the same 
702         if(CkNumPes() == _numPes) {
703                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
704                 FILE* fChares = CmiFopen(filename,"rb");
705                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
706                 PUP::fromDisk pChares(fChares);
707                 CkPupChareData(pChares);
708                 CmiFclose(fChares);
709                 _chareRestored = 1;
710         }
711 #endif
712
713         // restore groups
714         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
715         // restore from PE0's copy if shrink/expand
716         if(CkNumPes() != _numPes)
717                 sprintf(filename,"%s/Groups_0.dat",dirname);
718         else
719                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
720         FILE* fGroups = CmiFopen(filename,"rb");
721         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
722         PUP::fromDisk pGroups(fGroups);
723 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
724     CkPupGroupData(pGroups,CmiTrue);
725 #else
726     CkPupGroupData(pGroups);
727 #endif
728         CmiFclose(fGroups);
729
730         // restore nodegroups
731         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
732         if(CkMyRank()==0){
733                 if(CkNumPes() != _numPes)
734                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
735                 else
736                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
737                 FILE* fNodeGroups = CmiFopen(filename,"rb");
738                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
739                 PUP::fromDisk pNodeGroups(fNodeGroups);
740 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
741         CkPupNodeGroupData(pNodeGroups,CmiTrue);
742 #else
743         CkPupNodeGroupData(pNodeGroups);
744 #endif
745                 CmiFclose(fNodeGroups);
746         }
747
748         // for each location, restore arrays
749         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
750         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
751         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
752           for (i=0; i<_numPes;i++) {
753             if (i%CkNumPes() == CkMyPe()) {
754               sprintf(filename,"%s/arr_%d.dat",dirname, i);
755               FILE *datFile=CmiFopen(filename,"rb");
756               if (datFile==NULL) CkAbort("Could not read data file");
757               PUP::fromDisk  p(datFile);
758               CkPupArrayElementsData(p);
759               CmiFclose(datFile);
760             }
761           }
762
763         _inrestart = 0;
764
765         _initDone();
766
767         if(CkMyPe()==0) {
768                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
769                 cb.send();
770         }
771 }
772
773 // Main chare: initialize system checkpoint manager
774 class CkCheckpointInit : public Chare {
775 public:
776   CkCheckpointInit(CkArgMsg *msg) {
777     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
778     delete msg;
779   }
780   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
781 };
782
783 #include "CkCheckpoint.def.h"
784