e3b513b96cdfb1ce847f63bb582b36c098e86f54
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
// Do-nothing stand-in with a printf-like signature: the format string and
// any further arguments are accepted and ignored.
void noopit(const char* /*fmt*/, ...)
{
}

// Checkpoint debug tracing.  DEBCHK currently expands to noopit, so the
// DEBCHK(...) calls in this file print nothing; switch to the CkPrintf
// definition to see the trace.
//#define DEBCHK   CkPrintf
#define DEBCHK noopit

// DEBUGC(x) currently expands to x itself; the alternative definition
// below would drop x.
#define DEBUGC(x) x
//#define DEBUGC(x) 
25
// Group ID of the system checkpoint manager (CkCheckpointMgr).  Assigned in
// the CkCheckpointInit constructor and used by CkStartCheckpoint to build
// the manager's proxy.
CkGroupID _sysChkptMgr;
27
28 typedef struct _GroupInfo{
29         CkGroupID gID;
30         int MigCtor, DefCtor;
31         char name[256];
32 } GroupInfo;
33 PUPbytes(GroupInfo)
34 PUPmarshall(GroupInfo)
35
// Restart state, all written by CkRestartMain in this file.
// 1 while CkRestartMain is running; reset to 0 just before _initDone().
int _inrestart = 0;
// Set to 1 at the start of CkRestartMain; not reset in this file.
int _restarted = 0;
// Number of PEs recorded in the checkpoint's RO.dat.
int _oldNumPes = 0;
// Set to 1 once the per-PE Chares_<pe>.dat file has been restored.
int _chareRestored = 0;
40
41 void CkCreateLocalChare(int epIdx, envelope *env);
42
43 // help class to find how many array elements
44 class ElementCounter : public CkLocIterator {
45 private:
46         int count;
47 public:
48         ElementCounter():count(0){};
49         void addLocation(CkLocation &loc)  { count++; }
50         int getCount() { return count; }
51 };
52
53 // helper class to pup all elements that belong to same ckLocMgr
54 class ElementCheckpointer : public CkLocIterator {
55 private:
56         CkLocMgr *locMgr;
57         PUP::er &p;
58 public:
59         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
60         void addLocation(CkLocation &loc) {
61                 CkArrayIndex idx=loc.getIndex();
62                 CkGroupID gID = locMgr->ckGetGroupID();
63                 p|gID;      // store loc mgr's GID as well for easier restore
64                 p|idx;
65                 p|loc;
66                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
67         }
68 };
69
70
71 extern void _initDone();
72
73 static void bdcastRO(void){
74         int i;
75         //Determine the size of the RODataMessage
76         PUP::sizer ps;
77         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
78
79         //Allocate and fill out the RODataMessage
80         envelope *env = _allocEnv(RODataMsg, ps.size());
81         PUP::toMem pp((char *)EnvToUsr(env));
82         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
83         
84         env->setCount(++_numInitMsgs);
85         env->setSrcPe(CkMyPe());
86         CmiSetHandler(env, _roRestartHandlerIdx);
87         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
88 }
89
90 // Print out an array index to this string as decimal fields
91 // separated by underscores.
92 void printIndex(const CkArrayIndex &idx,char *dest) {
93         const int *idxData=idx.data();
94         for (int i=0;i<idx.nInts;i++) {
95                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
96                 dest+=strlen(dest);
97         }
98 }
99
100 static void checkpointOne(const char* dirname, CkCallback& cb);
101
102 // broadcast
103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
104         chkptStartTimer = CmiWallTimer();
105         // every body make dir in case it is local directory
106         CmiMkdir(dirname);
107
108         if (CkMyPe() == 0) {
109           checkpointOne(dirname, cb);
110         }
111
112         char fileName[1024];
113
114 #ifndef CMK_CHARE_USE_PTR
115         // save groups into Chares.dat
116         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
117         FILE* fChares = CmiFopen(fileName,"wb");
118         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
119         PUP::toDisk pChares(fChares);
120         CkPupChareData(pChares);
121         CmiFclose(fChares);
122 #endif
123
124         // save groups into Groups.dat
125         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
126         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
127         FILE* fGroups = CmiFopen(fileName,"wb");
128         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
129         PUP::toDisk pGroups(fGroups);
130 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
131     CkPupGroupData(pGroups,CmiTrue);
132 #else
133     CkPupGroupData(pGroups);
134 #endif
135         CmiFclose(fGroups);
136
137         // save nodegroups into NodeGroups.dat
138         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
139         if (CkMyRank() == 0) {
140           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
141           FILE* fNodeGroups = CmiFopen(fileName,"wb");
142           if(!fNodeGroups) 
143             CkAbort("Failed to create checkpoint file for nodegroup table!");
144           PUP::toDisk pNodeGroups(fNodeGroups);
145 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
146       CkPupNodeGroupData(pNodeGroups,CmiTrue);
147 #else
148       CkPupNodeGroupData(pNodeGroups);
149 #endif
150           CmiFclose(fNodeGroups);
151         }
152
153         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
154         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
155         FILE *datFile=CmiFopen(fileName,"wb");
156         if (datFile==NULL) CkAbort("Could not create data file");
157         PUP::toDisk  p(datFile);
158         CkPupArrayElementsData(p);
159         CmiFclose(datFile);
160
161 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
162         system("sync");
163 #endif
164
165         restartCB = cb;
166         DEBCHK("[%d]restartCB installed\n",CkMyPe());
167         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
168         //contribute(0,NULL,CkReduction::sum_int,localcb);
169         barrier(localcb);
170 }
171
172 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
173         delete m; 
174         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
175         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
176         restartCB.send(); 
177 }
178
179 void CkPupROData(PUP::er &p)
180 {
181         int _numReadonlies;
182         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
183         p|_numReadonlies;
184         if (p.isUnpacking()) {
185           if (_numReadonlies != _readonlyTable.size())
186             CkAbort("You cannot add readonlies and restore from checkpoint...");
187         }
188         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
189 }
190
191 // handle main chare
192 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
193 {
194         int nMains=_mainTable.size();
195         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
196         for(int i=0;i<nMains;i++){  /* Create all mainchares */
197                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
198                 int entryMigCtor = entry->getMigCtor();
199                 if(entryMigCtor!=-1) {
200                         Chare* obj;
201                         if (p.isUnpacking()) {
202                                 int size = entry->size;
203                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
204                                 obj = (Chare*)malloc(size);
205                                 _MEMCHECK(obj);
206                                 _mainTable[i]->setObj(obj);
207                                 //void *m = CkAllocSysMsg();
208                                 _entryTable[entryMigCtor]->call(args, obj);
209                         }
210                         else 
211                                 obj = (Chare *)_mainTable[i]->getObj();
212                         obj->pup(p);
213                 }
214         }
215         // to update mainchare proxy
216         // only readonly variables of Chare Proxy is taken care of here;
217         // in general, if chare proxy is contained in some data structure
218         // for example CkCallback, it is user's responsibility to
219         // update them after restarting
220         if (p.isUnpacking() && CkMyPe()==0)
221                 bdcastRO();
222 }
223
224 #ifndef CMK_CHARE_USE_PTR
225
226 CkpvExtern(CkVec<void *>, chare_objs);
227 CkpvExtern(CkVec<int>, chare_types);
228 CkpvExtern(CkVec<VidBlock *>, vidblocks);
229
230 // handle plain non-migratable chare
231 void CkPupChareData(PUP::er &p)
232 {
233   int i, n;
234   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
235   p|n;
236   for (i=0; i<n; i++) {
237         int chare_type;
238         if (!p.isUnpacking()) {
239                 chare_type = CkpvAccess(chare_types)[i];
240         }
241         p | chare_type;
242         if (p.isUnpacking()) {
243                 int migCtor = _chareTable[chare_type]->migCtor;
244                 if(migCtor==-1) {
245                         char buf[512];
246                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
247                         CkAbort(buf);
248                 }
249                 void *m = CkAllocSysMsg();
250                 envelope* env = UsrToEnv((CkMessage *)m);
251                 CkCreateLocalChare(migCtor, env);
252                 CkFreeSysMsg(m);
253         }
254         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
255         obj->pup(p);
256   }
257
258   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
259   p|n;
260   for (i=0; i<n; i++) {
261         VidBlock *v;
262         if (p.isUnpacking()) {
263                 v = new VidBlock();
264                 CkpvAccess(vidblocks).push_back(v);
265         }
266         else
267                 v = CkpvAccess(vidblocks)[i];
268         v->pup(p);
269   }
270 }
271 #else
// Variant compiled when CMK_CHARE_USE_PTR is defined: plain chares are
// neither saved nor restored, and p is left untouched.
void CkPupChareData(PUP::er &p)
{
   // not implemented
}
276 #endif
277
278 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
279 // handle GroupTable and data
280 void CkPupGroupData(PUP::er &p, CmiBool create)
281 {
282         int numGroups, i;
283
284         if (!p.isUnpacking()) {
285           numGroups = CkpvAccess(_groupIDTable)->size();
286         }
287         p|numGroups;
288         if (p.isUnpacking()) {
289           if(CkMyPe()==0)  
290             CkpvAccess(_numGroups) = numGroups+1; 
291           else 
292             CkpvAccess(_numGroups) = 1;
293         }
294         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
295
296         GroupInfo *tmpInfo = new GroupInfo [numGroups];
297         if (!p.isUnpacking()) {
298           for(i=0;i<numGroups;i++) {
299                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
300                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
301                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
302                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
303                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
304                 //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
305
306                 if(tmpInfo[i].MigCtor==-1) {
307                         char buf[512];
308                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
309                         CkAbort(buf);
310                 }
311           }
312         }
313         for (i=0; i<numGroups; i++) p|tmpInfo[i];
314
315         for(i=0;i<numGroups;i++) 
316         {
317           CkGroupID gID = tmpInfo[i].gID;
318           if (p.isUnpacking()) {
319             //CkpvAccess(_groupIDTable)->push_back(gID);
320             int eIdx = tmpInfo[i].MigCtor;
321             // error checking
322             if (eIdx == -1) {
323               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
324             }
325             void *m = CkAllocSysMsg();
326             envelope* env = UsrToEnv((CkMessage *)m);
327                 if(create)
328                     CkCreateLocalGroup(gID, eIdx, env);
329           }   // end of unPacking
330           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
331           // if using migration constructor, you'd better have a pup
332                 if(!create)
333                         gobj->mlogData->teamRecoveryFlag = 1;
334           gobj->pup(p);
335          // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
336         }
337         delete [] tmpInfo;
338 }
339
340 // handle NodeGroupTable and data
341 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
342 {
343         int numNodeGroups, i;
344         if (!p.isUnpacking()) {
345           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
346         }
347         p|numNodeGroups;
348         if (p.isUnpacking()) {
349           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
350           else { CksvAccess(_numNodeGroups) = 1; }
351         }
352
353         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
354         if (!p.isUnpacking()) {
355           for(i=0;i<numNodeGroups;i++) {
356                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
357                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
358                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
359                 if(tmpInfo[i].MigCtor==-1) {
360                         char buf[512];
361                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
362                                      declared as [migratable] in .ci to be able to checkpoint.",\
363                                      _chareTable[ent2.getcIdx()]->name);
364                         CkAbort(buf);
365                 }
366           }
367         }
368         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
369         for (i=0;i<numNodeGroups;i++) {
370                 CkGroupID gID = tmpInfo[i].gID;
371                 if (p.isUnpacking()) {
372                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
373                         int eIdx = tmpInfo[i].MigCtor;
374                         void *m = CkAllocSysMsg();
375                         envelope* env = UsrToEnv((CkMessage *)m);
376                         if(create){
377                                 CkCreateLocalNodeGroup(gID, eIdx, env);
378                         }
379                 }
380                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
381                 IrrGroup *obj = ent2.getObj();
382                 obj->pup(p);
383         }
384         delete [] tmpInfo;
385 }
386 #else
387 // handle GroupTable and data
388 void CkPupGroupData(PUP::er &p)
389 {
390         int numGroups, i;
391
392         if (!p.isUnpacking()) {
393           numGroups = CkpvAccess(_groupIDTable)->size();
394         }
395         p|numGroups;
396         if (p.isUnpacking()) {
397           if(CkMyPe()==0)  
398             CkpvAccess(_numGroups) = numGroups+1; 
399           else 
400             CkpvAccess(_numGroups) = 1;
401         }
402         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
403
404         GroupInfo *tmpInfo = new GroupInfo [numGroups];
405         if (!p.isUnpacking()) {
406           for(i=0;i<numGroups;i++) {
407                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
408                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
409                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
410                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
411                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
412                 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
413                         CkMyPe(), p.typeString(), tmpInfo[i].name);
414
415                 if(tmpInfo[i].MigCtor==-1) {
416                         char buf[512];
417                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
418                         CkAbort(buf);
419                 }
420           }
421         }
422         for (i=0; i<numGroups; i++) p|tmpInfo[i];
423
424         for(i=0;i<numGroups;i++) 
425         {
426           CkGroupID gID = tmpInfo[i].gID;
427           if (p.isUnpacking()) {
428             //CkpvAccess(_groupIDTable)->push_back(gID);
429             int eIdx = tmpInfo[i].MigCtor;
430             // error checking
431             if (eIdx == -1) {
432               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
433             }
434             void *m = CkAllocSysMsg();
435             envelope* env = UsrToEnv((CkMessage *)m);
436             CkCreateLocalGroup(gID, eIdx, env);
437           }   // end of unPacking
438           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
439           // if using migration constructor, you'd better have a pup
440           gobj->pup(p);
441           DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
442                         gobj->ckGetGroupID().idx, tmpInfo[i].name);
443         }
444         delete [] tmpInfo;
445 }
446
447 // handle NodeGroupTable and data
448 void CkPupNodeGroupData(PUP::er &p)
449 {
450         int numNodeGroups, i;
451         if (!p.isUnpacking()) {
452           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
453         }
454         p|numNodeGroups;
455         if (p.isUnpacking()) {
456           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
457           else { CksvAccess(_numNodeGroups) = 1; }
458         }
459         DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
460
461         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
462         if (!p.isUnpacking()) {
463           for(i=0;i<numNodeGroups;i++) {
464                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
465                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
466                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
467                 if(tmpInfo[i].MigCtor==-1) {
468                         char buf[512];
469                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
470                                      declared as [migratable] in .ci to be able to checkpoint.",\
471                                      _chareTable[ent2.getcIdx()]->name);
472                         CkAbort(buf);
473                 }
474           }
475         }
476         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
477         for (i=0;i<numNodeGroups;i++) {
478                 CkGroupID gID = tmpInfo[i].gID;
479                 if (p.isUnpacking()) {
480                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
481                         int eIdx = tmpInfo[i].MigCtor;
482                         void *m = CkAllocSysMsg();
483                         envelope* env = UsrToEnv((CkMessage *)m);
484                         CkCreateLocalNodeGroup(gID, eIdx, env);
485                 }
486                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
487                 IrrGroup *obj = ent2.getObj();
488                 obj->pup(p);
489                 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
490                         obj->ckGetGroupID().idx,
491                         _chareTable[ent2.getcIdx()]->name);
492         }
493         delete [] tmpInfo;
494 }
495 #endif
496
497 // handle chare array elements for this processor
498 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
499 {
500         int i;
501         // safe in both packing/unpakcing at this stage
502         int numGroups = CkpvAccess(_groupIDTable)->size();
503
504         // number of array elements on this processor
505         int numElements;
506         if (!p.isUnpacking()) {
507           ElementCounter  counter;
508           CKLOCMGR_LOOP(mgr->iterate(counter););
509           numElements = counter.getCount();
510         }
511         p|numElements;
512
513         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
514
515         if (!p.isUnpacking())
516         {
517           // let CkLocMgr to iterate and store every array elements
518           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
519         }
520         else {
521           // loop and create all array elements ourselves
522           //CkPrintf("total chare array cnts: %d\n", numElements);
523           for (int i=0; i<numElements; i++) {
524                 CkGroupID gID;
525                 CkArrayIndex idx;
526                 p|gID;
527                 p|idx;
528                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
529                 if (notifyListeners){
530                   mgr->resume(idx,p,CmiTrue);
531                 }
532                 else{
533                   mgr->restore(idx,p);
534                 }
535           }
536         }
537         // finish up
538         if (notifyListeners)
539         for(i=0;i<numGroups;i++) {
540                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
541                 obj->ckJustMigrated();
542         }
543 }
544
545 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) ||CMK_MEM_CHECKPOINT
546 int  CkCountArrayElements(){
547     int numGroups = CkpvAccess(_groupIDTable)->size();
548     int i;
549     ElementCounter  counter;
550     CKLOCMGR_LOOP(mgr->iterate(counter););
551   int numElements = counter.getCount();
552     return numElements;
553 }
554 #endif
555
556 void CkPupProcessorData(PUP::er &p)
557 {
558     // save readonlys, and callback BTW
559     if(CkMyRank()==0) {
560         CkPupROData(p);
561     }
562
563     // save mainchares into MainChares.dat
564     if(CkMyPe()==0) {
565       CkPupMainChareData(p, NULL);
566     }
567         
568     // save non-migratable chare
569     CkPupChareData(p);
570
571     // save groups 
572 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
573     CkPupGroupData(p,CmiTrue);
574 #else
575     CkPupGroupData(p);
576 #endif
577
578     // save nodegroups
579     if(CkMyRank()==0) {
580 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
581         CkPupNodeGroupData(p,CmiTrue);  
582 #else
583         CkPupNodeGroupData(p);
584 #endif
585     }
586
587     // pup array elements
588     CkPupArrayElementsData(p);
589 }
590
591 // called only on pe 0
592 static void checkpointOne(const char* dirname, CkCallback& cb){
593         CmiAssert(CkMyPe()==0);
594         char filename[1024];
595         
596         // save readonlys, and callback BTW
597         sprintf(filename,"%s/RO.dat",dirname);
598         FILE* fRO = CmiFopen(filename,"wb");
599         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
600         PUP::toDisk pRO(fRO);
601         int _numPes = CkNumPes();
602         pRO|_numPes;
603         CkPupROData(pRO);
604         pRO|cb;
605         CmiFclose(fRO);
606
607         // save mainchares into MainChares.dat
608         {
609                 sprintf(filename,"%s/MainChares.dat",dirname);
610                 FILE* fMain = CmiFopen(filename,"wb");
611                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
612                 PUP::toDisk pMain(fMain);
613                 CkPupMainChareData(pMain, NULL);
614                 CmiFclose(fMain);
615         }
616 }
617
618 void CkRemoveArrayElements()
619 {
620   int i;
621   int numGroups = CkpvAccess(_groupIDTable)->size();
622   CKLOCMGR_LOOP(mgr->flushAllRecs(););
623 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
624   for(i=0; i<numGroups; i++){
625     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
626     if(obj->isLocMgr()) {
627         CkLocMgr *mgr = (CkLocMgr *)obj;
628         mgr->flushAllRecs();
629     }
630   }*/
631 }
632
633 /*
634 void CkTestArrayElements()
635 {
636   int i;
637   int numGroups = CkpvAccess(_groupIDTable)->size();
638   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
639   GroupTable *gTbl = CkpvAccess(_groupTable);
640   for(i=0; i<numGroups; i++){
641     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
642     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
643   }
644 }
645 */
646
647 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
648 {
649         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
650         
651         // hand over to checkpoint managers for per-processor checkpointing
652         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
653 }
654
/**
  * Restart: No restart-manager object is created, because a group
  *          cannot restore itself anyway.
  *          Instead, the mechanism exists as converse code and gets
  *          invoked by a broadcast message.
  **/
661
662 void CkRestartMain(const char* dirname, CkArgMsg *args){
663         int i;
664         char filename[1024];
665         CkCallback cb;
666         
667         _inrestart = 1;
668         _restarted = 1;
669         CkMemCheckPT::inRestarting = 1;
670
671         // restore readonlys
672         sprintf(filename,"%s/RO.dat",dirname);
673         FILE* fRO = CmiFopen(filename,"rb");
674         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
675         int _numPes = -1;
676         PUP::fromDisk pRO(fRO);
677         pRO|_numPes;
678         CkPupROData(pRO);
679         pRO|cb;
680         CmiFclose(fRO);
681         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
682         _oldNumPes = _numPes;
683
684         CmiNodeBarrier();
685
686         // restore mainchares
687         sprintf(filename,"%s/MainChares.dat",dirname);
688         FILE* fMain = CmiFopen(filename,"rb");
689         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
690                 PUP::fromDisk pMain(fMain);
691                 CkPupMainChareData(pMain, args);
692                 CmiFclose(fMain);
693                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
694                 //bdcastRO(); // moved to CkPupMainChareData()
695         }
696         
697 #ifndef CMK_CHARE_USE_PTR
698         // restore chares only when number of pes is the same 
699         if(CkNumPes() == _numPes) {
700                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
701                 FILE* fChares = CmiFopen(filename,"rb");
702                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
703                 PUP::fromDisk pChares(fChares);
704                 CkPupChareData(pChares);
705                 CmiFclose(fChares);
706                 _chareRestored = 1;
707         }
708 #endif
709
710         // restore groups
711         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
712         // restore from PE0's copy if shrink/expand
713         if(CkNumPes() != _numPes)
714                 sprintf(filename,"%s/Groups_0.dat",dirname);
715         else
716                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
717         FILE* fGroups = CmiFopen(filename,"rb");
718         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
719         PUP::fromDisk pGroups(fGroups);
720 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
721     CkPupGroupData(pGroups,CmiTrue);
722 #else
723     CkPupGroupData(pGroups);
724 #endif
725         CmiFclose(fGroups);
726
727         // restore nodegroups
728         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
729         if(CkMyRank()==0){
730                 if(CkNumPes() != _numPes)
731                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
732                 else
733                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
734                 FILE* fNodeGroups = CmiFopen(filename,"rb");
735                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
736                 PUP::fromDisk pNodeGroups(fNodeGroups);
737 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
738         CkPupNodeGroupData(pNodeGroups,CmiTrue);
739 #else
740         CkPupNodeGroupData(pNodeGroups);
741 #endif
742                 CmiFclose(fNodeGroups);
743         }
744
745         // for each location, restore arrays
746         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
747         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
748         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
749           for (i=0; i<_numPes;i++) {
750             if (i%CkNumPes() == CkMyPe()) {
751               sprintf(filename,"%s/arr_%d.dat",dirname, i);
752               FILE *datFile=CmiFopen(filename,"rb");
753               if (datFile==NULL) CkAbort("Could not read data file");
754               PUP::fromDisk  p(datFile);
755               CkPupArrayElementsData(p);
756               CmiFclose(datFile);
757             }
758           }
759
760         _inrestart = 0;
761
762         _initDone();
763         CkMemCheckPT::inRestarting = 0;
764         if(CkMyPe()==0) {
765                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
766                 
767                 cb.send();
768         }
769 }
770
771 // Main chare: initialize system checkpoint manager
772 class CkCheckpointInit : public Chare {
773 public:
774   CkCheckpointInit(CkArgMsg *msg) {
775     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
776     delete msg;
777   }
778   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
779 };
780
781 #include "CkCheckpoint.def.h"
782