recover from soft failure
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
// printf-style sink that ignores everything it is given; DEBCHK is mapped
// onto it so that debug print calls produce no output.
void noopit(const char* /*format*/, ...)
{
}
19
20 //#define DEBCHK   CkPrintf
21 #define DEBCHK noopit
22
23 #define DEBUGC(x) x
24 //#define DEBUGC(x) 
25
26 CkGroupID _sysChkptMgr;
27
28 typedef struct _GroupInfo{
29         CkGroupID gID;
30         int MigCtor, DefCtor;
31         char name[256];
32 } GroupInfo;
33 PUPbytes(GroupInfo)
34 PUPmarshall(GroupInfo)
35
36 int _inrestart = 0;
37 int _restarted = 0;
38 int _oldNumPes = 0;
39 int _chareRestored = 0;
40
41 void CkCreateLocalChare(int epIdx, envelope *env);
42
43 // help class to find how many array elements
44 class ElementCounter : public CkLocIterator {
45 private:
46         int count;
47 public:
48         ElementCounter():count(0){};
49         void addLocation(CkLocation &loc)  { count++; }
50         int getCount() { return count; }
51 };
52
53 // helper class to pup all elements that belong to same ckLocMgr
54 class ElementCheckpointer : public CkLocIterator {
55 private:
56         CkLocMgr *locMgr;
57         PUP::er &p;
58 public:
59         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
60         void addLocation(CkLocation &loc) {
61                 CkArrayIndex idx=loc.getIndex();
62                 CkGroupID gID = locMgr->ckGetGroupID();
63                 p|gID;      // store loc mgr's GID as well for easier restore
64                 p|idx;
65                 p|loc;
66                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
67         }
68 };
69
70
71 extern void _initDone();
72
73 static void bdcastRO(void){
74         int i;
75         //Determine the size of the RODataMessage
76         PUP::sizer ps;
77         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
78
79         //Allocate and fill out the RODataMessage
80         envelope *env = _allocEnv(RODataMsg, ps.size());
81         PUP::toMem pp((char *)EnvToUsr(env));
82         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
83         
84         env->setCount(++_numInitMsgs);
85         env->setSrcPe(CkMyPe());
86         CmiSetHandler(env, _roRestartHandlerIdx);
87         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
88 }
89
90 // Print out an array index to this string as decimal fields
91 // separated by underscores.
92 void printIndex(const CkArrayIndex &idx,char *dest) {
93         const int *idxData=idx.data();
94         for (int i=0;i<idx.nInts;i++) {
95                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
96                 dest+=strlen(dest);
97         }
98 }
99
100 static void checkpointOne(const char* dirname, CkCallback& cb);
101
102 // broadcast
103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
104         chkptStartTimer = CmiWallTimer();
105         // every body make dir in case it is local directory
106         CmiMkdir(dirname);
107
108         if (CkMyPe() == 0) {
109           checkpointOne(dirname, cb);
110         }
111
112         char fileName[1024];
113
114 #ifndef CMK_CHARE_USE_PTR
115         // save groups into Chares.dat
116         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
117         FILE* fChares = CmiFopen(fileName,"wb");
118         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
119         PUP::toDisk pChares(fChares);
120         CkPupChareData(pChares);
121         CmiFclose(fChares);
122 #endif
123
124         // save groups into Groups.dat
125         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
126         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
127         FILE* fGroups = CmiFopen(fileName,"wb");
128         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
129         PUP::toDisk pGroups(fGroups);
130     CkPupGroupData(pGroups,CmiTrue);
131         CmiFclose(fGroups);
132
133         // save nodegroups into NodeGroups.dat
134         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
135         if (CkMyRank() == 0) {
136           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
137           FILE* fNodeGroups = CmiFopen(fileName,"wb");
138           if(!fNodeGroups) 
139             CkAbort("Failed to create checkpoint file for nodegroup table!");
140           PUP::toDisk pNodeGroups(fNodeGroups);
141       CkPupNodeGroupData(pNodeGroups,CmiTrue);
142           CmiFclose(fNodeGroups);
143         }
144
145         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
146         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
147         FILE *datFile=CmiFopen(fileName,"wb");
148         if (datFile==NULL) CkAbort("Could not create data file");
149         PUP::toDisk  p(datFile);
150         CkPupArrayElementsData(p);
151         CmiFclose(datFile);
152
153 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
154         system("sync");
155 #endif
156
157         restartCB = cb;
158         DEBCHK("[%d]restartCB installed\n",CkMyPe());
159         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
160         //contribute(0,NULL,CkReduction::sum_int,localcb);
161         barrier(localcb);
162 }
163
164 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
165         delete m; 
166         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
167         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
168         restartCB.send(); 
169 }
170
171 void CkPupROData(PUP::er &p)
172 {
173         int _numReadonlies;
174         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
175         p|_numReadonlies;
176         if (p.isUnpacking()) {
177           if (_numReadonlies != _readonlyTable.size())
178             CkAbort("You cannot add readonlies and restore from checkpoint...");
179         }
180         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
181 }
182
183 // handle main chare
184 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
185 {
186         int nMains=_mainTable.size();
187         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
188         for(int i=0;i<nMains;i++){  /* Create all mainchares */
189                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
190                 int entryMigCtor = entry->getMigCtor();
191                 if(entryMigCtor!=-1) {
192                         Chare* obj;
193                         if (p.isUnpacking()) {
194                                 int size = entry->size;
195                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
196                                 obj = (Chare*)malloc(size);
197                                 _MEMCHECK(obj);
198                                 _mainTable[i]->setObj(obj);
199                                 //void *m = CkAllocSysMsg();
200                                 _entryTable[entryMigCtor]->call(args, obj);
201                         }
202                         else 
203                                 obj = (Chare *)_mainTable[i]->getObj();
204                         obj->pup(p);
205                 }
206         }
207         // to update mainchare proxy
208         // only readonly variables of Chare Proxy is taken care of here;
209         // in general, if chare proxy is contained in some data structure
210         // for example CkCallback, it is user's responsibility to
211         // update them after restarting
212         if (p.isUnpacking() && CkMyPe()==0)
213                 bdcastRO();
214 }
215
216 #ifndef CMK_CHARE_USE_PTR
217
218 CkpvExtern(CkVec<void *>, chare_objs);
219 CkpvExtern(CkVec<int>, chare_types);
220 CkpvExtern(CkVec<VidBlock *>, vidblocks);
221
222 // handle plain non-migratable chare
223 void CkPupChareData(PUP::er &p)
224 {
225   int i, n;
226   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
227   p|n;
228   for (i=0; i<n; i++) {
229         int chare_type;
230         if (!p.isUnpacking()) {
231                 chare_type = CkpvAccess(chare_types)[i];
232         }
233         p | chare_type;
234         if (p.isUnpacking()) {
235                 int migCtor = _chareTable[chare_type]->migCtor;
236                 if(migCtor==-1) {
237                         char buf[512];
238                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
239                         CkAbort(buf);
240                 }
241                 void *m = CkAllocSysMsg();
242                 envelope* env = UsrToEnv((CkMessage *)m);
243                 CkCreateLocalChare(migCtor, env);
244                 CkFreeSysMsg(m);
245         }
246         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
247         obj->pup(p);
248   }
249
250   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
251   p|n;
252   for (i=0; i<n; i++) {
253         VidBlock *v;
254         if (p.isUnpacking()) {
255                 v = new VidBlock();
256                 CkpvAccess(vidblocks).push_back(v);
257         }
258         else
259                 v = CkpvAccess(vidblocks)[i];
260         v->pup(p);
261   }
262 }
263 #else
264 void CkPupChareData(PUP::er &p)
265 {
266    // not implemented
267 }
268 #endif
269
270 void CkPupGroupData(PUP::er &p, CmiBool create)
271 {
272         int numGroups, i;
273
274         if (!p.isUnpacking()) {
275           numGroups = CkpvAccess(_groupIDTable)->size();
276         }
277         p|numGroups;
278         if (p.isUnpacking()) {
279           if(CkMyPe()==0)  
280             CkpvAccess(_numGroups) = numGroups+1; 
281           else 
282             CkpvAccess(_numGroups) = 1;
283         }
284         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
285
286         GroupInfo *tmpInfo = new GroupInfo [numGroups];
287         if (!p.isUnpacking()) {
288           for(i=0;i<numGroups;i++) {
289                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
290                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
291                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
292                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
293                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
294
295                 if(tmpInfo[i].MigCtor==-1) {
296                         char buf[512];
297                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
298                         CkAbort(buf);
299                 }
300           }
301         }
302         for (i=0; i<numGroups; i++) p|tmpInfo[i];
303
304         for(i=0;i<numGroups;i++) 
305         {
306           CkGroupID gID = tmpInfo[i].gID;
307           if (p.isUnpacking()) {
308             //CkpvAccess(_groupIDTable)->push_back(gID);
309             int eIdx = tmpInfo[i].MigCtor;
310             // error checking
311             if (eIdx == -1) {
312               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
313             }
314             void *m = CkAllocSysMsg();
315             envelope* env = UsrToEnv((CkMessage *)m);
316                 if(create)
317                     CkCreateLocalGroup(gID, eIdx, env);
318           }   // end of unPacking
319           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
320           // if using migration constructor, you'd better have a pup
321 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
322                 if(!create)
323                         gobj->mlogData->teamRecoveryFlag = 1;
324 #endif
325           gobj->pup(p);
326         }
327         delete [] tmpInfo;
328 }
329
330 // handle NodeGroupTable and data
331 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
332 {
333         int numNodeGroups, i;
334         if (!p.isUnpacking()) {
335           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
336         }
337         p|numNodeGroups;
338         if (p.isUnpacking()) {
339           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
340           else { CksvAccess(_numNodeGroups) = 1; }
341         }
342
343         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
344         if (!p.isUnpacking()) {
345           for(i=0;i<numNodeGroups;i++) {
346                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
347                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
348                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
349                 if(tmpInfo[i].MigCtor==-1) {
350                         char buf[512];
351                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
352                                      declared as [migratable] in .ci to be able to checkpoint.",\
353                                      _chareTable[ent2.getcIdx()]->name);
354                         CkAbort(buf);
355                 }
356           }
357         }
358         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
359         for (i=0;i<numNodeGroups;i++) {
360                 CkGroupID gID = tmpInfo[i].gID;
361                 if (p.isUnpacking()) {
362                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
363                         int eIdx = tmpInfo[i].MigCtor;
364                         void *m = CkAllocSysMsg();
365                         envelope* env = UsrToEnv((CkMessage *)m);
366                         if(create){
367                                 CkCreateLocalNodeGroup(gID, eIdx, env);
368                         }
369                 }
370                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
371                 IrrGroup *obj = ent2.getObj();
372                 obj->pup(p);
373         }
374         delete [] tmpInfo;
375 }
376
377 // handle chare array elements for this processor
378 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
379 {
380         int i;
381         // safe in both packing/unpakcing at this stage
382         int numGroups = CkpvAccess(_groupIDTable)->size();
383
384         // number of array elements on this processor
385         int numElements;
386         if (!p.isUnpacking()) {
387           ElementCounter  counter;
388           CKLOCMGR_LOOP(mgr->iterate(counter););
389           numElements = counter.getCount();
390         }
391         p|numElements;
392
393         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
394
395         if (!p.isUnpacking())
396         {
397           // let CkLocMgr to iterate and store every array elements
398           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
399         }
400         else {
401           // loop and create all array elements ourselves
402           //CkPrintf("total chare array cnts: %d\n", numElements);
403           for (int i=0; i<numElements; i++) {
404                 CkGroupID gID;
405                 CkArrayIndex idx;
406                 p|gID;
407                 p|idx;
408                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
409                 if (notifyListeners){
410                   mgr->resume(idx,p,CmiTrue);
411                 }
412                 else{
413                   mgr->restore(idx,p);
414                 }
415           }
416         }
417         // finish up
418         if (notifyListeners)
419         for(i=0;i<numGroups;i++) {
420                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
421                 obj->ckJustMigrated();
422         }
423 }
424
425 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) ||CMK_MEM_CHECKPOINT
426 int  CkCountArrayElements(){
427     int numGroups = CkpvAccess(_groupIDTable)->size();
428     int i;
429     ElementCounter  counter;
430     CKLOCMGR_LOOP(mgr->iterate(counter););
431   int numElements = counter.getCount();
432     return numElements;
433 }
434 #endif
435
436 void CkPupProcessorData(PUP::er &p)
437 {
438     // save readonlys, and callback BTW
439     if(CkMyRank()==0) {
440         CkPupROData(p);
441     }
442
443     // save mainchares into MainChares.dat
444     if(CkMyPe()==0) {
445       CkPupMainChareData(p, NULL);
446     }
447         
448     // save non-migratable chare
449     CkPupChareData(p);
450
451     // save groups 
452         CkPupGroupData(p,CmiTrue);
453
454     // save nodegroups
455     if(CkMyRank()==0) {
456         CkPupNodeGroupData(p,CmiTrue);  
457     }
458
459     // pup array elements
460     CkPupArrayElementsData(p);
461 }
462
463 // called only on pe 0
464 static void checkpointOne(const char* dirname, CkCallback& cb){
465         CmiAssert(CkMyPe()==0);
466         char filename[1024];
467         
468         // save readonlys, and callback BTW
469         sprintf(filename,"%s/RO.dat",dirname);
470         FILE* fRO = CmiFopen(filename,"wb");
471         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
472         PUP::toDisk pRO(fRO);
473         int _numPes = CkNumPes();
474         pRO|_numPes;
475         CkPupROData(pRO);
476         pRO|cb;
477         CmiFclose(fRO);
478
479         // save mainchares into MainChares.dat
480         {
481                 sprintf(filename,"%s/MainChares.dat",dirname);
482                 FILE* fMain = CmiFopen(filename,"wb");
483                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
484                 PUP::toDisk pMain(fMain);
485                 CkPupMainChareData(pMain, NULL);
486                 CmiFclose(fMain);
487         }
488 }
489
490 void CkRemoveArrayElements()
491 {
492   int i;
493   int numGroups = CkpvAccess(_groupIDTable)->size();
494   CKLOCMGR_LOOP(mgr->flushAllRecs(););
495 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
496   for(i=0; i<numGroups; i++){
497     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
498     if(obj->isLocMgr()) {
499         CkLocMgr *mgr = (CkLocMgr *)obj;
500         mgr->flushAllRecs();
501     }
502   }*/
503 }
504
505 /*
506 void CkTestArrayElements()
507 {
508   int i;
509   int numGroups = CkpvAccess(_groupIDTable)->size();
510   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
511   GroupTable *gTbl = CkpvAccess(_groupTable);
512   for(i=0; i<numGroups; i++){
513     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
514     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
515   }
516 }
517 */
518
519 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
520 {
521
522         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
523         
524         // hand over to checkpoint managers for per-processor checkpointing
525         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
526 }
527
/**
  * Restart: no restart-manager object is created, because a group
  *          cannot restore itself anyway.
  *          The mechanism exists as converse code and gets invoked by
  *          a broadcast message.
  **/
534
535 void CkRestartMain(const char* dirname, CkArgMsg *args){
536         int i;
537         char filename[1024];
538         CkCallback cb;
539         
540         _inrestart = 1;
541         _restarted = 1;
542         CkMemCheckPT::inRestarting = 1;
543
544         // restore readonlys
545         sprintf(filename,"%s/RO.dat",dirname);
546         FILE* fRO = CmiFopen(filename,"rb");
547         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
548         int _numPes = -1;
549         PUP::fromDisk pRO(fRO);
550         pRO|_numPes;
551         CkPupROData(pRO);
552         pRO|cb;
553         CmiFclose(fRO);
554         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
555         _oldNumPes = _numPes;
556
557         CmiNodeBarrier();
558
559         // restore mainchares
560         sprintf(filename,"%s/MainChares.dat",dirname);
561         FILE* fMain = CmiFopen(filename,"rb");
562         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
563                 PUP::fromDisk pMain(fMain);
564                 CkPupMainChareData(pMain, args);
565                 CmiFclose(fMain);
566                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
567                 //bdcastRO(); // moved to CkPupMainChareData()
568         }
569         
570 #ifndef CMK_CHARE_USE_PTR
571         // restore chares only when number of pes is the same 
572         if(CkNumPes() == _numPes) {
573                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
574                 FILE* fChares = CmiFopen(filename,"rb");
575                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
576                 PUP::fromDisk pChares(fChares);
577                 CkPupChareData(pChares);
578                 CmiFclose(fChares);
579                 _chareRestored = 1;
580         }
581 #endif
582
583         // restore groups
584         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
585         // restore from PE0's copy if shrink/expand
586         if(CkNumPes() != _numPes)
587                 sprintf(filename,"%s/Groups_0.dat",dirname);
588         else
589                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
590         FILE* fGroups = CmiFopen(filename,"rb");
591         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
592         PUP::fromDisk pGroups(fGroups);
593     CkPupGroupData(pGroups,CmiTrue);
594         CmiFclose(fGroups);
595
596         // restore nodegroups
597         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
598         if(CkMyRank()==0){
599                 if(CkNumPes() != _numPes)
600                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
601                 else
602                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
603                 FILE* fNodeGroups = CmiFopen(filename,"rb");
604                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
605                 PUP::fromDisk pNodeGroups(fNodeGroups);
606         CkPupNodeGroupData(pNodeGroups,CmiTrue);
607                 CmiFclose(fNodeGroups);
608         }
609
610         // for each location, restore arrays
611         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
612         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
613         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
614           for (i=0; i<_numPes;i++) {
615             if (i%CkNumPes() == CkMyPe()) {
616               sprintf(filename,"%s/arr_%d.dat",dirname, i);
617               FILE *datFile=CmiFopen(filename,"rb");
618               if (datFile==NULL) CkAbort("Could not read data file");
619               PUP::fromDisk  p(datFile);
620               CkPupArrayElementsData(p);
621               CmiFclose(datFile);
622             }
623           }
624
625         _inrestart = 0;
626
627         _initDone();
628         CkMemCheckPT::inRestarting = 0;
629         if(CkMyPe()==0) {
630                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
631                 
632                 cb.send();
633         }
634 }
635
636 // Main chare: initialize system checkpoint manager
637 class CkCheckpointInit : public Chare {
638 public:
639   CkCheckpointInit(CkArgMsg *msg) {
640     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
641     delete msg;
642   }
643   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
644 };
645
646 #include "CkCheckpoint.def.h"
647