2728c9fb75ad4d7a798e918f87299097925ebfec
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
17 #define DEBCHK  // CkPrintf
18
19 #define DEBUGC(x) x
20 //#define DEBUGC(x) 
21
22 CkGroupID _sysChkptMgr;
23
24 typedef struct _GroupInfo{
25         CkGroupID gID;
26         int MigCtor, DefCtor;
27         char name[256];
28 } GroupInfo;
29 PUPbytes(GroupInfo)
30 PUPmarshall(GroupInfo)
31
32 int _inrestart = 0;
33 int _restarted = 0;
34 int _oldNumPes = 0;
35 int _chareRestored = 0;
36
37 void CkCreateLocalChare(int epIdx, envelope *env);
38
39 // help class to find how many array elements
40 class ElementCounter : public CkLocIterator {
41 private:
42         int count;
43 public:
44         ElementCounter():count(0){};
45         void addLocation(CkLocation &loc)  { count++; }
46         int getCount() { return count; }
47 };
48
49 // helper class to pup all elements that belong to same ckLocMgr
50 class ElementCheckpointer : public CkLocIterator {
51 private:
52         CkLocMgr *locMgr;
53         PUP::er &p;
54 public:
55         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
56         void addLocation(CkLocation &loc) {
57                 CkArrayIndexMax idx=loc.getIndex();
58                 CkGroupID gID = locMgr->ckGetGroupID();
59                 p|gID;      // store loc mgr's GID as well for easier restore
60                 p|idx;
61                 p|loc;
62                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
63         }
64 };
65
66
67 extern void _initDone();
68
69 static void bdcastRO(void){
70         int i;
71         //Determine the size of the RODataMessage
72         PUP::sizer ps;
73         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
74
75         //Allocate and fill out the RODataMessage
76         envelope *env = _allocEnv(RODataMsg, ps.size());
77         PUP::toMem pp((char *)EnvToUsr(env));
78         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
79         
80         env->setCount(++_numInitMsgs);
81         env->setSrcPe(CkMyPe());
82         CmiSetHandler(env, _roRestartHandlerIdx);
83         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
84 }
85
86 // Print out an array index to this string as decimal fields
87 // separated by underscores.
88 void printIndex(const CkArrayIndex &idx,char *dest) {
89         const int *idxData=idx.data();
90         for (int i=0;i<idx.nInts;i++) {
91                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
92                 dest+=strlen(dest);
93         }
94 }
95
96 static void checkpointOne(const char* dirname, CkCallback& cb);
97
98 // broadcast
99 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
100         chkptStartTimer = CmiWallTimer();
101         // every body make dir in case it is local directory
102         CmiMkdir(dirname);
103
104         if (CkMyPe() == 0) {
105           checkpointOne(dirname, cb);
106         }
107
108         char fileName[1024];
109
110 #ifndef CMK_CHARE_USE_PTR
111         // save groups into Chares.dat
112         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
113         FILE* fChares = CmiFopen(fileName,"wb");
114         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
115         PUP::toDisk pChares(fChares);
116         CkPupChareData(pChares);
117         CmiFclose(fChares);
118 #endif
119
120         // save groups into Groups.dat
121         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
122         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
123         FILE* fGroups = CmiFopen(fileName,"wb");
124         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
125         PUP::toDisk pGroups(fGroups);
126 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
127     CkPupGroupData(pGroups,CmiTrue);
128 #else
129     CkPupGroupData(pGroups);
130 #endif
131         CmiFclose(fGroups);
132
133         // save nodegroups into NodeGroups.dat
134         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
135         if (CkMyRank() == 0) {
136           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
137           FILE* fNodeGroups = CmiFopen(fileName,"wb");
138           if(!fNodeGroups) 
139             CkAbort("Failed to create checkpoint file for nodegroup table!");
140           PUP::toDisk pNodeGroups(fNodeGroups);
141 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
142       CkPupNodeGroupData(pNodeGroups,CmiTrue);
143 #else
144       CkPupNodeGroupData(pNodeGroups);
145 #endif
146           CmiFclose(fNodeGroups);
147         }
148
149         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
150         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
151         FILE *datFile=CmiFopen(fileName,"wb");
152         if (datFile==NULL) CkAbort("Could not create data file");
153         PUP::toDisk  p(datFile);
154         CkPupArrayElementsData(p);
155         CmiFclose(datFile);
156
157 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
158         system("sync");
159 #endif
160
161         restartCB = cb;
162         DEBCHK("[%d]restartCB installed\n",CkMyPe());
163         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
164         contribute(0,NULL,CkReduction::sum_int,localcb);
165 }
166
167 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
168         delete m; 
169         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
170         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
171         restartCB.send(); 
172 }
173
174 void CkPupROData(PUP::er &p)
175 {
176         int _numReadonlies;
177         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
178         p|_numReadonlies;
179         if (p.isUnpacking()) {
180           if (_numReadonlies != _readonlyTable.size())
181             CkAbort("You cannot add readonlies and restore from checkpoint...");
182         }
183         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
184 }
185
186 // handle main chare
187 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
188 {
189         int nMains=_mainTable.size();
190         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
191         for(int i=0;i<nMains;i++){  /* Create all mainchares */
192                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
193                 int entryMigCtor = entry->getMigCtor();
194                 if(entryMigCtor!=-1) {
195                         Chare* obj;
196                         if (p.isUnpacking()) {
197                                 int size = entry->size;
198                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
199                                 obj = (Chare*)malloc(size);
200                                 _MEMCHECK(obj);
201                                 _mainTable[i]->setObj(obj);
202                                 //void *m = CkAllocSysMsg();
203                                 _entryTable[entryMigCtor]->call(args, obj);
204                         }
205                         else 
206                                 obj = (Chare *)_mainTable[i]->getObj();
207                         obj->pup(p);
208                 }
209         }
210         // to update mainchare proxy
211         // only readonly variables of Chare Proxy is taken care of here;
212         // in general, if chare proxy is contained in some data structure
213         // for example CkCallback, it is user's responsibility to
214         // update them after restarting
215         if (p.isUnpacking() && CkMyPe()==0)
216                 bdcastRO();
217 }
218
219 #ifndef CMK_CHARE_USE_PTR
220
221 CkpvExtern(CkVec<void *>, chare_objs);
222 CkpvExtern(CkVec<int>, chare_types);
223 CkpvExtern(CkVec<VidBlock *>, vidblocks);
224
225 // handle plain non-migratable chare
226 void CkPupChareData(PUP::er &p)
227 {
228   int i, n;
229   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
230   p|n;
231   for (i=0; i<n; i++) {
232         int chare_type;
233         if (!p.isUnpacking()) {
234                 chare_type = CkpvAccess(chare_types)[i];
235         }
236         p | chare_type;
237         if (p.isUnpacking()) {
238                 int migCtor = _chareTable[chare_type]->migCtor;
239                 if(migCtor==-1) {
240                         char buf[512];
241                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
242                         CkAbort(buf);
243                 }
244                 void *m = CkAllocSysMsg();
245                 envelope* env = UsrToEnv((CkMessage *)m);
246                 CkCreateLocalChare(migCtor, env);
247                 CkFreeSysMsg(m);
248         }
249         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
250         obj->pup(p);
251   }
252
253   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
254   p|n;
255   for (i=0; i<n; i++) {
256         VidBlock *v;
257         if (p.isUnpacking()) {
258                 v = new VidBlock();
259                 CkpvAccess(vidblocks).push_back(v);
260         }
261         else
262                 v = CkpvAccess(vidblocks)[i];
263         v->pup(p);
264   }
265 }
266 #else
267 void CkPupChareData(PUP::er &p)
268 {
269    // not implemented
270 }
271 #endif
272
273 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
274 // handle GroupTable and data
275 void CkPupGroupData(PUP::er &p, CmiBool create)
276 {
277         int numGroups, i;
278
279         if (!p.isUnpacking()) {
280           numGroups = CkpvAccess(_groupIDTable)->size();
281         }
282         p|numGroups;
283         if (p.isUnpacking()) {
284           if(CkMyPe()==0)  
285             CkpvAccess(_numGroups) = numGroups+1; 
286           else 
287             CkpvAccess(_numGroups) = 1;
288         }
289         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
290
291         GroupInfo *tmpInfo = new GroupInfo [numGroups];
292         if (!p.isUnpacking()) {
293           for(i=0;i<numGroups;i++) {
294                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
295                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
296                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
297                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
298                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
299                 //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
300
301                 if(tmpInfo[i].MigCtor==-1) {
302                         char buf[512];
303                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
304                         CkAbort(buf);
305                 }
306           }
307         }
308         for (i=0; i<numGroups; i++) p|tmpInfo[i];
309
310         for(i=0;i<numGroups;i++) 
311         {
312           CkGroupID gID = tmpInfo[i].gID;
313           if (p.isUnpacking()) {
314             //CkpvAccess(_groupIDTable)->push_back(gID);
315             int eIdx = tmpInfo[i].MigCtor;
316             // error checking
317             if (eIdx == -1) {
318               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
319             }
320             void *m = CkAllocSysMsg();
321             envelope* env = UsrToEnv((CkMessage *)m);
322                 if(create)
323                     CkCreateLocalGroup(gID, eIdx, env);
324           }   // end of unPacking
325           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
326           // if using migration constructor, you'd better have a pup
327                 if(!create)
328                         gobj->mlogData->teamRecoveryFlag = 1;
329           gobj->pup(p);
330          // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
331         }
332         delete [] tmpInfo;
333 }
334
335 // handle NodeGroupTable and data
336 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
337 {
338         int numNodeGroups, i;
339         if (!p.isUnpacking()) {
340           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
341         }
342         p|numNodeGroups;
343         if (p.isUnpacking()) {
344           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
345           else { CksvAccess(_numNodeGroups) = 1; }
346         }
347         if(CkMyPe() == 3)
348         CkPrintf("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
349
350         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
351         if (!p.isUnpacking()) {
352           for(i=0;i<numNodeGroups;i++) {
353                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
354                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
355                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
356                 if(tmpInfo[i].MigCtor==-1) {
357                         char buf[512];
358                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
359                                      declared as [migratable] in .ci to be able to checkpoint.",\
360                                      _chareTable[ent2.getcIdx()]->name);
361                         CkAbort(buf);
362                 }
363           }
364         }
365         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
366         for (i=0;i<numNodeGroups;i++) {
367                 CkGroupID gID = tmpInfo[i].gID;
368                 if (p.isUnpacking()) {
369                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
370                         int eIdx = tmpInfo[i].MigCtor;
371                         void *m = CkAllocSysMsg();
372                         envelope* env = UsrToEnv((CkMessage *)m);
373                         if(create){
374                                 CkCreateLocalNodeGroup(gID, eIdx, env);
375                         }
376                 }
377                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
378                 IrrGroup *obj = ent2.getObj();
379                 obj->pup(p);
380                 if(CkMyPe() == 3) CkPrintf("Nodegroup PUP'ed: gid = %d, name = %s\n",
381                         obj->ckGetGroupID().idx,
382                         _chareTable[ent2.getcIdx()]->name);
383         }
384         delete [] tmpInfo;
385 }
386 #else
387 // handle GroupTable and data
388 void CkPupGroupData(PUP::er &p)
389 {
390         int numGroups, i;
391
392         if (!p.isUnpacking()) {
393           numGroups = CkpvAccess(_groupIDTable)->size();
394         }
395         p|numGroups;
396         if (p.isUnpacking()) {
397           if(CkMyPe()==0)  
398             CkpvAccess(_numGroups) = numGroups+1; 
399           else 
400             CkpvAccess(_numGroups) = 1;
401         }
402         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
403
404         GroupInfo *tmpInfo = new GroupInfo [numGroups];
405         if (!p.isUnpacking()) {
406           for(i=0;i<numGroups;i++) {
407                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
408                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
409                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
410                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
411                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
412                 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
413                         CkMyPe(), p.typeString(), tmpInfo[i].name);
414
415                 if(tmpInfo[i].MigCtor==-1) {
416                         char buf[512];
417                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
418                         CkAbort(buf);
419                 }
420           }
421         }
422         for (i=0; i<numGroups; i++) p|tmpInfo[i];
423
424         for(i=0;i<numGroups;i++) 
425         {
426           CkGroupID gID = tmpInfo[i].gID;
427           if (p.isUnpacking()) {
428             //CkpvAccess(_groupIDTable)->push_back(gID);
429             int eIdx = tmpInfo[i].MigCtor;
430             // error checking
431             if (eIdx == -1) {
432               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
433             }
434             void *m = CkAllocSysMsg();
435             envelope* env = UsrToEnv((CkMessage *)m);
436             CkCreateLocalGroup(gID, eIdx, env);
437           }   // end of unPacking
438           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
439           // if using migration constructor, you'd better have a pup
440           gobj->pup(p);
441           DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
442                         gobj->ckGetGroupID().idx, tmpInfo[i].name);
443         }
444         delete [] tmpInfo;
445 }
446
447 // handle NodeGroupTable and data
448 void CkPupNodeGroupData(PUP::er &p)
449 {
450         int numNodeGroups, i;
451         if (!p.isUnpacking()) {
452           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
453         }
454         p|numNodeGroups;
455         if (p.isUnpacking()) {
456           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
457           else { CksvAccess(_numNodeGroups) = 1; }
458         }
459         DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
460
461         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
462         if (!p.isUnpacking()) {
463           for(i=0;i<numNodeGroups;i++) {
464                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
465                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
466                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
467                 if(tmpInfo[i].MigCtor==-1) {
468                         char buf[512];
469                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
470                                      declared as [migratable] in .ci to be able to checkpoint.",\
471                                      _chareTable[ent2.getcIdx()]->name);
472                         CkAbort(buf);
473                 }
474           }
475         }
476         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
477         for (i=0;i<numNodeGroups;i++) {
478                 CkGroupID gID = tmpInfo[i].gID;
479                 if (p.isUnpacking()) {
480                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
481                         int eIdx = tmpInfo[i].MigCtor;
482                         void *m = CkAllocSysMsg();
483                         envelope* env = UsrToEnv((CkMessage *)m);
484                         CkCreateLocalNodeGroup(gID, eIdx, env);
485                 }
486                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
487                 IrrGroup *obj = ent2.getObj();
488                 obj->pup(p);
489                 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
490                         obj->ckGetGroupID().idx,
491                         _chareTable[ent2.getcIdx()]->name);
492         }
493         delete [] tmpInfo;
494 }
495 #endif
496
497 // handle chare array elements for this processor
498 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
499 {
500         int i;
501         // safe in both packing/unpakcing at this stage
502         int numGroups = CkpvAccess(_groupIDTable)->size();
503
504         // number of array elements on this processor
505         int numElements;
506         if (!p.isUnpacking()) {
507           ElementCounter  counter;
508           CKLOCMGR_LOOP(mgr->iterate(counter););
509           numElements = counter.getCount();
510         }
511         p|numElements;
512
513         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
514
515         if (!p.isUnpacking())
516         {
517           // let CkLocMgr to iterate and store every array elements
518           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
519         }
520         else {
521           // loop and create all array elements ourselves
522           //CkPrintf("total chare array cnts: %d\n", numElements);
523           for (int i=0; i<numElements; i++) {
524                 CkGroupID gID;
525                 CkArrayIndexMax idx;
526                 p|gID;
527                 p|idx;
528                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
529                 if (notifyListeners){
530                   mgr->resume(idx,p,CmiTrue);
531                 }
532                 else{
533                   mgr->restore(idx,p);
534                 }
535           }
536         }
537         // finish up
538         if (notifyListeners)
539         for(i=0;i<numGroups;i++) {
540                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
541                 obj->ckJustMigrated();
542         }
543 }
544
545 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
546 int  CkCountArrayElements(){
547     int numGroups = CkpvAccess(_groupIDTable)->size();
548     int i;
549     ElementCounter  counter;
550     CKLOCMGR_LOOP(mgr->iterate(counter););
551   int numElements = counter.getCount();
552     return numElements;
553 }
554 #endif
555
556 void CkPupProcessorData(PUP::er &p)
557 {
558     // save readonlys, and callback BTW
559     if(CkMyRank()==0) {
560         CkPupROData(p);
561     }
562
563     // save mainchares into MainChares.dat
564     if(CkMyPe()==0) {
565       CkPupMainChareData(p, NULL);
566     }
567         
568     // save non-migratable chare
569     CkPupChareData(p);
570
571     // save groups 
572 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
573     CkPupGroupData(p,CmiTrue);
574 #else
575     CkPupGroupData(p);
576 #endif
577
578     // save nodegroups
579     if(CkMyRank()==0) {
580 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
581         CkPupNodeGroupData(p,CmiTrue);  
582 #else
583         CkPupNodeGroupData(p);
584 #endif
585     }
586
587     // pup array elements
588     CkPupArrayElementsData(p);
589 }
590
591 // called only on pe 0
592 static void checkpointOne(const char* dirname, CkCallback& cb){
593         CmiAssert(CkMyPe()==0);
594         int i;
595         char filename[1024];
596         
597         // save readonlys, and callback BTW
598         sprintf(filename,"%s/RO.dat",dirname);
599         FILE* fRO = CmiFopen(filename,"wb");
600         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
601         PUP::toDisk pRO(fRO);
602         int _numPes = CkNumPes();
603         pRO|_numPes;
604         CkPupROData(pRO);
605         pRO|cb;
606         CmiFclose(fRO);
607
608         // save mainchares into MainChares.dat
609         {
610                 sprintf(filename,"%s/MainChares.dat",dirname);
611                 FILE* fMain = CmiFopen(filename,"wb");
612                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
613                 PUP::toDisk pMain(fMain);
614                 CkPupMainChareData(pMain, NULL);
615                 CmiFclose(fMain);
616         }
617 }
618
619 void CkRemoveArrayElements()
620 {
621   int i;
622   int numGroups = CkpvAccess(_groupIDTable)->size();
623   CKLOCMGR_LOOP(mgr->flushAllRecs(););
624 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
625   for(i=0; i<numGroups; i++){
626     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
627     if(obj->isLocMgr()) {
628         CkLocMgr *mgr = (CkLocMgr *)obj;
629         mgr->flushAllRecs();
630     }
631   }*/
632 }
633
634 /*
635 void CkTestArrayElements()
636 {
637   int i;
638   int numGroups = CkpvAccess(_groupIDTable)->size();
639   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
640   GroupTable *gTbl = CkpvAccess(_groupTable);
641   for(i=0; i<numGroups; i++){
642     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
643     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
644   }
645 }
646 */
647
648 void CkStartCheckpoint(char* dirname,const CkCallback& cb)
649 {
650         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
651         
652         // hand over to checkpoint managers for per-processor checkpointing
653         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint((char *)dirname, cb);
654 }
655
/**
  * Restart: No restart-manager object is created, because a group
  *          cannot restore itself anyway.
  *          The mechanism exists as Converse code and gets invoked by a
  *          broadcast message.
  **/
662
663 void CkRestartMain(const char* dirname, CkArgMsg *args){
664         int i;
665         char filename[1024];
666         CkCallback cb;
667         
668         _inrestart = 1;
669         _restarted = 1;
670
671         // restore readonlys
672         sprintf(filename,"%s/RO.dat",dirname);
673         FILE* fRO = CmiFopen(filename,"rb");
674         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
675         int _numPes = -1;
676         PUP::fromDisk pRO(fRO);
677         pRO|_numPes;
678         CkPupROData(pRO);
679         pRO|cb;
680         CmiFclose(fRO);
681         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
682         _oldNumPes = _numPes;
683
684         CmiNodeBarrier();
685
686         // restore mainchares
687         sprintf(filename,"%s/MainChares.dat",dirname);
688         FILE* fMain = CmiFopen(filename,"rb");
689         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
690                 PUP::fromDisk pMain(fMain);
691                 CkPupMainChareData(pMain, args);
692                 CmiFclose(fMain);
693                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
694                 //bdcastRO(); // moved to CkPupMainChareData()
695         }
696         
697 #ifndef CMK_CHARE_USE_PTR
698         // restore chares only when number of pes is the same 
699         if(CkNumPes() == _numPes) {
700                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
701                 FILE* fChares = CmiFopen(filename,"rb");
702                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
703                 PUP::fromDisk pChares(fChares);
704                 CkPupChareData(pChares);
705                 CmiFclose(fChares);
706                 _chareRestored = 1;
707         }
708 #endif
709
710         // restore groups
711         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
712         // restore from PE0's copy if shrink/expand
713         if(CkNumPes() != _numPes)
714                 sprintf(filename,"%s/Groups_0.dat",dirname);
715         else
716                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
717         FILE* fGroups = CmiFopen(filename,"rb");
718         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
719         PUP::fromDisk pGroups(fGroups);
720 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
721     CkPupGroupData(pGroups,CmiTrue);
722 #else
723     CkPupGroupData(pGroups);
724 #endif
725         CmiFclose(fGroups);
726
727         // restore nodegroups
728         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
729         if(CkMyRank()==0){
730                 if(CkNumPes() != _numPes)
731                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
732                 else
733                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
734                 FILE* fNodeGroups = CmiFopen(filename,"rb");
735                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
736                 PUP::fromDisk pNodeGroups(fNodeGroups);
737 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
738         CkPupNodeGroupData(pNodeGroups,CmiTrue);
739 #else
740         CkPupNodeGroupData(pNodeGroups);
741 #endif
742                 CmiFclose(fNodeGroups);
743         }
744
745         // for each location, restore arrays
746         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
747         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
748         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
749           for (i=0; i<_numPes;i++) {
750             if (i%CkNumPes() == CkMyPe()) {
751               sprintf(filename,"%s/arr_%d.dat",dirname, i);
752               FILE *datFile=CmiFopen(filename,"rb");
753               if (datFile==NULL) CkAbort("Could not read data file");
754               PUP::fromDisk  p(datFile);
755               CkPupArrayElementsData(p);
756               CmiFclose(datFile);
757             }
758           }
759
760         _inrestart = 0;
761
762         _initDone();
763
764         if(CkMyPe()==0) {
765                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
766                 cb.send();
767         }
768 }
769
770 // Main chare: initialize system checkpoint manager
771 class CkCheckpointInit : public Chare {
772 public:
773   CkCheckpointInit(CkArgMsg *msg) {
774     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
775     delete msg;
776   }
777   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
778 };
779
780 #include "CkCheckpoint.def.h"
781