e408f2cad1a02ddf0d7838a874e19fb8501ae50d
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
// Variadic sink with a printf-like signature: accepts a format string plus
// any further arguments and deliberately does nothing with them.
void noopit(const char* /*fmt*/, ...)
{
}
19
// DEBCHK is invoked throughout this file with printf-style arguments.
// As defined here it expands to noopit, so those calls produce no output.
//#define DEBCHK  // CkPrintf
#define DEBCHK noopit

// DEBUGC(x) currently expands to its argument unchanged; the commented-out
// alternative below would expand to nothing instead.
#define DEBUGC(x) x
//#define DEBUGC(x) 

// Group ID of the checkpoint manager group (assigned in CkCheckpointInit,
// used by CkStartCheckpoint to reach the manager on every PE).
CkGroupID _sysChkptMgr;
27
// Per-group descriptor streamed ahead of the group objects themselves by
// CkPupGroupData / CkPupNodeGroupData.
typedef struct _GroupInfo{
        CkGroupID gID;          // ID of the (node)group this record describes
        int MigCtor, DefCtor;   // migCtor / defCtor values copied from the group's _chareTable entry; MigCtor == -1 is treated as "cannot be restored"
        char name[256];         // chare type name copied from _chareTable
} GroupInfo;
// NOTE(review): presumably these macros make GroupInfo PUP-able as a flat
// block of bytes and usable as a marshalled parameter -- confirm in pup.h.
PUPbytes(GroupInfo)
PUPmarshall(GroupInfo)
35
int _inrestart = 0;      // 1 while CkRestartMain is restoring state; reset to 0 just before it calls _initDone()
int _restarted = 0;      // set to 1 by CkRestartMain; never cleared in this file
int _oldNumPes = 0;      // PE count read back from RO.dat by CkRestartMain
int _chareRestored = 0;  // set to 1 once CkRestartMain has restored Chares_<pe>.dat

// Defined outside this file; CkPupChareData uses it to re-create plain chares.
void CkCreateLocalChare(int epIdx, envelope *env);
42
43 // help class to find how many array elements
44 class ElementCounter : public CkLocIterator {
45 private:
46         int count;
47 public:
48         ElementCounter():count(0){};
49         void addLocation(CkLocation &loc)  { count++; }
50         int getCount() { return count; }
51 };
52
53 // helper class to pup all elements that belong to same ckLocMgr
54 class ElementCheckpointer : public CkLocIterator {
55 private:
56         CkLocMgr *locMgr;
57         PUP::er &p;
58 public:
59         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
60         void addLocation(CkLocation &loc) {
61                 CkArrayIndex idx=loc.getIndex();
62                 CkGroupID gID = locMgr->ckGetGroupID();
63                 p|gID;      // store loc mgr's GID as well for easier restore
64                 p|idx;
65                 p|loc;
66                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
67         }
68 };
69
70
// Defined outside this file; CkRestartMain calls it once all state is restored.
extern void _initDone();
72
73 static void bdcastRO(void){
74         int i;
75         //Determine the size of the RODataMessage
76         PUP::sizer ps;
77         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
78
79         //Allocate and fill out the RODataMessage
80         envelope *env = _allocEnv(RODataMsg, ps.size());
81         PUP::toMem pp((char *)EnvToUsr(env));
82         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
83         
84         env->setCount(++_numInitMsgs);
85         env->setSrcPe(CkMyPe());
86         CmiSetHandler(env, _roRestartHandlerIdx);
87         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
88 }
89
90 // Print out an array index to this string as decimal fields
91 // separated by underscores.
92 void printIndex(const CkArrayIndex &idx,char *dest) {
93         const int *idxData=idx.data();
94         for (int i=0;i<idx.nInts;i++) {
95                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
96                 dest+=strlen(dest);
97         }
98 }
99
// Forward declaration; defined further down. Called from
// CkCheckpointMgr::Checkpoint on PE 0 only.
static void checkpointOne(const char* dirname, CkCallback& cb);
101
102 // broadcast
103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
104         chkptStartTimer = CmiWallTimer();
105         // every body make dir in case it is local directory
106         CmiMkdir(dirname);
107
108         if (CkMyPe() == 0) {
109           checkpointOne(dirname, cb);
110         }
111
112         char fileName[1024];
113
114 #ifndef CMK_CHARE_USE_PTR
115         // save groups into Chares.dat
116         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
117         FILE* fChares = CmiFopen(fileName,"wb");
118         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
119         PUP::toDisk pChares(fChares);
120         CkPupChareData(pChares);
121         CmiFclose(fChares);
122 #endif
123
124         // save groups into Groups.dat
125         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
126         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
127         FILE* fGroups = CmiFopen(fileName,"wb");
128         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
129         PUP::toDisk pGroups(fGroups);
130 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
131     CkPupGroupData(pGroups,CmiTrue);
132 #else
133     CkPupGroupData(pGroups);
134 #endif
135         CmiFclose(fGroups);
136
137         // save nodegroups into NodeGroups.dat
138         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
139         if (CkMyRank() == 0) {
140           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
141           FILE* fNodeGroups = CmiFopen(fileName,"wb");
142           if(!fNodeGroups) 
143             CkAbort("Failed to create checkpoint file for nodegroup table!");
144           PUP::toDisk pNodeGroups(fNodeGroups);
145 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
146       CkPupNodeGroupData(pNodeGroups,CmiTrue);
147 #else
148       CkPupNodeGroupData(pNodeGroups);
149 #endif
150           CmiFclose(fNodeGroups);
151         }
152
153         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
154         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
155         FILE *datFile=CmiFopen(fileName,"wb");
156         if (datFile==NULL) CkAbort("Could not create data file");
157         PUP::toDisk  p(datFile);
158         CkPupArrayElementsData(p);
159         CmiFclose(datFile);
160
161 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
162         system("sync");
163 #endif
164
165         restartCB = cb;
166         DEBCHK("[%d]restartCB installed\n",CkMyPe());
167         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
168         contribute(0,NULL,CkReduction::sum_int,localcb);
169 }
170
171 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
172         delete m; 
173         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
174         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
175         restartCB.send(); 
176 }
177
178 void CkPupROData(PUP::er &p)
179 {
180         int _numReadonlies;
181         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
182         p|_numReadonlies;
183         if (p.isUnpacking()) {
184           if (_numReadonlies != _readonlyTable.size())
185             CkAbort("You cannot add readonlies and restore from checkpoint...");
186         }
187         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
188 }
189
190 // handle main chare
191 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
192 {
193         int nMains=_mainTable.size();
194         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
195         for(int i=0;i<nMains;i++){  /* Create all mainchares */
196                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
197                 int entryMigCtor = entry->getMigCtor();
198                 if(entryMigCtor!=-1) {
199                         Chare* obj;
200                         if (p.isUnpacking()) {
201                                 int size = entry->size;
202                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
203                                 obj = (Chare*)malloc(size);
204                                 _MEMCHECK(obj);
205                                 _mainTable[i]->setObj(obj);
206                                 //void *m = CkAllocSysMsg();
207                                 _entryTable[entryMigCtor]->call(args, obj);
208                         }
209                         else 
210                                 obj = (Chare *)_mainTable[i]->getObj();
211                         obj->pup(p);
212                 }
213         }
214         // to update mainchare proxy
215         // only readonly variables of Chare Proxy is taken care of here;
216         // in general, if chare proxy is contained in some data structure
217         // for example CkCallback, it is user's responsibility to
218         // update them after restarting
219         if (p.isUnpacking() && CkMyPe()==0)
220                 bdcastRO();
221 }
222
223 #ifndef CMK_CHARE_USE_PTR
224
// Per-PE tables defined outside this file and used by CkPupChareData below:
CkpvExtern(CkVec<void *>, chare_objs);     // plain chare objects (cast to Chare* before pup)
CkpvExtern(CkVec<int>, chare_types);       // _chareTable index of each entry in chare_objs
CkpvExtern(CkVec<VidBlock *>, vidblocks);  // VidBlock objects, streamed after the chares
228
229 // handle plain non-migratable chare
230 void CkPupChareData(PUP::er &p)
231 {
232   int i, n;
233   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
234   p|n;
235   for (i=0; i<n; i++) {
236         int chare_type;
237         if (!p.isUnpacking()) {
238                 chare_type = CkpvAccess(chare_types)[i];
239         }
240         p | chare_type;
241         if (p.isUnpacking()) {
242                 int migCtor = _chareTable[chare_type]->migCtor;
243                 if(migCtor==-1) {
244                         char buf[512];
245                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
246                         CkAbort(buf);
247                 }
248                 void *m = CkAllocSysMsg();
249                 envelope* env = UsrToEnv((CkMessage *)m);
250                 CkCreateLocalChare(migCtor, env);
251                 CkFreeSysMsg(m);
252         }
253         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
254         obj->pup(p);
255   }
256
257   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
258   p|n;
259   for (i=0; i<n; i++) {
260         VidBlock *v;
261         if (p.isUnpacking()) {
262                 v = new VidBlock();
263                 CkpvAccess(vidblocks).push_back(v);
264         }
265         else
266                 v = CkpvAccess(vidblocks)[i];
267         v->pup(p);
268   }
269 }
270 #else
// Variant compiled when CMK_CHARE_USE_PTR is defined: intentionally a no-op,
// so nothing is streamed for plain chares in that configuration.
void CkPupChareData(PUP::er &p)
{
   // not implemented
}
275 #endif
276
277 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
278 // handle GroupTable and data
279 void CkPupGroupData(PUP::er &p, CmiBool create)
280 {
281         int numGroups, i;
282
283         if (!p.isUnpacking()) {
284           numGroups = CkpvAccess(_groupIDTable)->size();
285         }
286         p|numGroups;
287         if (p.isUnpacking()) {
288           if(CkMyPe()==0)  
289             CkpvAccess(_numGroups) = numGroups+1; 
290           else 
291             CkpvAccess(_numGroups) = 1;
292         }
293         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
294
295         GroupInfo *tmpInfo = new GroupInfo [numGroups];
296         if (!p.isUnpacking()) {
297           for(i=0;i<numGroups;i++) {
298                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
299                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
300                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
301                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
302                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
303                 //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
304
305                 if(tmpInfo[i].MigCtor==-1) {
306                         char buf[512];
307                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
308                         CkAbort(buf);
309                 }
310           }
311         }
312         for (i=0; i<numGroups; i++) p|tmpInfo[i];
313
314         for(i=0;i<numGroups;i++) 
315         {
316           CkGroupID gID = tmpInfo[i].gID;
317           if (p.isUnpacking()) {
318             //CkpvAccess(_groupIDTable)->push_back(gID);
319             int eIdx = tmpInfo[i].MigCtor;
320             // error checking
321             if (eIdx == -1) {
322               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
323             }
324             void *m = CkAllocSysMsg();
325             envelope* env = UsrToEnv((CkMessage *)m);
326                 if(create)
327                     CkCreateLocalGroup(gID, eIdx, env);
328           }   // end of unPacking
329           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
330           // if using migration constructor, you'd better have a pup
331                 if(!create)
332                         gobj->mlogData->teamRecoveryFlag = 1;
333           gobj->pup(p);
334          // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
335         }
336         delete [] tmpInfo;
337 }
338
339 // handle NodeGroupTable and data
340 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
341 {
342         int numNodeGroups, i;
343         if (!p.isUnpacking()) {
344           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
345         }
346         p|numNodeGroups;
347         if (p.isUnpacking()) {
348           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
349           else { CksvAccess(_numNodeGroups) = 1; }
350         }
351         if(CkMyPe() == 3)
352         CkPrintf("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
353
354         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
355         if (!p.isUnpacking()) {
356           for(i=0;i<numNodeGroups;i++) {
357                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
358                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
359                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
360                 if(tmpInfo[i].MigCtor==-1) {
361                         char buf[512];
362                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
363                                      declared as [migratable] in .ci to be able to checkpoint.",\
364                                      _chareTable[ent2.getcIdx()]->name);
365                         CkAbort(buf);
366                 }
367           }
368         }
369         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
370         for (i=0;i<numNodeGroups;i++) {
371                 CkGroupID gID = tmpInfo[i].gID;
372                 if (p.isUnpacking()) {
373                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
374                         int eIdx = tmpInfo[i].MigCtor;
375                         void *m = CkAllocSysMsg();
376                         envelope* env = UsrToEnv((CkMessage *)m);
377                         if(create){
378                                 CkCreateLocalNodeGroup(gID, eIdx, env);
379                         }
380                 }
381                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
382                 IrrGroup *obj = ent2.getObj();
383                 obj->pup(p);
384                 if(CkMyPe() == 3) CkPrintf("Nodegroup PUP'ed: gid = %d, name = %s\n",
385                         obj->ckGetGroupID().idx,
386                         _chareTable[ent2.getcIdx()]->name);
387         }
388         delete [] tmpInfo;
389 }
390 #else
391 // handle GroupTable and data
392 void CkPupGroupData(PUP::er &p)
393 {
394         int numGroups, i;
395
396         if (!p.isUnpacking()) {
397           numGroups = CkpvAccess(_groupIDTable)->size();
398         }
399         p|numGroups;
400         if (p.isUnpacking()) {
401           if(CkMyPe()==0)  
402             CkpvAccess(_numGroups) = numGroups+1; 
403           else 
404             CkpvAccess(_numGroups) = 1;
405         }
406         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
407
408         GroupInfo *tmpInfo = new GroupInfo [numGroups];
409         if (!p.isUnpacking()) {
410           for(i=0;i<numGroups;i++) {
411                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
412                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
413                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
414                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
415                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
416                 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
417                         CkMyPe(), p.typeString(), tmpInfo[i].name);
418
419                 if(tmpInfo[i].MigCtor==-1) {
420                         char buf[512];
421                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
422                         CkAbort(buf);
423                 }
424           }
425         }
426         for (i=0; i<numGroups; i++) p|tmpInfo[i];
427
428         for(i=0;i<numGroups;i++) 
429         {
430           CkGroupID gID = tmpInfo[i].gID;
431           if (p.isUnpacking()) {
432             //CkpvAccess(_groupIDTable)->push_back(gID);
433             int eIdx = tmpInfo[i].MigCtor;
434             // error checking
435             if (eIdx == -1) {
436               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
437             }
438             void *m = CkAllocSysMsg();
439             envelope* env = UsrToEnv((CkMessage *)m);
440             CkCreateLocalGroup(gID, eIdx, env);
441           }   // end of unPacking
442           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
443           // if using migration constructor, you'd better have a pup
444           gobj->pup(p);
445           DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
446                         gobj->ckGetGroupID().idx, tmpInfo[i].name);
447         }
448         delete [] tmpInfo;
449 }
450
451 // handle NodeGroupTable and data
452 void CkPupNodeGroupData(PUP::er &p)
453 {
454         int numNodeGroups, i;
455         if (!p.isUnpacking()) {
456           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
457         }
458         p|numNodeGroups;
459         if (p.isUnpacking()) {
460           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
461           else { CksvAccess(_numNodeGroups) = 1; }
462         }
463         DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
464
465         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
466         if (!p.isUnpacking()) {
467           for(i=0;i<numNodeGroups;i++) {
468                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
469                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
470                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
471                 if(tmpInfo[i].MigCtor==-1) {
472                         char buf[512];
473                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
474                                      declared as [migratable] in .ci to be able to checkpoint.",\
475                                      _chareTable[ent2.getcIdx()]->name);
476                         CkAbort(buf);
477                 }
478           }
479         }
480         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
481         for (i=0;i<numNodeGroups;i++) {
482                 CkGroupID gID = tmpInfo[i].gID;
483                 if (p.isUnpacking()) {
484                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
485                         int eIdx = tmpInfo[i].MigCtor;
486                         void *m = CkAllocSysMsg();
487                         envelope* env = UsrToEnv((CkMessage *)m);
488                         CkCreateLocalNodeGroup(gID, eIdx, env);
489                 }
490                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
491                 IrrGroup *obj = ent2.getObj();
492                 obj->pup(p);
493                 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
494                         obj->ckGetGroupID().idx,
495                         _chareTable[ent2.getcIdx()]->name);
496         }
497         delete [] tmpInfo;
498 }
499 #endif
500
501 // handle chare array elements for this processor
502 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
503 {
504         int i;
505         // safe in both packing/unpakcing at this stage
506         int numGroups = CkpvAccess(_groupIDTable)->size();
507
508         // number of array elements on this processor
509         int numElements;
510         if (!p.isUnpacking()) {
511           ElementCounter  counter;
512           CKLOCMGR_LOOP(mgr->iterate(counter););
513           numElements = counter.getCount();
514         }
515         p|numElements;
516
517         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
518
519         if (!p.isUnpacking())
520         {
521           // let CkLocMgr to iterate and store every array elements
522           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
523         }
524         else {
525           // loop and create all array elements ourselves
526           //CkPrintf("total chare array cnts: %d\n", numElements);
527           for (int i=0; i<numElements; i++) {
528                 CkGroupID gID;
529                 CkArrayIndex idx;
530                 p|gID;
531                 p|idx;
532                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
533                 if (notifyListeners){
534                   mgr->resume(idx,p,CmiTrue);
535                 }
536                 else{
537                   mgr->restore(idx,p);
538                 }
539           }
540         }
541         // finish up
542         if (notifyListeners)
543         for(i=0;i<numGroups;i++) {
544                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
545                 obj->ckJustMigrated();
546         }
547 }
548
549 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
550 int  CkCountArrayElements(){
551     int numGroups = CkpvAccess(_groupIDTable)->size();
552     int i;
553     ElementCounter  counter;
554     CKLOCMGR_LOOP(mgr->iterate(counter););
555   int numElements = counter.getCount();
556     return numElements;
557 }
558 #endif
559
560 void CkPupProcessorData(PUP::er &p)
561 {
562     // save readonlys, and callback BTW
563     if(CkMyRank()==0) {
564         CkPupROData(p);
565     }
566
567     // save mainchares into MainChares.dat
568     if(CkMyPe()==0) {
569       CkPupMainChareData(p, NULL);
570     }
571         
572     // save non-migratable chare
573     CkPupChareData(p);
574
575     // save groups 
576 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
577     CkPupGroupData(p,CmiTrue);
578 #else
579     CkPupGroupData(p);
580 #endif
581
582     // save nodegroups
583     if(CkMyRank()==0) {
584 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
585         CkPupNodeGroupData(p,CmiTrue);  
586 #else
587         CkPupNodeGroupData(p);
588 #endif
589     }
590
591     // pup array elements
592     CkPupArrayElementsData(p);
593 }
594
595 // called only on pe 0
596 static void checkpointOne(const char* dirname, CkCallback& cb){
597         CmiAssert(CkMyPe()==0);
598         int i;
599         char filename[1024];
600         
601         // save readonlys, and callback BTW
602         sprintf(filename,"%s/RO.dat",dirname);
603         FILE* fRO = CmiFopen(filename,"wb");
604         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
605         PUP::toDisk pRO(fRO);
606         int _numPes = CkNumPes();
607         pRO|_numPes;
608         CkPupROData(pRO);
609         pRO|cb;
610         CmiFclose(fRO);
611
612         // save mainchares into MainChares.dat
613         {
614                 sprintf(filename,"%s/MainChares.dat",dirname);
615                 FILE* fMain = CmiFopen(filename,"wb");
616                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
617                 PUP::toDisk pMain(fMain);
618                 CkPupMainChareData(pMain, NULL);
619                 CmiFclose(fMain);
620         }
621 }
622
623 void CkRemoveArrayElements()
624 {
625   int i;
626   int numGroups = CkpvAccess(_groupIDTable)->size();
627   CKLOCMGR_LOOP(mgr->flushAllRecs(););
628 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
629   for(i=0; i<numGroups; i++){
630     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
631     if(obj->isLocMgr()) {
632         CkLocMgr *mgr = (CkLocMgr *)obj;
633         mgr->flushAllRecs();
634     }
635   }*/
636 }
637
638 /*
639 void CkTestArrayElements()
640 {
641   int i;
642   int numGroups = CkpvAccess(_groupIDTable)->size();
643   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
644   GroupTable *gTbl = CkpvAccess(_groupTable);
645   for(i=0; i<numGroups; i++){
646     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
647     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
648   }
649 }
650 */
651
652 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
653 {
654         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
655         
656         // hand over to checkpoint managers for per-processor checkpointing
657         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
658 }
659
/**
  * Restart: No restart-manager object is created, because a group
  *          cannot restore itself anyway.
  *          The restart mechanism exists as Converse-level code and
  *          is invoked by a broadcast message.
  **/
666
667 void CkRestartMain(const char* dirname, CkArgMsg *args){
668         int i;
669         char filename[1024];
670         CkCallback cb;
671         
672         _inrestart = 1;
673         _restarted = 1;
674
675         // restore readonlys
676         sprintf(filename,"%s/RO.dat",dirname);
677         FILE* fRO = CmiFopen(filename,"rb");
678         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
679         int _numPes = -1;
680         PUP::fromDisk pRO(fRO);
681         pRO|_numPes;
682         CkPupROData(pRO);
683         pRO|cb;
684         CmiFclose(fRO);
685         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
686         _oldNumPes = _numPes;
687
688         CmiNodeBarrier();
689
690         // restore mainchares
691         sprintf(filename,"%s/MainChares.dat",dirname);
692         FILE* fMain = CmiFopen(filename,"rb");
693         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
694                 PUP::fromDisk pMain(fMain);
695                 CkPupMainChareData(pMain, args);
696                 CmiFclose(fMain);
697                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
698                 //bdcastRO(); // moved to CkPupMainChareData()
699         }
700         
701 #ifndef CMK_CHARE_USE_PTR
702         // restore chares only when number of pes is the same 
703         if(CkNumPes() == _numPes) {
704                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
705                 FILE* fChares = CmiFopen(filename,"rb");
706                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
707                 PUP::fromDisk pChares(fChares);
708                 CkPupChareData(pChares);
709                 CmiFclose(fChares);
710                 _chareRestored = 1;
711         }
712 #endif
713
714         // restore groups
715         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
716         // restore from PE0's copy if shrink/expand
717         if(CkNumPes() != _numPes)
718                 sprintf(filename,"%s/Groups_0.dat",dirname);
719         else
720                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
721         FILE* fGroups = CmiFopen(filename,"rb");
722         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
723         PUP::fromDisk pGroups(fGroups);
724 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
725     CkPupGroupData(pGroups,CmiTrue);
726 #else
727     CkPupGroupData(pGroups);
728 #endif
729         CmiFclose(fGroups);
730
731         // restore nodegroups
732         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
733         if(CkMyRank()==0){
734                 if(CkNumPes() != _numPes)
735                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
736                 else
737                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
738                 FILE* fNodeGroups = CmiFopen(filename,"rb");
739                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
740                 PUP::fromDisk pNodeGroups(fNodeGroups);
741 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
742         CkPupNodeGroupData(pNodeGroups,CmiTrue);
743 #else
744         CkPupNodeGroupData(pNodeGroups);
745 #endif
746                 CmiFclose(fNodeGroups);
747         }
748
749         // for each location, restore arrays
750         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
751         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
752         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
753           for (i=0; i<_numPes;i++) {
754             if (i%CkNumPes() == CkMyPe()) {
755               sprintf(filename,"%s/arr_%d.dat",dirname, i);
756               FILE *datFile=CmiFopen(filename,"rb");
757               if (datFile==NULL) CkAbort("Could not read data file");
758               PUP::fromDisk  p(datFile);
759               CkPupArrayElementsData(p);
760               CmiFclose(datFile);
761             }
762           }
763
764         _inrestart = 0;
765
766         _initDone();
767
768         if(CkMyPe()==0) {
769                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
770                 cb.send();
771         }
772 }
773
774 // Main chare: initialize system checkpoint manager
775 class CkCheckpointInit : public Chare {
776 public:
777   CkCheckpointInit(CkArgMsg *msg) {
778     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
779     delete msg;
780   }
781   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
782 };
783
784 #include "CkCheckpoint.def.h"
785