332c71133834e15720388f4d070f404ffd805690
[charm.git] / src / ck-core / ckcheckpoint.C
1 /*
2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
4
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7     see ckcheckpoint.h for change log
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include "charm++.h"
14 #include "ck.h"
15 #include "ckcheckpoint.h"
16
// Do-nothing stand-in with a printf-style signature. DEBCHK expands to this
// function, so debug trace calls compile but produce no output.
void noopit(const char* /*format*/, ...)
{
}
19
20 //#define DEBCHK   CkPrintf
21 #define DEBCHK noopit
22
23 #define DEBUGC(x) x
24 //#define DEBUGC(x) 
25
26 CkGroupID _sysChkptMgr;
27
28 typedef struct _GroupInfo{
29         CkGroupID gID;
30         int MigCtor, DefCtor;
31         char name[256];
32 } GroupInfo;
33 PUPbytes(GroupInfo)
34 PUPmarshall(GroupInfo)
35
// Restart bookkeeping; within this file these are only written by CkRestartMain().
int _inrestart = 0;      // 1 while CkRestartMain() is restoring state; reset to 0 before _initDone()
int _restarted = 0;      // set to 1 by CkRestartMain() and never cleared in this file
int _oldNumPes = 0;      // PE count read back from RO.dat during restart
int _chareRestored = 0;  // set to 1 once Chares_<pe>.dat has been unpacked on this PE
40
41 void CkCreateLocalChare(int epIdx, envelope *env);
42
43 // help class to find how many array elements
44 class ElementCounter : public CkLocIterator {
45 private:
46         int count;
47 public:
48         ElementCounter():count(0){};
49         void addLocation(CkLocation &loc)  { count++; }
50         int getCount() { return count; }
51 };
52
53 // helper class to pup all elements that belong to same ckLocMgr
54 class ElementCheckpointer : public CkLocIterator {
55 private:
56         CkLocMgr *locMgr;
57         PUP::er &p;
58 public:
59         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
60         void addLocation(CkLocation &loc) {
61                 CkArrayIndex idx=loc.getIndex();
62                 CkGroupID gID = locMgr->ckGetGroupID();
63                 p|gID;      // store loc mgr's GID as well for easier restore
64                 p|idx;
65                 p|loc;
66                 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
67         }
68 };
69
70
71 extern void _initDone();
72
73 static void bdcastRO(void){
74         int i;
75         //Determine the size of the RODataMessage
76         PUP::sizer ps;
77         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
78
79         //Allocate and fill out the RODataMessage
80         envelope *env = _allocEnv(RODataMsg, ps.size());
81         PUP::toMem pp((char *)EnvToUsr(env));
82         for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
83         
84         env->setCount(++_numInitMsgs);
85         env->setSrcPe(CkMyPe());
86         CmiSetHandler(env, _roRestartHandlerIdx);
87         CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
88 }
89
90 // Print out an array index to this string as decimal fields
91 // separated by underscores.
92 void printIndex(const CkArrayIndex &idx,char *dest) {
93         const int *idxData=idx.data();
94         for (int i=0;i<idx.nInts;i++) {
95                 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
96                 dest+=strlen(dest);
97         }
98 }
99
100 static void checkpointOne(const char* dirname, CkCallback& cb);
101
102 // broadcast
103 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
104         chkptStartTimer = CmiWallTimer();
105         // every body make dir in case it is local directory
106         CmiMkdir(dirname);
107
108         if (CkMyPe() == 0) {
109           checkpointOne(dirname, cb);
110         }
111
112         char fileName[1024];
113
114 #ifndef CMK_CHARE_USE_PTR
115         // save groups into Chares.dat
116         sprintf(fileName,"%s/Chares_%d.dat",dirname,CkMyPe());
117         FILE* fChares = CmiFopen(fileName,"wb");
118         if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
119         PUP::toDisk pChares(fChares);
120         CkPupChareData(pChares);
121         CmiFclose(fChares);
122 #endif
123
124         // save groups into Groups.dat
125         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
126         sprintf(fileName,"%s/Groups_%d.dat",dirname,CkMyPe());
127         FILE* fGroups = CmiFopen(fileName,"wb");
128         if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
129         PUP::toDisk pGroups(fGroups);
130 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
131     CkPupGroupData(pGroups,CmiTrue);
132 #else
133     CkPupGroupData(pGroups);
134 #endif
135         CmiFclose(fGroups);
136
137         // save nodegroups into NodeGroups.dat
138         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
139         if (CkMyRank() == 0) {
140           sprintf(fileName,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
141           FILE* fNodeGroups = CmiFopen(fileName,"wb");
142           if(!fNodeGroups) 
143             CkAbort("Failed to create checkpoint file for nodegroup table!");
144           PUP::toDisk pNodeGroups(fNodeGroups);
145 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
146       CkPupNodeGroupData(pNodeGroups,CmiTrue);
147 #else
148       CkPupNodeGroupData(pNodeGroups);
149 #endif
150           CmiFclose(fNodeGroups);
151         }
152
153         //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
154         sprintf(fileName,"%s/arr_%d.dat",dirname, CkMyPe());
155         FILE *datFile=CmiFopen(fileName,"wb");
156         if (datFile==NULL) CkAbort("Could not create data file");
157         PUP::toDisk  p(datFile);
158         CkPupArrayElementsData(p);
159         CmiFclose(datFile);
160
161 #if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
162         system("sync");
163 #endif
164
165         restartCB = cb;
166         DEBCHK("[%d]restartCB installed\n",CkMyPe());
167         CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
168         //contribute(0,NULL,CkReduction::sum_int,localcb);
169         barrier(localcb);
170 }
171
172 void CkCheckpointMgr::SendRestartCB(CkReductionMsg *m){ 
173         delete m; 
174         DEBCHK("[%d]Sending out the cb\n",CkMyPe());
175         CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
176         restartCB.send(); 
177 }
178
179 void CkPupROData(PUP::er &p)
180 {
181         int _numReadonlies;
182         if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
183         p|_numReadonlies;
184         if (p.isUnpacking()) {
185           if (_numReadonlies != _readonlyTable.size())
186             CkAbort("You cannot add readonlies and restore from checkpoint...");
187         }
188         for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
189 }
190
191 // handle main chare
192 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
193 {
194         int nMains=_mainTable.size();
195         DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
196         for(int i=0;i<nMains;i++){  /* Create all mainchares */
197                 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
198                 int entryMigCtor = entry->getMigCtor();
199                 if(entryMigCtor!=-1) {
200                         Chare* obj;
201                         if (p.isUnpacking()) {
202                                 int size = entry->size;
203                                 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
204                                 obj = (Chare*)malloc(size);
205                                 _MEMCHECK(obj);
206                                 _mainTable[i]->setObj(obj);
207                                 //void *m = CkAllocSysMsg();
208                                 _entryTable[entryMigCtor]->call(args, obj);
209                         }
210                         else 
211                                 obj = (Chare *)_mainTable[i]->getObj();
212                         obj->pup(p);
213                 }
214         }
215         // to update mainchare proxy
216         // only readonly variables of Chare Proxy is taken care of here;
217         // in general, if chare proxy is contained in some data structure
218         // for example CkCallback, it is user's responsibility to
219         // update them after restarting
220         if (p.isUnpacking() && CkMyPe()==0)
221                 bdcastRO();
222 }
223
224 #ifndef CMK_CHARE_USE_PTR
225
226 CkpvExtern(CkVec<void *>, chare_objs);
227 CkpvExtern(CkVec<int>, chare_types);
228 CkpvExtern(CkVec<VidBlock *>, vidblocks);
229
230 // handle plain non-migratable chare
231 void CkPupChareData(PUP::er &p)
232 {
233   int i, n;
234   if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
235   p|n;
236   for (i=0; i<n; i++) {
237         int chare_type;
238         if (!p.isUnpacking()) {
239                 chare_type = CkpvAccess(chare_types)[i];
240         }
241         p | chare_type;
242         if (p.isUnpacking()) {
243                 int migCtor = _chareTable[chare_type]->migCtor;
244                 if(migCtor==-1) {
245                         char buf[512];
246                         sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
247                         CkAbort(buf);
248                 }
249                 void *m = CkAllocSysMsg();
250                 envelope* env = UsrToEnv((CkMessage *)m);
251                 CkCreateLocalChare(migCtor, env);
252                 CkFreeSysMsg(m);
253         }
254         Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
255         obj->pup(p);
256   }
257
258   if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
259   p|n;
260   for (i=0; i<n; i++) {
261         VidBlock *v;
262         if (p.isUnpacking()) {
263                 v = new VidBlock();
264                 CkpvAccess(vidblocks).push_back(v);
265         }
266         else
267                 v = CkpvAccess(vidblocks)[i];
268         v->pup(p);
269   }
270 }
271 #else
272 void CkPupChareData(PUP::er &p)
273 {
274    // not implemented
275 }
276 #endif
277
278 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
279 // handle GroupTable and data
280 void CkPupGroupData(PUP::er &p, CmiBool create)
281 {
282         int numGroups, i;
283
284         if (!p.isUnpacking()) {
285           numGroups = CkpvAccess(_groupIDTable)->size();
286         }
287         p|numGroups;
288         if (p.isUnpacking()) {
289           if(CkMyPe()==0)  
290             CkpvAccess(_numGroups) = numGroups+1; 
291           else 
292             CkpvAccess(_numGroups) = 1;
293         }
294         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
295
296         GroupInfo *tmpInfo = new GroupInfo [numGroups];
297         if (!p.isUnpacking()) {
298           for(i=0;i<numGroups;i++) {
299                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
300                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
301                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
302                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
303                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
304                 //CkPrintf("[%d] CkPupGroupData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
305
306                 if(tmpInfo[i].MigCtor==-1) {
307                         char buf[512];
308                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
309                         CkAbort(buf);
310                 }
311           }
312         }
313         for (i=0; i<numGroups; i++) p|tmpInfo[i];
314
315         for(i=0;i<numGroups;i++) 
316         {
317           CkGroupID gID = tmpInfo[i].gID;
318           if (p.isUnpacking()) {
319             //CkpvAccess(_groupIDTable)->push_back(gID);
320             int eIdx = tmpInfo[i].MigCtor;
321             // error checking
322             if (eIdx == -1) {
323               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
324             }
325             void *m = CkAllocSysMsg();
326             envelope* env = UsrToEnv((CkMessage *)m);
327                 if(create)
328                     CkCreateLocalGroup(gID, eIdx, env);
329           }   // end of unPacking
330           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
331           // if using migration constructor, you'd better have a pup
332                 if(!create)
333                         gobj->mlogData->teamRecoveryFlag = 1;
334           gobj->pup(p);
335          // CkPrintf("Group PUP'ed: gid = %d, name = %s\n",gobj->ckGetGroupID().idx, tmpInfo[i].name);
336         }
337         delete [] tmpInfo;
338 }
339
340 // handle NodeGroupTable and data
341 void CkPupNodeGroupData(PUP::er &p, CmiBool create)
342 {
343         int numNodeGroups, i;
344         if (!p.isUnpacking()) {
345           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
346         }
347         p|numNodeGroups;
348         if (p.isUnpacking()) {
349           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
350           else { CksvAccess(_numNodeGroups) = 1; }
351         }
352
353         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
354         if (!p.isUnpacking()) {
355           for(i=0;i<numNodeGroups;i++) {
356                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
357                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
358                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
359                 if(tmpInfo[i].MigCtor==-1) {
360                         char buf[512];
361                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
362                                      declared as [migratable] in .ci to be able to checkpoint.",\
363                                      _chareTable[ent2.getcIdx()]->name);
364                         CkAbort(buf);
365                 }
366           }
367         }
368         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
369         for (i=0;i<numNodeGroups;i++) {
370                 CkGroupID gID = tmpInfo[i].gID;
371                 if (p.isUnpacking()) {
372                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
373                         int eIdx = tmpInfo[i].MigCtor;
374                         void *m = CkAllocSysMsg();
375                         envelope* env = UsrToEnv((CkMessage *)m);
376                         if(create){
377                                 CkCreateLocalNodeGroup(gID, eIdx, env);
378                         }
379                 }
380                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
381                 IrrGroup *obj = ent2.getObj();
382                 obj->pup(p);
383         }
384         delete [] tmpInfo;
385 }
386 #else
387 // handle GroupTable and data
388 void CkPupGroupData(PUP::er &p)
389 {
390         int numGroups, i;
391
392         if (!p.isUnpacking()) {
393           numGroups = CkpvAccess(_groupIDTable)->size();
394         }
395         p|numGroups;
396         if (p.isUnpacking()) {
397           if(CkMyPe()==0)  
398             CkpvAccess(_numGroups) = numGroups+1; 
399           else 
400             CkpvAccess(_numGroups) = 1;
401         }
402         DEBCHK("[%d] CkPupGroupData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
403
404         GroupInfo *tmpInfo = new GroupInfo [numGroups];
405         if (!p.isUnpacking()) {
406           for(i=0;i<numGroups;i++) {
407                 tmpInfo[i].gID = (*CkpvAccess(_groupIDTable))[i];
408                 TableEntry ent = CkpvAccess(_groupTable)->find(tmpInfo[i].gID);
409                 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
410                 tmpInfo[i].DefCtor = _chareTable[ent.getcIdx()]->defCtor;
411                 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
412                 DEBCHK("[%d] CkPupGroupData: %s group %s \n",
413                         CkMyPe(), p.typeString(), tmpInfo[i].name);
414
415                 if(tmpInfo[i].MigCtor==-1) {
416                         char buf[512];
417                         sprintf(buf,"Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
418                         CkAbort(buf);
419                 }
420           }
421         }
422         for (i=0; i<numGroups; i++) p|tmpInfo[i];
423
424         for(i=0;i<numGroups;i++) 
425         {
426           CkGroupID gID = tmpInfo[i].gID;
427           if (p.isUnpacking()) {
428             //CkpvAccess(_groupIDTable)->push_back(gID);
429             int eIdx = tmpInfo[i].MigCtor;
430             // error checking
431             if (eIdx == -1) {
432               CkPrintf("[%d] ERROR> Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name); CkAbort("Abort");
433             }
434             void *m = CkAllocSysMsg();
435             envelope* env = UsrToEnv((CkMessage *)m);
436             CkCreateLocalGroup(gID, eIdx, env);
437           }   // end of unPacking
438           IrrGroup *gobj = CkpvAccess(_groupTable)->find(gID).getObj();
439           // if using migration constructor, you'd better have a pup
440           gobj->pup(p);
441           DEBCHK("Group PUP'ed: gid = %d, name = %s\n",
442                         gobj->ckGetGroupID().idx, tmpInfo[i].name);
443       //   CkPrintf("Group PUP'ed: gid = %d, name = %s\n",
444                         //gobj->ckGetGroupID().idx, tmpInfo[i].name);
445         }
446         delete [] tmpInfo;
447 }
448
449 // handle NodeGroupTable and data
450 void CkPupNodeGroupData(PUP::er &p)
451 {
452         int numNodeGroups, i;
453         if (!p.isUnpacking()) {
454           numNodeGroups = CksvAccess(_nodeGroupIDTable).size();
455         }
456         p|numNodeGroups;
457         if (p.isUnpacking()) {
458           if(CkMyPe()==0){ CksvAccess(_numNodeGroups) = numNodeGroups+1; }
459           else { CksvAccess(_numNodeGroups) = 1; }
460         }
461         DEBCHK("[%d] CkPupNodeGroupData %s: numNodeGroups = %d\n",CkMyPe(),p.typeString(),numNodeGroups);
462
463         GroupInfo *tmpInfo = new GroupInfo [numNodeGroups];
464         if (!p.isUnpacking()) {
465           for(i=0;i<numNodeGroups;i++) {
466                 tmpInfo[i].gID = CksvAccess(_nodeGroupIDTable)[i];
467                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(tmpInfo[i].gID);
468                 tmpInfo[i].MigCtor = _chareTable[ent2.getcIdx()]->migCtor;
469                 if(tmpInfo[i].MigCtor==-1) {
470                         char buf[512];
471                         sprintf(buf,"NodeGroup %s either need a migration constructor and\n\
472                                      declared as [migratable] in .ci to be able to checkpoint.",\
473                                      _chareTable[ent2.getcIdx()]->name);
474                         CkAbort(buf);
475                 }
476           }
477         }
478         for (i=0; i<numNodeGroups; i++) p|tmpInfo[i];
479         for (i=0;i<numNodeGroups;i++) {
480                 CkGroupID gID = tmpInfo[i].gID;
481                 if (p.isUnpacking()) {
482                         //CksvAccess(_nodeGroupIDTable).push_back(gID);
483                         int eIdx = tmpInfo[i].MigCtor;
484                         void *m = CkAllocSysMsg();
485                         envelope* env = UsrToEnv((CkMessage *)m);
486                         CkCreateLocalNodeGroup(gID, eIdx, env);
487                 }
488                 TableEntry ent2 = CksvAccess(_nodeGroupTable)->find(gID);
489                 IrrGroup *obj = ent2.getObj();
490                 obj->pup(p);
491                 DEBCHK("Nodegroup PUP'ed: gid = %d, name = %s\n",
492                         obj->ckGetGroupID().idx,
493                         _chareTable[ent2.getcIdx()]->name);
494         }
495         delete [] tmpInfo;
496 }
497 #endif
498
499 // handle chare array elements for this processor
500 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
501 {
502         int i;
503         // safe in both packing/unpakcing at this stage
504         int numGroups = CkpvAccess(_groupIDTable)->size();
505
506         // number of array elements on this processor
507         int numElements;
508         if (!p.isUnpacking()) {
509           ElementCounter  counter;
510           CKLOCMGR_LOOP(mgr->iterate(counter););
511           numElements = counter.getCount();
512         }
513         p|numElements;
514
515         DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
516
517         if (!p.isUnpacking())
518         {
519           // let CkLocMgr to iterate and store every array elements
520           CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
521         }
522         else {
523           // loop and create all array elements ourselves
524           //CkPrintf("total chare array cnts: %d\n", numElements);
525           for (int i=0; i<numElements; i++) {
526                 CkGroupID gID;
527                 CkArrayIndex idx;
528                 p|gID;
529                 p|idx;
530                 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
531                 if (notifyListeners){
532                   mgr->resume(idx,p,CmiTrue);
533                 }
534                 else{
535                   mgr->restore(idx,p);
536                 }
537           }
538         }
539         // finish up
540         if (notifyListeners)
541         for(i=0;i<numGroups;i++) {
542                 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
543                 obj->ckJustMigrated();
544         }
545 }
546
547 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) ||CMK_MEM_CHECKPOINT
548 int  CkCountArrayElements(){
549     int numGroups = CkpvAccess(_groupIDTable)->size();
550     int i;
551     ElementCounter  counter;
552     CKLOCMGR_LOOP(mgr->iterate(counter););
553   int numElements = counter.getCount();
554     return numElements;
555 }
556 #endif
557
558 void CkPupProcessorData(PUP::er &p)
559 {
560     // save readonlys, and callback BTW
561     if(CkMyRank()==0) {
562         CkPupROData(p);
563     }
564
565     // save mainchares into MainChares.dat
566     if(CkMyPe()==0) {
567       CkPupMainChareData(p, NULL);
568     }
569         
570     // save non-migratable chare
571     CkPupChareData(p);
572
573     // save groups 
574 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
575     CkPupGroupData(p,CmiTrue);
576 #else
577     CkPupGroupData(p);
578 #endif
579
580     // save nodegroups
581     if(CkMyRank()==0) {
582 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
583         CkPupNodeGroupData(p,CmiTrue);  
584 #else
585         CkPupNodeGroupData(p);
586 #endif
587     }
588
589     // pup array elements
590     CkPupArrayElementsData(p);
591 }
592
593 // called only on pe 0
594 static void checkpointOne(const char* dirname, CkCallback& cb){
595         CmiAssert(CkMyPe()==0);
596         char filename[1024];
597         
598         // save readonlys, and callback BTW
599         sprintf(filename,"%s/RO.dat",dirname);
600         FILE* fRO = CmiFopen(filename,"wb");
601         if(!fRO) CkAbort("Failed to create checkpoint file for readonly data!");
602         PUP::toDisk pRO(fRO);
603         int _numPes = CkNumPes();
604         pRO|_numPes;
605         CkPupROData(pRO);
606         pRO|cb;
607         CmiFclose(fRO);
608
609         // save mainchares into MainChares.dat
610         {
611                 sprintf(filename,"%s/MainChares.dat",dirname);
612                 FILE* fMain = CmiFopen(filename,"wb");
613                 if(!fMain) CkAbort("Failed to open checkpoint file for mainchare data!");
614                 PUP::toDisk pMain(fMain);
615                 CkPupMainChareData(pMain, NULL);
616                 CmiFclose(fMain);
617         }
618 }
619
620 void CkRemoveArrayElements()
621 {
622   int i;
623   int numGroups = CkpvAccess(_groupIDTable)->size();
624   CKLOCMGR_LOOP(mgr->flushAllRecs(););
625 /*  GroupTable *gTbl = CkpvAccess(_groupTable);
626   for(i=0; i<numGroups; i++){
627     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
628     if(obj->isLocMgr()) {
629         CkLocMgr *mgr = (CkLocMgr *)obj;
630         mgr->flushAllRecs();
631     }
632   }*/
633 }
634
635 /*
636 void CkTestArrayElements()
637 {
638   int i;
639   int numGroups = CkpvAccess(_groupIDTable)->size();
640   //CKLOCMGR_LOOP(mgr->flushAllRecs(););
641   GroupTable *gTbl = CkpvAccess(_groupTable);
642   for(i=0; i<numGroups; i++){
643     IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
644     CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
645   }
646 }
647 */
648
649 void CkStartCheckpoint(const char* dirname,const CkCallback& cb)
650 {
651         CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
652         
653         // hand over to checkpoint managers for per-processor checkpointing
654         CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb);
655 }
656
/**
  * Restart: No restart-manager object is created, because a group
  *          cannot restore itself anyway.
  *          The mechanism exists as Converse code and is invoked by a
  *          broadcast message.
  **/
663
664 void CkRestartMain(const char* dirname, CkArgMsg *args){
665         int i;
666         char filename[1024];
667         CkCallback cb;
668         
669         _inrestart = 1;
670         _restarted = 1;
671         CkMemCheckPT::inRestarting = 1;
672
673         // restore readonlys
674         sprintf(filename,"%s/RO.dat",dirname);
675         FILE* fRO = CmiFopen(filename,"rb");
676         if(!fRO) CkAbort("Failed to open checkpoint file for readonly data!");
677         int _numPes = -1;
678         PUP::fromDisk pRO(fRO);
679         pRO|_numPes;
680         CkPupROData(pRO);
681         pRO|cb;
682         CmiFclose(fRO);
683         DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
684         _oldNumPes = _numPes;
685
686         CmiNodeBarrier();
687
688         // restore mainchares
689         sprintf(filename,"%s/MainChares.dat",dirname);
690         FILE* fMain = CmiFopen(filename,"rb");
691         if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
692                 PUP::fromDisk pMain(fMain);
693                 CkPupMainChareData(pMain, args);
694                 CmiFclose(fMain);
695                 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
696                 //bdcastRO(); // moved to CkPupMainChareData()
697         }
698         
699 #ifndef CMK_CHARE_USE_PTR
700         // restore chares only when number of pes is the same 
701         if(CkNumPes() == _numPes) {
702                 sprintf(filename,"%s/Chares_%d.dat",dirname,CkMyPe());
703                 FILE* fChares = CmiFopen(filename,"rb");
704                 if(!fChares) CkAbort("Failed to open checkpoint file for chares!");
705                 PUP::fromDisk pChares(fChares);
706                 CkPupChareData(pChares);
707                 CmiFclose(fChares);
708                 _chareRestored = 1;
709         }
710 #endif
711
712         // restore groups
713         // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
714         // restore from PE0's copy if shrink/expand
715         if(CkNumPes() != _numPes)
716                 sprintf(filename,"%s/Groups_0.dat",dirname);
717         else
718                 sprintf(filename,"%s/Groups_%d.dat",dirname,CkMyPe());
719         FILE* fGroups = CmiFopen(filename,"rb");
720         if(!fGroups) CkAbort("Failed to open checkpoint file for group table!");
721         PUP::fromDisk pGroups(fGroups);
722 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
723     CkPupGroupData(pGroups,CmiTrue);
724 #else
725     CkPupGroupData(pGroups);
726 #endif
727         CmiFclose(fGroups);
728
729         // restore nodegroups
730         // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
731         if(CkMyRank()==0){
732                 if(CkNumPes() != _numPes)
733                         sprintf(filename,"%s/NodeGroups_0.dat",dirname);
734                 else
735                         sprintf(filename,"%s/NodeGroups_%d.dat",dirname,CkMyNode());
736                 FILE* fNodeGroups = CmiFopen(filename,"rb");
737                 if(!fNodeGroups) CkAbort("Failed to open checkpoint file for nodegroup table!");
738                 PUP::fromDisk pNodeGroups(fNodeGroups);
739 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
740         CkPupNodeGroupData(pNodeGroups,CmiTrue);
741 #else
742         CkPupNodeGroupData(pNodeGroups);
743 #endif
744                 CmiFclose(fNodeGroups);
745         }
746
747         // for each location, restore arrays
748         //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
749         DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
750         if(CkMyPe() < _numPes)  // in normal range: restore, otherwise, do nothing
751           for (i=0; i<_numPes;i++) {
752             if (i%CkNumPes() == CkMyPe()) {
753               sprintf(filename,"%s/arr_%d.dat",dirname, i);
754               FILE *datFile=CmiFopen(filename,"rb");
755               if (datFile==NULL) CkAbort("Could not read data file");
756               PUP::fromDisk  p(datFile);
757               CkPupArrayElementsData(p);
758               CmiFclose(datFile);
759             }
760           }
761
762         _inrestart = 0;
763
764         _initDone();
765         CkMemCheckPT::inRestarting = 0;
766         if(CkMyPe()==0) {
767                 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
768                 
769                 cb.send();
770         }
771 }
772
773 // Main chare: initialize system checkpoint manager
774 class CkCheckpointInit : public Chare {
775 public:
776   CkCheckpointInit(CkArgMsg *msg) {
777     _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
778     delete msg;
779   }
780   CkCheckpointInit(CkMigrateMessage *m) {delete m;}
781 };
782
783 #include "CkCheckpoint.def.h"
784