changa  3.5
 All Classes Files Functions Variables Typedefs Enumerations Friends Macros Groups Pages
Orb3dLBCommon.h
1 //Abhishek - Extract common code from Orb and mslb_notopo
2 #ifndef _ORB3DHELPER_H
3 #define _ORB3DHELPER_H
4 
5 #include <charm++.h>
6 #include <charm++.h>
7 #include "cklists.h"
8 #include "ParallelGravity.h"
9 #include "TopoManager.h"
10 
11 #include "Refiner.h"
12 #include "MapStructures.h"
13 #include "TaggedVector3D.h"
14 #include "Vector3D.h"
15 #include "CentralLB.h"
16 #define ORB3DLB_NOTOPO_DEBUG(X)
17 // #define ORB3DLB_NOTOPO_DEBUG(X) CkPrintf X
18 
20 class PeInfo {
21  public:
22  int idx;
23  double load;
24  double items;
25  PeInfo(int id, double ld, int it) : idx(id), load(ld), items(it) {}
26 };
27 
30  public:
31  bool operator()(PeInfo& p1, PeInfo& p2) {
32  // This can be done based on load or number of tps assigned to a PE
33  return (p1.load > p2.load);
34  }
35 };
36 
39  // pointer to stats->to_proc
40  protected:
41  decltype(BaseLB::LDStats::to_proc) *mapping;
42  decltype(BaseLB::LDStats::from_proc) *from;
43 
44  CkVec<float> procload;
45 
48  double maxPieceProc;
49 
51  int nextProc;
52 
53  // Greedy strategy to assign TreePieces to PEs on a node.
54  void orbPePartition(vector<Event> *events, vector<OrbObject> &tp, int node,
55  BaseLB::LDStats *stats) {
56 
57  std::vector<PeInfo> peinfo;
58  float totalLoad = 0.0;
59  int firstProc = CkNodeFirst(node);
60  int lastProc = firstProc + CkNodeSize(node) - 1;
61  for (int i = firstProc; i <= lastProc; i++) {
62  peinfo.push_back(PeInfo(i, 0.0, 0));
63  }
64  // Make a heap of processors belonging to this node
65  std::make_heap(peinfo.begin(), peinfo.end(), ProcLdGreater());
66 
67  int nextProc;
68  for(int i = 0; i < events[XDIM].size(); i++){
69  Event &ev = events[XDIM][i];
70  OrbObject &orb = tp[ev.owner];
71 
72  // Pop the least loaded PE from the heap and assign TreePiece to it
73  PeInfo p = peinfo.front();
74  pop_heap(peinfo.begin(), peinfo.end(), ProcLdGreater());
75  peinfo.pop_back();
76 
77  nextProc = p.idx;
78 
79  if(orb.numParticles > 0){
80  (*mapping)[orb.lbindex] = nextProc;
81  procload[nextProc] += ev.load;
82  p.load += ev.load;
83  p.items += 1;
84  totalLoad += ev.load;
85  } else{
86  int fromPE = (*from)[orb.lbindex];
87  procload[fromPE] += ev.load;
88  }
89 
90  peinfo.push_back(p);
91  push_heap(peinfo.begin(), peinfo.end(), ProcLdGreater());
92  }
93  }
94 
103  void orbPartition(vector<Event> *events, OrientedBox<float> &box, int nprocs,
104  vector<OrbObject> & tp, BaseLB::LDStats *stats,
105  bool node_partition=false){
106 
107  ORB3DLB_NOTOPO_DEBUG(("partition events %d %d %d nprocs %d\n",
108  events[XDIM].size(),
109  events[YDIM].size(),
110  events[ZDIM].size(),
111  nprocs
112  ));
113  int numEvents = events[XDIM].size();
114  CkAssert(numEvents == events[YDIM].size());
115  CkAssert(numEvents == events[ZDIM].size());
116 
117  if(numEvents == 0)
118  return;
119 
120  if(nprocs == 1){
121  ORB3DLB_NOTOPO_DEBUG(("base: assign %d tps to proc %d\n", numEvents, nextProc));
122  if (!stats->procs[nextProc].available) {
123  nextProc++;
124  return;
125  }
126 
127  // If we are doing orb partition at the node level, then call
128  // orbPePartition to assign the treepieces to the PEs belonging to the node.
129  if (node_partition) {
130  orbPePartition(events, tp, nextProc, stats);
131  } else {
132  // direct assignment of tree pieces to processors
133  //if(numEvents > 0) CkAssert(nprocs != 0);
134  float totalLoad = 0.0;
135  for(int i = 0; i < events[XDIM].size(); i++){
136  Event &ev = events[XDIM][i];
137  OrbObject &orb = tp[ev.owner];
138  if(orb.numParticles > 0){
139  (*mapping)[orb.lbindex] = nextProc;
140  totalLoad += ev.load;
141  }
142  else{
143  int fromPE = (*from)[orb.lbindex];
144  if (fromPE < 0 || fromPE >= procload.size()) {
145  CkPrintf("[%d] trying to access fromPe %d nprocs %lu\n", CkMyPe(), fromPE, procload.size());
146  CkAbort("Trying to access a PE which is outside the range\n");
147  }
148  procload[fromPE] += ev.load;
149  }
150  }
151  procload[nextProc] += totalLoad;
152  }
153 
154  if(numEvents > 0) nextProc++;
155  return;
156  }
157 
158  // find longest dimension
159 
160  int longestDim = XDIM;
161  float longestDimLength = box.greater_corner[longestDim] - box.lesser_corner[longestDim];
162  for(int i = YDIM; i <= ZDIM; i++){
163  float thisDimLength = box.greater_corner[i]-box.lesser_corner[i];
164  if(thisDimLength > longestDimLength){
165  longestDimLength = thisDimLength;
166  longestDim = i;
167  }
168  }
169 
170  ORB3DLB_NOTOPO_DEBUG(("dimensions %f %f %f longest %d\n",
171  box.greater_corner[XDIM]-box.lesser_corner[XDIM],
172  box.greater_corner[YDIM]-box.lesser_corner[YDIM],
173  box.greater_corner[ZDIM]-box.lesser_corner[ZDIM],
174  longestDim
175  ));
176 
177  int nlprocs = nprocs/2;
178  int nrprocs = nprocs-nlprocs;
179 
180  float ratio = (1.0*nlprocs)/(1.0*(nlprocs+nrprocs));
181 
182  // sum background load on each side of the processor split
183  float bglprocs = 0.0;
184  for(int np = nextProc; np < nextProc + nlprocs; np++)
185  bglprocs += stats->procs[np].bg_walltime;
186  float bgrprocs = 0.0;
187  for(int np = nextProc + nlprocs; np < nextProc + nlprocs + nrprocs; np++)
188  bgrprocs += stats->procs[np].bg_walltime;
189 
190  ORB3DLB_NOTOPO_DEBUG(("nlprocs %d nrprocs %d ratio %f\n", nlprocs, nrprocs, ratio));
191 
192  int splitIndex = partitionRatioLoad(events[longestDim],ratio,bglprocs,
193  bgrprocs);
194  if(splitIndex == numEvents) {
195  ORB3DLB_NOTOPO_DEBUG(("evenly split 0 load\n"));
196  splitIndex = splitIndex/2;
197  }
198  int nleft = splitIndex;
199  int nright = numEvents-nleft;
200 
201 #if 0
202  if(nright < nrprocs) { // at least one piece per processor
203  nright = nrprocs;
204  nleft = splitIndex = numEvents-nright;
205  CkAssert(nleft >= nlprocs);
206  }
207  else if(nleft < nlprocs) {
208  nleft = splitIndex = nlprocs;
209  nright = numEvents-nleft;
210  CkAssert(nright >= nrprocs);
211  }
212 #endif
213 
214  if(nleft > nlprocs*maxPieceProc) {
215  nleft = splitIndex = (int) (nlprocs*maxPieceProc);
216  nright = numEvents-nleft;
217  }
218  else if (nright > nrprocs*maxPieceProc) {
219  nright = (int) (nrprocs*maxPieceProc);
220  nleft = splitIndex = numEvents-nright;
221  }
222  CkAssert(splitIndex >= 0);
223  CkAssert(splitIndex < numEvents);
224 
225  OrientedBox<float> leftBox;
226  OrientedBox<float> rightBox;
227 
228  leftBox = rightBox = box;
229  float splitPosition = events[longestDim][splitIndex].position;
230  leftBox.greater_corner[longestDim] = splitPosition;
231  rightBox.lesser_corner[longestDim] = splitPosition;
232 
233  // classify events
234  for(int i = 0; i < splitIndex; i++){
235  Event &ev = events[longestDim][i];
236  CkAssert(ev.owner >= 0);
237  CkAssert(tp[ev.owner].partition == INVALID_PARTITION);
238  tp[ev.owner].partition = LEFT_PARTITION;
239  }
240  for(int i = splitIndex; i < numEvents; i++){
241  Event &ev = events[longestDim][i];
242  CkAssert(ev.owner >= 0);
243  CkAssert(tp[ev.owner].partition == INVALID_PARTITION);
244  tp[ev.owner].partition = RIGHT_PARTITION;
245  }
246 
247  vector<Event> leftEvents[NDIMS];
248  vector<Event> rightEvents[NDIMS];
249 
250  for(int i = 0; i < NDIMS; i++){
251  if(i == longestDim){
252  leftEvents[i].resize(nleft);
253  rightEvents[i].resize(nright);
254  }
255  else{
256  leftEvents[i].reserve(nleft);
257  rightEvents[i].reserve(nright);
258  }
259  }
260  // copy events of split dimension
261  memcpy(&leftEvents[longestDim][0],&events[longestDim][0],sizeof(Event)*nleft);
262  memcpy(&rightEvents[longestDim][0],&events[longestDim][splitIndex],sizeof(Event)*nright);
263 
264  // copy events of other dimensions
265  for(int i = XDIM; i <= ZDIM; i++){
266  if(i == longestDim) continue;
267  for(int j = 0; j < numEvents; j++){
268  Event &ev = events[i][j];
269  CkAssert(ev.owner >= 0);
270  OrbObject &orb = tp[ev.owner];
271  CkAssert(orb.partition != INVALID_PARTITION);
272  if(orb.partition == LEFT_PARTITION) leftEvents[i].push_back(ev);
273  else if(orb.partition == RIGHT_PARTITION) rightEvents[i].push_back(ev);
274  }
275  }
276 
277  // cleanup
278  // next, reset the ownership information in the
279  // OrbObjects, so that the next invocation may use
280  // the same locations for its book-keeping
281  vector<Event> &eraseVec = events[longestDim];
282  for(int i = 0; i < numEvents; i++){
283  Event &ev = eraseVec[i];
284  CkAssert(ev.owner >= 0);
285  OrbObject &orb = tp[ev.owner];
286  CkAssert(orb.partition != INVALID_PARTITION);
287  orb.partition = INVALID_PARTITION;
288  }
289 
290  // free events from parent node,
291  // since they are not needed anymore
292  // (we have partition all events into the
293  // left and right event subsets)
294  for(int i = 0; i < NDIMS; i++){
295  //events[i].free();
296  vector<Event>().swap(events[i]);
297  }
298  orbPartition(leftEvents,leftBox,nlprocs,tp, stats, node_partition);
299  orbPartition(rightEvents,rightBox,nrprocs,tp, stats, node_partition);
300  }
301 
308  void orbPrepare(vector<Event> *tpEvents, OrientedBox<float> &box, int
309  numobjs, BaseLB::LDStats * stats, bool node_partition=false){
310 
311  int nmig = stats->n_migrateobjs;
312  if(dMaxBalance < 1.0)
313  dMaxBalance = 1.0;
314 
315  // If using node based orb partition, then the maxPieceProc is total
316  // migratable objs / total number of node.
317  if (node_partition) {
318  maxPieceProc = dMaxBalance * nmig / CkNumNodes();
319  } else {
320  maxPieceProc = dMaxBalance*nmig/stats->nprocs();
321  }
322 
323  if(maxPieceProc < 1.0)
324  maxPieceProc = 1.01;
325 
326  CkAssert(tpEvents[XDIM].size() == numobjs);
327  CkAssert(tpEvents[YDIM].size() == numobjs);
328  CkAssert(tpEvents[ZDIM].size() == numobjs);
329 
330  mapping = &stats->to_proc;
331  from = &stats->from_proc;
332 
333  CkPrintf("[Orb3dLB_notopo] sorting\n");
334  for(int i = 0; i < NDIMS; i++){
335  sort(tpEvents[i].begin(),tpEvents[i].end());
336  }
337 
338  box.lesser_corner.x = tpEvents[XDIM][0].position;
339  box.lesser_corner.y = tpEvents[YDIM][0].position;
340  box.lesser_corner.z = tpEvents[ZDIM][0].position;
341 
342  box.greater_corner.x = tpEvents[XDIM][numobjs-1].position;
343  box.greater_corner.y = tpEvents[YDIM][numobjs-1].position;
344  box.greater_corner.z = tpEvents[ZDIM][numobjs-1].position;
345 
346  nextProc = 0;
347 
348  procload.resize(stats->nprocs());
349  for(int i = 0; i < stats->nprocs(); i++){
350  procload[i] = stats->procs[i].bg_walltime;
351  }
352 
353  }
354 
355  void refine(BaseLB::LDStats *stats, int numobjs){
356 #ifdef DO_REFINE
357  int *from_procs = Refiner::AllocProcs(stats->nprocs(), stats);
358  int *to_procs = Refiner::AllocProcs(stats->nprocs(), stats);
359 #endif
360 
361  int migr = 0;
362  for(int i = 0; i < numobjs; i++){
363  if(stats->to_proc[i] != stats->from_proc[i]) migr++;
364 #ifdef DO_REFINE
365  int pe = stats->to_proc[i];
366  from_procs[i] = pe;
367  to_procs[i] = pe;
368 #endif
369  }
370 
371  int numRefineMigrated = 0;
372 #ifdef DO_REFINE
373  CkPrintf("[orb3dlb_notopo] refine\n");
374  Refiner refiner(1.050);
375  refiner.Refine(stats->nprocs(),stats,from_procs,to_procs);
376 
377  for(int i = 0; i < numobjs; i++){
378  if(to_procs[i] != from_procs[i]) numRefineMigrated++;
379  stats->to_proc[i] = to_procs[i];
380  }
381 #endif
382 
383  double *predLoad = new double[stats->nprocs()];
384  double *predCount = new double[stats->nprocs()];
385  for(int i = 0; i < stats->nprocs(); i++){
386  predLoad[i] = 0.0;
387  predCount[i] = 0.0;
388  }
389 
390  double maxObjLoad = 0.0;
391 
392  for(int i = 0; i < numobjs; i++){
393  double ld = stats->objData[i].wallTime;
394  int proc = stats->to_proc[i];
395  predLoad[proc] += ld;
396  predCount[proc] += 1.0;
397  if(ld > maxObjLoad)
398  maxObjLoad = ld;
399  }
400 
401  double minWall = 0.0;
402  double maxWall = 0.0;
403  double avgWall = 0.0;
404 
405  double minIdle = 0.0;
406  double maxIdle = 0.0;
407  double avgIdle = 0.0;
408 
409  double minBg = 0.0;
410  double maxBg = 0.0;
411  double avgBg = 0.0;
412 
413  double avgPred = 0.0;
414  double minPred = 0.0;
415  double maxPred = 0.0;
416 
417  double avgPiece = 0.0;
418  double minPiece = 0.0;
419  double maxPiece = 0.0;
420 
421  CkPrintf("***************************\n");
422  // CkPrintf("Before LB step %d\n", step());
423  for(int i = 0; i < stats->nprocs(); i++){
424  double wallTime = stats->procs[i].total_walltime;
425  double idleTime = stats->procs[i].idletime;
426  double bgTime = stats->procs[i].bg_walltime;
427  double pred = predLoad[i];
428  double npiece = predCount[i];
429  /*
430  CkPrintf("[pestats] %d %d %f %f %f %f\n",
431  i,
432  stats->procs[i].pe,
433  wallTime,
434  idleTime,
435  bgTime,
436  objTime);
437  */
438 
439  avgWall += wallTime;
440  avgIdle += idleTime;
441  avgBg += bgTime;
442  avgPred += pred;
443  avgPiece += npiece;
444 
445  if(i==0 || minWall > wallTime) minWall = wallTime;
446  if(i==0 || maxWall < wallTime) maxWall = wallTime;
447 
448  if(i==0 || minIdle > idleTime) minIdle = idleTime;
449  if(i==0 || maxIdle < idleTime) maxIdle = idleTime;
450 
451  if(i==0 || minBg > bgTime) minBg = bgTime;
452  if(i==0 || maxBg < bgTime) maxBg = bgTime;
453 
454  if(i==0 || minPred > pred) minPred = pred;
455  if(i==0 || maxPred < pred) maxPred = pred;
456 
457  if(i==0 || minPiece > npiece) minPiece = npiece;
458  if(i==0 || maxPiece < npiece) maxPiece = npiece;
459 
460  }
461 
462  avgWall /= stats->nprocs();
463  avgIdle /= stats->nprocs();
464  avgBg /= stats->nprocs();
465  avgPred /= stats->nprocs();
466  avgPiece /= stats->nprocs();
467 
468 #ifdef PRINT_LOAD_PERCENTILES
469  double accumVar = 0;
470  vector<double> objectWallTimes;
471  for(int i = 0; i < stats->nprocs(); i++){
472  double wallTime = stats->procs[i].total_walltime;
473  objectWallTimes.push_back(wallTime);
474  accumVar += (wallTime - avgWall) * (wallTime - avgWall);
475  }
476  double stdDev = sqrt(accumVar / stats->nprocs());
477  CkPrintf("Average load: %.3f\n", avgWall);
478  CkPrintf("Standard deviation: %.3f\n", stdDev);
479 
480  std::sort(objectWallTimes.begin(), objectWallTimes.end());
481  CkPrintf("Object load percentiles: \n");
482  double increment = (double) objectWallTimes.size() / 10;
483  int j = 0;
484  double index = 0;
485  for (int j = 0; j < 100; j += 10) {
486  index += increment;
487  CkPrintf("%d: %.3f\n", j, objectWallTimes[(int) index]);
488  }
489  CkPrintf("100: %.3f\n", objectWallTimes.back());
490 #endif
491 
492  delete[] predLoad;
493  delete[] predCount;
494 
495 #if 0
496  float minload, maxload, avgload;
497  minload = maxload = procload[0];
498  avgload = 0.0;
499  for(int i = 0; i < stats->nprocs(); i++){
500  CkPrintf("pe %d load %f box %f %f %f %f %f %f\n", i, procload[i],
501  procbox[i].lesser_corner.x,
502  procbox[i].lesser_corner.y,
503  procbox[i].lesser_corner.z,
504  procbox[i].greater_corner.x,
505  procbox[i].greater_corner.y,
506  procbox[i].greater_corner.z
507  );
508  avgload += procload[i];
509  if(minload > procload[i]) minload = procload[i];
510  if(maxload < procload[i]) maxload = procload[i];
511  }
512 
513  avgload /= stats->nprocs();
514 
515  CkPrintf("Orb3dLB_notopo stats: min %f max %f avg %f max/avg %f\n", minload, maxload, avgload, maxload/avgload);
516 #endif
517 
518 
519  CkPrintf("Orb3dLB_notopo stats: maxObjLoad %f\n", maxObjLoad);
520  CkPrintf("Orb3dLB_notopo stats: minWall %f maxWall %f avgWall %f maxWall/avgWall %f\n", minWall, maxWall, avgWall, maxWall/avgWall);
521  CkPrintf("Orb3dLB_notopo stats: minIdle %f maxIdle %f avgIdle %f minIdle/avgIdle %f\n", minIdle, maxIdle, avgIdle, minIdle/avgIdle);
522  CkPrintf("Orb3dLB_notopo stats: minPred %f maxPred %f avgPred %f maxPred/avgPred %f\n", minPred, maxPred, avgPred, maxPred/avgPred);
523  CkPrintf("Orb3dLB_notopo stats: minPiece %f maxPiece %f avgPiece %f maxPiece/avgPiece %f\n", minPiece, maxPiece, avgPiece, maxPiece/avgPiece);
524 
525  CkPrintf("Orb3dLB_notopo stats: minBg %f maxBg %f avgBg %f maxBg/avgBg %f\n", minBg, maxBg, avgBg, maxBg/avgBg);
526  CkPrintf("Orb3dLB_notopo stats: orb migrated %d refine migrated %d objects\n", migr, numRefineMigrated);
527 
528 #ifdef DO_REFINE
529  // Free the refine buffers
530  Refiner::FreeProcs(from_procs);
531  Refiner::FreeProcs(to_procs);
532 #endif
533 
534  }
535 
544  int partitionRatioLoad(vector<Event> &events, float ratio, float bglp, float bgrp){
545 
546  float approxBgPerEvent = (bglp + bgrp) / events.size();
547  float totalLoad = bglp + bgrp;
548  for(int i = 0; i < events.size(); i++){
549  totalLoad += events[i].load;
550  }
551  //CkPrintf("************************************************************\n");
552  //CkPrintf("partitionEvenLoad start %d end %d total %f\n", tpstart, tpend, totalLoad);
553  float perfectLoad = ratio * totalLoad;
554  ORB3DLB_NOTOPO_DEBUG(("partitionRatioLoad bgl %f bgr %f\n",
555  bglp, bgrp));
556  int splitIndex = 0;
557  float prevLoad = 0.0;
558  float leftLoadAtSplit = 0.0;
559  for(splitIndex = 0; splitIndex < events.size(); splitIndex++){
560 
561  leftLoadAtSplit += events[splitIndex].load + approxBgPerEvent;
562 
563  if (leftLoadAtSplit > perfectLoad) {
564  if ( fabs(leftLoadAtSplit - perfectLoad) < fabs(prevLoad - perfectLoad) ) {
565  splitIndex++;
566  }
567  else {
568  leftLoadAtSplit = prevLoad;
569  }
570  break;
571  }
572  prevLoad = leftLoadAtSplit;
573  }
574 
575  ORB3DLB_NOTOPO_DEBUG(("partitionEvenLoad mid %d lload %f rload %f ratio %f\n", splitIndex, leftLoadAtSplit, totalLoad - leftLoadAtSplit, leftLoadAtSplit / totalLoad));
576  return splitIndex;
577  }
578 
579 }; //end class
580 
581 #endif
void orbPartition(vector< Event > *events, OrientedBox< float > &box, int nprocs, vector< OrbObject > &tp, BaseLB::LDStats *stats, bool node_partition=false)
Recursively partition treepieces among processors by bisecting the load in orthogonal directions...
Definition: Orb3dLBCommon.h:103
double maxPieceProc
Definition: Orb3dLBCommon.h:48
Common methods among Orb3d class load balancers.
Definition: Orb3dLBCommon.h:38
int partitionRatioLoad(vector< Event > &events, float ratio, float bglp, float bgrp)
Given a vector of Events, find a split that partitions them into two partitions with a given ratio of...
Definition: Orb3dLBCommon.h:544
int numParticles
Particles in the TreePiece.
Definition: MapStructures.h:87
float load
load of this TreePiece
Definition: MapStructures.h:40
Utility class for sorting processor loads.
Definition: Orb3dLBCommon.h:29
void orbPrepare(vector< Event > *tpEvents, OrientedBox< float > &box, int numobjs, BaseLB::LDStats *stats, bool node_partition=false)
Prepare structures for the ORB partition.
Definition: Orb3dLBCommon.h:308
structure for sorting TreePieces in one dimension for load balancing.
Definition: MapStructures.h:36
data for Orb3D load balancing.
Definition: MapStructures.h:80
int lbindex
index into LB stats-&gt;objData
Definition: MapStructures.h:83
int owner
Index within TreePiece array or stats-&gt;objData array of this TreePiece.
Definition: MapStructures.h:38
Hold information about Pe load and number of objects.
Definition: Orb3dLBCommon.h:20
int nextProc
index of first processor of the group we are considering
Definition: Orb3dLBCommon.h:51
double dMaxBalance
Max piece imbalance for load balancing.
Definition: ParallelGravity.cpp:103