9 #include "TopoManager.h"
13 #include "TaggedVector3D.h"
15 #include "CentralLB.h"
16 #define ORB3DLB_NOTOPO_DEBUG(X)
25 PeInfo(
int id,
double ld,
int it) : idx(
id), load(ld), items(it) {}
33 return (p1.load > p2.load);
41 decltype(BaseLB::LDStats::to_proc) *mapping;
42 decltype(BaseLB::LDStats::from_proc) *from;
44 CkVec<float> procload;
54 void orbPePartition(vector<Event> *events, vector<OrbObject> &tp,
int node,
55 BaseLB::LDStats *stats) {
57 std::vector<PeInfo> peinfo;
58 float totalLoad = 0.0;
59 int firstProc = CkNodeFirst(node);
60 int lastProc = firstProc + CkNodeSize(node) - 1;
61 for (
int i = firstProc; i <= lastProc; i++) {
62 peinfo.push_back(
PeInfo(i, 0.0, 0));
65 std::make_heap(peinfo.begin(), peinfo.end(),
ProcLdGreater());
68 for(
int i = 0; i < events[XDIM].size(); i++){
69 Event &ev = events[XDIM][i];
86 int fromPE = (*from)[orb.
lbindex];
87 procload[fromPE] += ev.
load;
103 void orbPartition(vector<Event> *events, OrientedBox<float> &box,
int nprocs,
104 vector<OrbObject> & tp, BaseLB::LDStats *stats,
105 bool node_partition=
false){
107 ORB3DLB_NOTOPO_DEBUG((
"partition events %d %d %d nprocs %d\n",
113 int numEvents = events[XDIM].size();
114 CkAssert(numEvents == events[YDIM].size());
115 CkAssert(numEvents == events[ZDIM].size());
121 ORB3DLB_NOTOPO_DEBUG((
"base: assign %d tps to proc %d\n", numEvents,
nextProc));
122 if (!stats->procs[
nextProc].available) {
129 if (node_partition) {
130 orbPePartition(events, tp,
nextProc, stats);
134 float totalLoad = 0.0;
135 for(
int i = 0; i < events[XDIM].size(); i++){
136 Event &ev = events[XDIM][i];
140 totalLoad += ev.
load;
143 int fromPE = (*from)[orb.
lbindex];
144 if (fromPE < 0 || fromPE >= procload.size()) {
145 CkPrintf(
"[%d] trying to access fromPe %d nprocs %lu\n", CkMyPe(), fromPE, procload.size());
146 CkAbort(
"Trying to access a PE which is outside the range\n");
148 procload[fromPE] += ev.
load;
160 int longestDim = XDIM;
161 float longestDimLength = box.greater_corner[longestDim] - box.lesser_corner[longestDim];
162 for(
int i = YDIM; i <= ZDIM; i++){
163 float thisDimLength = box.greater_corner[i]-box.lesser_corner[i];
164 if(thisDimLength > longestDimLength){
165 longestDimLength = thisDimLength;
170 ORB3DLB_NOTOPO_DEBUG((
"dimensions %f %f %f longest %d\n",
171 box.greater_corner[XDIM]-box.lesser_corner[XDIM],
172 box.greater_corner[YDIM]-box.lesser_corner[YDIM],
173 box.greater_corner[ZDIM]-box.lesser_corner[ZDIM],
177 int nlprocs = nprocs/2;
178 int nrprocs = nprocs-nlprocs;
180 float ratio = (1.0*nlprocs)/(1.0*(nlprocs+nrprocs));
183 float bglprocs = 0.0;
185 bglprocs += stats->procs[np].bg_walltime;
186 float bgrprocs = 0.0;
188 bgrprocs += stats->procs[np].bg_walltime;
190 ORB3DLB_NOTOPO_DEBUG((
"nlprocs %d nrprocs %d ratio %f\n", nlprocs, nrprocs, ratio));
194 if(splitIndex == numEvents) {
195 ORB3DLB_NOTOPO_DEBUG((
"evenly split 0 load\n"));
196 splitIndex = splitIndex/2;
198 int nleft = splitIndex;
199 int nright = numEvents-nleft;
202 if(nright < nrprocs) {
204 nleft = splitIndex = numEvents-nright;
205 CkAssert(nleft >= nlprocs);
207 else if(nleft < nlprocs) {
208 nleft = splitIndex = nlprocs;
209 nright = numEvents-nleft;
210 CkAssert(nright >= nrprocs);
215 nleft = splitIndex = (int) (nlprocs*maxPieceProc);
216 nright = numEvents-nleft;
218 else if (nright > nrprocs*maxPieceProc) {
219 nright = (int) (nrprocs*maxPieceProc);
220 nleft = splitIndex = numEvents-nright;
222 CkAssert(splitIndex >= 0);
223 CkAssert(splitIndex < numEvents);
225 OrientedBox<float> leftBox;
226 OrientedBox<float> rightBox;
228 leftBox = rightBox = box;
229 float splitPosition = events[longestDim][splitIndex].position;
230 leftBox.greater_corner[longestDim] = splitPosition;
231 rightBox.lesser_corner[longestDim] = splitPosition;
234 for(
int i = 0; i < splitIndex; i++){
235 Event &ev = events[longestDim][i];
236 CkAssert(ev.
owner >= 0);
237 CkAssert(tp[ev.
owner].partition == INVALID_PARTITION);
238 tp[ev.
owner].partition = LEFT_PARTITION;
240 for(
int i = splitIndex; i < numEvents; i++){
241 Event &ev = events[longestDim][i];
242 CkAssert(ev.
owner >= 0);
243 CkAssert(tp[ev.
owner].partition == INVALID_PARTITION);
244 tp[ev.
owner].partition = RIGHT_PARTITION;
247 vector<Event> leftEvents[NDIMS];
248 vector<Event> rightEvents[NDIMS];
250 for(
int i = 0; i < NDIMS; i++){
252 leftEvents[i].resize(nleft);
253 rightEvents[i].resize(nright);
256 leftEvents[i].reserve(nleft);
257 rightEvents[i].reserve(nright);
261 memcpy(&leftEvents[longestDim][0],&events[longestDim][0],
sizeof(
Event)*nleft);
262 memcpy(&rightEvents[longestDim][0],&events[longestDim][splitIndex],
sizeof(
Event)*nright);
265 for(
int i = XDIM; i <= ZDIM; i++){
266 if(i == longestDim)
continue;
267 for(
int j = 0; j < numEvents; j++){
268 Event &ev = events[i][j];
269 CkAssert(ev.
owner >= 0);
271 CkAssert(orb.partition != INVALID_PARTITION);
272 if(orb.partition == LEFT_PARTITION) leftEvents[i].push_back(ev);
273 else if(orb.partition == RIGHT_PARTITION) rightEvents[i].push_back(ev);
281 vector<Event> &eraseVec = events[longestDim];
282 for(
int i = 0; i < numEvents; i++){
283 Event &ev = eraseVec[i];
284 CkAssert(ev.
owner >= 0);
286 CkAssert(orb.partition != INVALID_PARTITION);
287 orb.partition = INVALID_PARTITION;
294 for(
int i = 0; i < NDIMS; i++){
296 vector<Event>().swap(events[i]);
298 orbPartition(leftEvents,leftBox,nlprocs,tp, stats, node_partition);
299 orbPartition(rightEvents,rightBox,nrprocs,tp, stats, node_partition);
308 void orbPrepare(vector<Event> *tpEvents, OrientedBox<float> &box,
int
309 numobjs, BaseLB::LDStats * stats,
bool node_partition=
false){
311 int nmig = stats->n_migrateobjs;
317 if (node_partition) {
326 CkAssert(tpEvents[XDIM].size() == numobjs);
327 CkAssert(tpEvents[YDIM].size() == numobjs);
328 CkAssert(tpEvents[ZDIM].size() == numobjs);
330 mapping = &stats->to_proc;
331 from = &stats->from_proc;
333 CkPrintf(
"[Orb3dLB_notopo] sorting\n");
334 for(
int i = 0; i < NDIMS; i++){
335 sort(tpEvents[i].begin(),tpEvents[i].end());
338 box.lesser_corner.x = tpEvents[XDIM][0].position;
339 box.lesser_corner.y = tpEvents[YDIM][0].position;
340 box.lesser_corner.z = tpEvents[ZDIM][0].position;
342 box.greater_corner.x = tpEvents[XDIM][numobjs-1].position;
343 box.greater_corner.y = tpEvents[YDIM][numobjs-1].position;
344 box.greater_corner.z = tpEvents[ZDIM][numobjs-1].position;
348 procload.resize(stats->nprocs());
349 for(
int i = 0; i < stats->nprocs(); i++){
350 procload[i] = stats->procs[i].bg_walltime;
355 void refine(BaseLB::LDStats *stats,
int numobjs){
357 int *from_procs = Refiner::AllocProcs(stats->nprocs(), stats);
358 int *to_procs = Refiner::AllocProcs(stats->nprocs(), stats);
362 for(
int i = 0; i < numobjs; i++){
363 if(stats->to_proc[i] != stats->from_proc[i]) migr++;
365 int pe = stats->to_proc[i];
371 int numRefineMigrated = 0;
373 CkPrintf(
"[orb3dlb_notopo] refine\n");
374 Refiner refiner(1.050);
375 refiner.Refine(stats->nprocs(),stats,from_procs,to_procs);
377 for(
int i = 0; i < numobjs; i++){
378 if(to_procs[i] != from_procs[i]) numRefineMigrated++;
379 stats->to_proc[i] = to_procs[i];
383 double *predLoad =
new double[stats->nprocs()];
384 double *predCount =
new double[stats->nprocs()];
385 for(
int i = 0; i < stats->nprocs(); i++){
390 double maxObjLoad = 0.0;
392 for(
int i = 0; i < numobjs; i++){
393 double ld = stats->objData[i].wallTime;
394 int proc = stats->to_proc[i];
395 predLoad[proc] += ld;
396 predCount[proc] += 1.0;
401 double minWall = 0.0;
402 double maxWall = 0.0;
403 double avgWall = 0.0;
405 double minIdle = 0.0;
406 double maxIdle = 0.0;
407 double avgIdle = 0.0;
413 double avgPred = 0.0;
414 double minPred = 0.0;
415 double maxPred = 0.0;
417 double avgPiece = 0.0;
418 double minPiece = 0.0;
419 double maxPiece = 0.0;
421 CkPrintf(
"***************************\n");
423 for(
int i = 0; i < stats->nprocs(); i++){
424 double wallTime = stats->procs[i].total_walltime;
425 double idleTime = stats->procs[i].idletime;
426 double bgTime = stats->procs[i].bg_walltime;
427 double pred = predLoad[i];
428 double npiece = predCount[i];
445 if(i==0 || minWall > wallTime) minWall = wallTime;
446 if(i==0 || maxWall < wallTime) maxWall = wallTime;
448 if(i==0 || minIdle > idleTime) minIdle = idleTime;
449 if(i==0 || maxIdle < idleTime) maxIdle = idleTime;
451 if(i==0 || minBg > bgTime) minBg = bgTime;
452 if(i==0 || maxBg < bgTime) maxBg = bgTime;
454 if(i==0 || minPred > pred) minPred = pred;
455 if(i==0 || maxPred < pred) maxPred = pred;
457 if(i==0 || minPiece > npiece) minPiece = npiece;
458 if(i==0 || maxPiece < npiece) maxPiece = npiece;
462 avgWall /= stats->nprocs();
463 avgIdle /= stats->nprocs();
464 avgBg /= stats->nprocs();
465 avgPred /= stats->nprocs();
466 avgPiece /= stats->nprocs();
468 #ifdef PRINT_LOAD_PERCENTILES
470 vector<double> objectWallTimes;
471 for(
int i = 0; i < stats->nprocs(); i++){
472 double wallTime = stats->procs[i].total_walltime;
473 objectWallTimes.push_back(wallTime);
474 accumVar += (wallTime - avgWall) * (wallTime - avgWall);
476 double stdDev = sqrt(accumVar / stats->nprocs());
477 CkPrintf(
"Average load: %.3f\n", avgWall);
478 CkPrintf(
"Standard deviation: %.3f\n", stdDev);
480 std::sort(objectWallTimes.begin(), objectWallTimes.end());
481 CkPrintf(
"Object load percentiles: \n");
482 double increment = (double) objectWallTimes.size() / 10;
485 for (
int j = 0; j < 100; j += 10) {
487 CkPrintf(
"%d: %.3f\n", j, objectWallTimes[(
int) index]);
489 CkPrintf(
"100: %.3f\n", objectWallTimes.back());
496 float minload, maxload, avgload;
497 minload = maxload = procload[0];
499 for(
int i = 0; i < stats->nprocs(); i++){
500 CkPrintf(
"pe %d load %f box %f %f %f %f %f %f\n", i, procload[i],
501 procbox[i].lesser_corner.x,
502 procbox[i].lesser_corner.y,
503 procbox[i].lesser_corner.z,
504 procbox[i].greater_corner.x,
505 procbox[i].greater_corner.y,
506 procbox[i].greater_corner.z
508 avgload += procload[i];
509 if(minload > procload[i]) minload = procload[i];
510 if(maxload < procload[i]) maxload = procload[i];
513 avgload /= stats->nprocs();
515 CkPrintf(
"Orb3dLB_notopo stats: min %f max %f avg %f max/avg %f\n", minload, maxload, avgload, maxload/avgload);
519 CkPrintf(
"Orb3dLB_notopo stats: maxObjLoad %f\n", maxObjLoad);
520 CkPrintf(
"Orb3dLB_notopo stats: minWall %f maxWall %f avgWall %f maxWall/avgWall %f\n", minWall, maxWall, avgWall, maxWall/avgWall);
521 CkPrintf(
"Orb3dLB_notopo stats: minIdle %f maxIdle %f avgIdle %f minIdle/avgIdle %f\n", minIdle, maxIdle, avgIdle, minIdle/avgIdle);
522 CkPrintf(
"Orb3dLB_notopo stats: minPred %f maxPred %f avgPred %f maxPred/avgPred %f\n", minPred, maxPred, avgPred, maxPred/avgPred);
523 CkPrintf(
"Orb3dLB_notopo stats: minPiece %f maxPiece %f avgPiece %f maxPiece/avgPiece %f\n", minPiece, maxPiece, avgPiece, maxPiece/avgPiece);
525 CkPrintf(
"Orb3dLB_notopo stats: minBg %f maxBg %f avgBg %f maxBg/avgBg %f\n", minBg, maxBg, avgBg, maxBg/avgBg);
526 CkPrintf(
"Orb3dLB_notopo stats: orb migrated %d refine migrated %d objects\n", migr, numRefineMigrated);
530 Refiner::FreeProcs(from_procs);
531 Refiner::FreeProcs(to_procs);
546 float approxBgPerEvent = (bglp + bgrp) / events.size();
547 float totalLoad = bglp + bgrp;
548 for(
int i = 0; i < events.size(); i++){
549 totalLoad += events[i].load;
553 float perfectLoad = ratio * totalLoad;
554 ORB3DLB_NOTOPO_DEBUG((
"partitionRatioLoad bgl %f bgr %f\n",
557 float prevLoad = 0.0;
558 float leftLoadAtSplit = 0.0;
559 for(splitIndex = 0; splitIndex < events.size(); splitIndex++){
561 leftLoadAtSplit += events[splitIndex].load + approxBgPerEvent;
563 if (leftLoadAtSplit > perfectLoad) {
564 if ( fabs(leftLoadAtSplit - perfectLoad) < fabs(prevLoad - perfectLoad) ) {
568 leftLoadAtSplit = prevLoad;
572 prevLoad = leftLoadAtSplit;
575 ORB3DLB_NOTOPO_DEBUG((
"partitionEvenLoad mid %d lload %f rload %f ratio %f\n", splitIndex, leftLoadAtSplit, totalLoad - leftLoadAtSplit, leftLoadAtSplit / totalLoad));
void orbPartition(vector< Event > *events, OrientedBox< float > &box, int nprocs, vector< OrbObject > &tp, BaseLB::LDStats *stats, bool node_partition=false)
Recursively partition treepieces among processors by bisecting the load in orthogonal directions...
Definition: Orb3dLBCommon.h:103
double maxPieceProc
Definition: Orb3dLBCommon.h:48
Common methods among Orb3d class load balancers.
Definition: Orb3dLBCommon.h:38
int partitionRatioLoad(vector< Event > &events, float ratio, float bglp, float bgrp)
Given a vector of Events, find a split that partitions them into two partitions with a given ratio of...
Definition: Orb3dLBCommon.h:544
int numParticles
Particles in the TreePiece.
Definition: MapStructures.h:87
float load
load of this TreePiece
Definition: MapStructures.h:40
Utility class for sorting processor loads.
Definition: Orb3dLBCommon.h:29
void orbPrepare(vector< Event > *tpEvents, OrientedBox< float > &box, int numobjs, BaseLB::LDStats *stats, bool node_partition=false)
Prepare structures for the ORB partition.
Definition: Orb3dLBCommon.h:308
structure for sorting TreePieces in one dimension for load balancing.
Definition: MapStructures.h:36
data for Orb3D load balancing.
Definition: MapStructures.h:80
int lbindex
index into LB stats->objData
Definition: MapStructures.h:83
int owner
Index within TreePiece array or stats->objData array of this TreePiece.
Definition: MapStructures.h:38
Hold information about Pe load and number of objects.
Definition: Orb3dLBCommon.h:20
int nextProc
index of first processor of the group we are considering
Definition: Orb3dLBCommon.h:51
double dMaxBalance
Max piece imbalance for load balancing.
Definition: ParallelGravity.cpp:103