-
Description: different loads on separate execution entities, which indicates
- badly configured application, e.g. CPU internal structures were not taken into account
- badly distributed workload on parallel entities
- singular work, e.g. I/O or other hardware specific operations
-
Criterion:
- the minimum, median and maximum of a specific metric are not within a specific distance of each other. Metrics can be runtime, CPU time or other resource-specific rates
- parallel execution entities are synchronized at some point in the computation
-
Possible false positives: none
-
Possible cures/workarounds:
- implementation of dynamic work balancing
- using equivalent resources
- removing synchronization
Load Imbalance #1
// rule on job level
{
"name":"Load Imbalance",
"type":"bool",
"tag":"loadimba",
"level":"job",
"metric(numberOfProcesses)":"number_of_available_compute_units",
"metric(jobWallTime)":"walltime",
"parameter": ["load_imbalance_ratio_threshold","load_imbalance_waste_threshold"], // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
{"top_performer": "max(work_per_process)"},
{"missed_work_per_process":"for_each(Process,top_performer - SelectMetric(work,OverAll))"},
{"missed_work": "sum(missed_work_per_process)"},
{"available_work": "$number_of_available_compute_units * walltime"},
{"threshold_1": "missed_work / available_work"},
{"threshold_2": "missed_work - tolerance_wasted_work"},
{"load_imbalance_1": "threshold_1 > load_imbalance_ratio_threshold"},
{"load_imbalance_2": "threshold_2 > load_imbalance_waste_threshold"},
{"load_imbalance_alert": "or(load_imbalance_1, load_imbalance_2)"}
],
"output":"load_imbalance_alert",
"output_metric":"if(load_imbalance_1;Load Imbalance Ratio {threshold_1}%)if(load_imbalance_2;Load Imbalance of $threshold_2 cpuh)",
"template":"This job was detected as a possible load imbalance issue."
}
Load Imbalance #2
// rule on job level
{
"name":"Load Imbalance",
"type":"bool",
"tag":"loadimba",
"level":"job",
"metric(numberOfProcesses)":"number_of_available_compute_units",
"metric(jobWallTime)":"walltime",
"parameter": ["load_imbalance_ratio_threshold"], // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
{"median": "median($work_per_process)"},
{"upper_quartile": "upper_quartile($work_per_process)"},
{"lower_quartile": "lower_quartile($work_per_process)"},
{"ratio": "for_each(Process,make_tuple($RANK,(SelectMetric(work,OverAll)-$median)/$median))"},
{"violations": "if($ratio.second>$load_imbalance_ratio_threshold;$ratio)"},
{"load_imbalance": "cardinality($violations)>0"}
],
"output":"load_imbalance",
"output_metric":"$ratio.first",
"template":"This job was detected as a possible load imbalance issue. Ranks $ratio.first exceed imbalance ratio of $load_imbalance_ratio_threshold"
}