-
Description: different loads on separate execution entities; this indicates
- badly configured application, e.g. CPU internal structures were not taken into account
- badly distributed workload on parallel entities
- singular work, e.g. I/O or other hardware specific operations
-
Criterion:
- the minimum, median and maximum of a specific metric are not within a specific distance from each other. Metrics can be runtime, CPU time or other resource-specific rates
- parallel execution entities are synchronized at some point in the computation
-
Possible false positives: none
-
Possible cures/workarounds:
- implementation of dynamic work balancing
- using equivalent resources
- removing synchronization
Load Imbalance #1
// Rule on job level.
// Flags a job when the work "missed" relative to its busiest process is too
// large, either as a share of the job's available compute time (ratio check)
// or as an absolute amount above a tolerance (waste check).
{
"name":"Load Imbalance",
"type":"bool",
"tag":"loadimba",
"level":"job",
"metric(SLURM:CPUh)":"work", // alias for work
"metric(numberOfProcesses)":"number_of_available_compute_units",
"metric(jobWallTime)":"walltime",
// NOTE: "tollerance_wasted_work" is listed because threshold_2 reads it; the
// spelling is kept as-is so existing parameter sets keep matching.
"parameter": ["load_imbalance_ratio_threshold","load_imbalance_waste_threshold","tollerance_wasted_work"], // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
{"top_performer": "max($work_per_process)"}, // largest amount of work done by any single process
{"missed_work_per_process":"for_each(Process,$top_performer - SelectMetric(work,OverAll))"}, // gap of each process to the top performer
{"missed_work": "sum($missed_work_per_process)"},
{"available_work": "$number_of_available_compute_units * $walltime"}, // NOTE(review): assumes walltime uses the same time unit as work (CPUh) - confirm
{"threshold_1": "$missed_work / $available_work"}, // missed work as a fraction of available work
{"threshold_2": "$missed_work - $tollerance_wasted_work"}, // missed work beyond the tolerated amount
{"load_imbalance_1": "$threshold_1 > $load_imbalance_ratio_threshold"},
{"load_imbalance_2": "$threshold_2 > $load_imbalance_waste_threshold"},
{"load_imbalance_alert": "or($load_imbalance_1, $load_imbalance_2)"} // alert if either check trips
],
"output":"load_imbalance_alert",
// NOTE(review): threshold_1 is a plain ratio but is printed with a '%' suffix - confirm the intended scaling.
"output_metric":"if($load_imbalance_1;Load Imbalance Ratio ${threshold_1}%)if($load_imbalance_2;Load Imbalance of $threshold_2 cpuh)",
"template":"This job was detected as a possible load imbalance issue."
}
Load Imbalance #2
// Rule on job level.
// Flags a job when at least one rank's work lies above the median work per
// rank by more than the configured relative threshold, and reports the
// offending ranks.
{
"name":"Load Imbalance",
"type":"bool",
"tag":"loadimba",
"level":"job",
"metric(SLURM:CPUh)":"work", // alias for work
"metric(numberOfProcesses)":"number_of_available_compute_units",
"metric(jobWallTime)":"walltime",
"parameter": ["load_imbalance_ratio_threshold"], // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
{"median": "median($work_per_process)"},
{"upper_quartile": "upper_quartile($work_per_process)"}, // not referenced by any later term in this rule
{"lower_quartile": "lower_quartile($work_per_process)"}, // not referenced by any later term in this rule
// (rank, relative deviation of that rank's work from the median) for every process.
// NOTE(review): divides by $median - behaviour for a median of 0 is not defined here.
{"ratio": "for_each(Process,make_tupel($RANK,(SelectMetric(work,OverAll)-$median)/$median))"},
// Keep only the tuples whose deviation exceeds the threshold (positive deviations only).
{"violations": "if($ratio.second>$load_imbalance_ratio_threshold;$ratio)"},
{"load_imbalance": "cardinality($violations)>0"}
],
"output":"load_imbalance",
// Report the violating ranks, not every rank.
"output_metric":"$violations.first",
"template":"This job was detected as a possible load imbalance issue. Ranks $violations.first exceed imbalance ratio of $load_imbalance_ratio_threshold"
}