... | ... | @@ -16,27 +16,14 @@ |
|
|
|
|
|
|
|
|
Load Imbalance #1
|
|
|
```
|
|
|
Eingabe:
|
|
|
* Metric cpuh per process alias[work]
|
|
|
* Parameter threshold-factor
|
|
|
|
|
|
Rule:
|
|
|
median = cpuh_pp.median('all')
|
|
|
cpuh_pp_q1 = cpuh_pp.Q1('all')
|
|
|
cpuh_pp_q4 = cpuh_pp.Q4('all')
|
|
|
|
|
|
highmem_nodes = memory_used > mem_threshold
|
|
|
highmem = highmem_nodes.any('all')
|
|
|
|
|
|
Ausgabe: highmem ist True
|
|
|
|
|
|
// rule on job level
|
|
|
{
|
|
|
"name":"Low CPU load",
|
|
|
"name":"Load Imbalance",
|
|
|
"type":"bool",
|
|
|
"tag":"lowload",
|
|
|
"level":"job"
|
|
|
"tag":"loadimba",
|
|
|
"level":"job",
|
|
|
"metric(numberOfProcesses)":"number_of_available_compute_units",
|
|
|
"metric(jobWallTime)":"walltime",
|
|
|
"parameter": ["load_imbalance_ratio_threshold","load_imbalance_waste_threshold"], // <-- explicit list of used parameters? or can it be gathered by parsing the rule terms?
|
|
|
"rule_terms":[
|
|
|
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
|
... | ... | @@ -55,4 +42,30 @@ Ausgabe: highmem ist True |
|
|
"template":"This job was detected as a possible load imbalance issue."
|
|
|
}
|
|
|
|
|
|
```
|
|
|
|
|
|
Load Imbalance #2
|
|
|
```
|
|
|
// rule on job level
|
|
|
{
|
|
|
"name":"Load Imbalance",
|
|
|
"type":"bool",
|
|
|
"tag":"loadimba",
|
|
|
"level":"job",
|
|
|
"metric(numberOfProcesses)":"number_of_available_compute_units",
|
|
|
"metric(jobWallTime)":"walltime",
|
|
|
"parameter": ["load_imbalance_ratio_threshold"], // <-- explicit list of used parameters? or can it be gathered by parsing the rule terms?
|
|
|
"rule_terms":[
|
|
|
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
|
|
|
{"median": "median($work_per_process)"},
|
|
|
{"upper_quartile": "upper_quartile($work_per_process)"},
|
|
|
{"lower_quartile": "lower_quartile($work_per_process)"},
|
|
|
{"ratio": "for_each(Process,make_tuple($RANK,(SelectMetric(work,OverAll)-$median)/$median))"},
|
|
|
{"violations": "if($ratio.second>$load_imbalance_ratio_threshold;$ratio)"},
|
|
|
{"load_imbalance": "cardinality($violations)>0"}
|
|
|
],
|
|
|
"output":"load_imbalance_alert",
|
|
|
"output_metric":"$ratio.first",
|
|
|
"template":"This job was detected as a possible load imbalance issue. Ranks $ratio.first exceed imbalance ratio of $load_imbalance_ratio_threshold"
|
|
|
}
|
|
|
|
|
|
``` |
|
|
\ No newline at end of file |