Update Load Imbalance Pattern authored by Iwainsky, Christian's avatar Iwainsky, Christian
......@@ -13,3 +13,88 @@
* implementation of dynamic work balancing,
* using equivalent resources
* removing synchronization
Load Imbalance #1
```
Input:
* Metric cpuh per process alias[work]
* Parameter threshold-factor
Rule:
median = cpuh_pp.median('all')
cpuh_pp_q1 = cpuh_pp.Q1('all')
cpuh_pp_q4 = cpuh_pp.Q4('all')
highmem_nodes = memory_used > mem_threshold
highmem = highmem_nodes.any('all')
Output: highmem is True
// rule on job level
{
"name":"Low CPU load",
"type":"bool",
"tag":"lowload",
"level":"job"
"parameter": ["load_imbalance_ratio_threshold","load_imbalance_waste_threshold"] // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"work_per_process": "gatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
{"top_performer": "max(work_per_process)"},
{"missed_work_per_prcess":"for_each(Process,top_performer - SelectMetric(work,OverAll))"},
{"missed_work": "sum(missed_work_per_process)"},
{"available_work": "number_of_available_compute_units * walltime"},
{"threshold_1": "missed_work / available_work"},
{"threshold_2": "missed_work - tollerance_wasted_work"},
{"load_imbalance_1": "threshold_1 > load_imbalance_ratio_threshold"},
{"load_imbalance_2": "threshold_2 > load_imbalance_waste_threshold"},
{"load_imbalance_altert": "or(load_imbalance_1, load_imbalance_2)"}
],
"output":"load_imbalance_altert"
"output_perc":"load_perc"
"template":"This job was detected as lowload because the load %{lowload}"
}
```
```
// separately defined input parameters
{
"load_threshold_factor":0.9,
}
// rule on node level
{
"name":"Low CPU load",
"type":"bool",
"tag":"lowload",
"level":"node" // -> rule evaluated for each node, cpu_load -> values for one node
"parameter": ["load_threshold"]
"rule_terms":[ // -> Array of objects to keep order of terms (in contrast to Object with all terms)
{"load_mean": "cpu_load.mean('all')"},
{"load_threshold": "cores_per_node * load_threshold_factor"},
{"lowload": "load_mean < load_threshold"},
{"load_perc": "load_mean / load_max"}
],
"output":"lowload"
"output_perc":"load_perc"
"template":"This job was detected as lowload because the load %{lowload}"
},
// rule on job level, for node exclusive jobs only
{
"name":"Low CPU load",
"type":"bool",
"tag":"lowload",
"level":"job"
"parameter": ["load_threshold"] // <-- explicit list of used parameters? or can be gathered from code parsing?
"rule_terms":[
{"load_mean": "cpu_load.mean('all')"},
{"load_threshold": "cores_per_node * load_threshold_factor"},
{"lowload": "or(load_mean < load_threshold)"}, // <-- operator overloading for matrices, vectors
{"load_perc": "load_mean / load_max"}
],
"output":"lowload"
"output_perc":"load_perc"
"template":"This job was detected as lowload because the load %{lowload}"
},