... | ... | @@ -39,62 +39,20 @@ Ausgabe: highmem ist True |
|
|
"level":"job"
|
|
|
"parameter": ["load_imbalance_ratio_threshold","load_imbalance_waste_threshold"] // <-- explicit list of used parameters? or can be gathered from code parsing?
|
|
|
"rule_terms":[
|
|
|
{"work_per_process": "gatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
|
|
|
{"work_per_process": "GatherFromAllRanks(SelectMetric(work,OverAll))"}, // select the work from all processes
|
|
|
{"top_performer": "max(work_per_process)"},
|
|
|
{"missed_work_per_process":"for_each(Process,top_performer - SelectMetric(work,OverAll))"},
|
|
|
{"missed_work": "sum(missed_work_per_process)"},
|
|
|
{"available_work": "number_of_available_compute_units * walltime"},
|
|
|
{"threshold_1": "missed_work / available_work"},
|
|
|
{"threshold_2": "missed_work - tolerance_wasted_work"},
|
|
|
{"load_imbalance_1": "threshold_1 > load_imbalance_ratio_threshold"},
|
|
|
{"load_imbalance_2": "threshold_2 > load_imbalance_waste_threshold"},
|
|
|
{"load_imbalance_altert": "or(load_imbalance_1, load_imbalance_2)"}
|
|
|
{"missed_work_per_process":"for_each(Process,$top_performer - SelectMetric(work,OverAll))"},
|
|
|
{"missed_work": "sum($missed_work_per_process)"},
|
|
|
{"available_work": "$number_of_available_compute_units * $walltime"},
|
|
|
{"threshold_1": "$missed_work / $available_work"},
|
|
|
{"threshold_2": "$missed_work - $tolerance_wasted_work"},
|
|
|
{"load_imbalance_1": "$threshold_1 > $load_imbalance_ratio_threshold"},
|
|
|
{"load_imbalance_2": "$threshold_2 > $load_imbalance_waste_threshold"},
|
|
|
{"load_imbalance_altert": "or($load_imbalance_1, $load_imbalance_2)"}
|
|
|
],
|
|
|
"output":"load_imbalance_altert"
|
|
|
"output_perc":"load_perc"
|
|
|
"template":"This job was detected as lowload because the load %{lowload}"
|
|
|
"output_metric":"if($load_imbalance_1;Load Imbalance Ratio ${threshold_1}%)if($load_imbalance_2;Load Imbalance of $threshold_2 cpuh)"
|
|
|
"template":"This job was detected as a possible load imbalance issue."
|
|
|
}
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
```
|
|
|
// separately defined input parameters
|
|
|
{
|
|
|
"load_threshold_factor":0.9,
|
|
|
}
|
|
|
|
|
|
// rule on node level
|
|
|
{
|
|
|
"name":"Low CPU load",
|
|
|
"type":"bool",
|
|
|
"tag":"lowload",
|
|
|
"level":"node" // -> rule evaluated for each node, cpu_load -> values for one node
|
|
|
"parameter": ["load_threshold_factor"]
|
|
|
"rule_terms":[ // -> Array of objects to keep order of terms (in contrast to Object with all terms)
|
|
|
{"load_mean": "cpu_load.mean('all')"},
|
|
|
{"load_threshold": "cores_per_node * load_threshold_factor"},
|
|
|
{"lowload": "load_mean < load_threshold"},
|
|
|
{"load_perc": "load_mean / load_max"}
|
|
|
],
|
|
|
"output":"lowload"
|
|
|
"output_perc":"load_perc"
|
|
|
"template":"This job was detected as lowload because the load %{lowload}"
|
|
|
},
|
|
|
// rule on job level, for node exclusive jobs only
|
|
|
{
|
|
|
"name":"Low CPU load",
|
|
|
"type":"bool",
|
|
|
"tag":"lowload",
|
|
|
"level":"job"
|
|
|
"parameter": ["load_threshold_factor"] // <-- explicit list of used parameters? or can be gathered from code parsing?
|
|
|
"rule_terms":[
|
|
|
{"load_mean": "cpu_load.mean('all')"},
|
|
|
{"load_threshold": "cores_per_node * load_threshold_factor"},
|
|
|
{"lowload": "or(load_mean < load_threshold)"}, // <-- operator overloading for matrices, vectors
|
|
|
{"load_perc": "load_mean / load_max"}
|
|
|
],
|
|
|
"output":"lowload"
|
|
|
"output_perc":"load_perc"
|
|
|
"template":"This job was detected as lowload because the load %{lowload}"
|
|
|
}, |
|
|
``` |
|
|
\ No newline at end of file |