diff --git a/MBB.py b/MBB.py
index a25cad7d711f522744ac5bdd9aed2a391258fb3f..41798de7a36382634bc44564717fc102d2711220 100755
--- a/MBB.py
+++ b/MBB.py
@@ -561,26 +561,32 @@ iframe {
 
     os.chdir(here)
 
+
-# expects a df with at least ["ERROR_EXPECTED","any_error_reported","TP","category"]
-# classifies as FN,FP,TN,...
+# expects a df with at least ["test_id", "ERROR_EXPECTED", "any_error_reported", "category", "CE", "RE", "TP"]
+# classifies each test as TN, FN, or FP (TP, CE, RE must already be set)
 def classify_tests(df_in):
     df = df_in[["test_id", "ERROR_EXPECTED", "any_error_reported", "category", "CE", "RE", "TP"]].copy()
-    
-    df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & (df["RE"] == False)
-    df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & (df["RE"] == False)
+
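+    # CE and RE take precedence: a test counts as TN, FN, or FP only if it
+    # neither failed to compile (CE) nor hit a runtime failure (RE).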
+    df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
+                df["RE"] == False)
+    df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
+                df["RE"] == False)
     df["FP"] = (((df["ERROR_EXPECTED"] == False) & df["any_error_reported"]) |  # a true false positive
                 # or a case where a not-helpful report is produced
-                ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & (df["CE"] == False) & (df["RE"] == False)
+                ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & (
+                           df["CE"] == False) & (df["RE"] == False)
 
     # so that this information is available per category
     df["ERROR_NOT_EXPECTED"] = (df["ERROR_EXPECTED"] == False)
 
-    # every case is exactely one of this
+    # every case is exactly one of these
-    assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len(df)
+    assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len(
+        df)
     assert df["ERROR_EXPECTED"].sum() + df["ERROR_NOT_EXPECTED"].sum() == len(df)
 
     return df
 
+
 # aggregate metrics and calculate precision recall F1 based on this
 def aggregate_metrics_per_category(df_in):
     total_tests = len(df_in)
@@ -595,7 +601,10 @@ def aggregate_metrics_per_category(df_in):
     df["conclusiveness"] = 1 - ((df["CE"] + df["RE"]) / total_tests)
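+    # F1 = 2*TP / (2*TP + FP + FN)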
-    df["f1"] = (df["TP"] + df["TP"]) / (df["TP"] + df["TP"] + df["FP"] + df["FN"])
+    df["f1"] = (2 * df["TP"]) / (2 * df["TP"] + df["FP"] + df["FN"])
 
-    return df[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]]
+    return df[
+        ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1",
+         "overallaccuracy"]]
+
 
 def read_tool_reports(rootdir, toolname):
-    if not toolname in tools:
+    if toolname not in tools:
@@ -604,7 +613,7 @@ def read_tool_reports(rootdir, toolname):
     if not os.path.exists(f'{args.logs_dir}/{toolname}'):
-        raise Exception(f"Not found Logs for {toolname}.")
+        raise Exception(f"No logs found for {toolname}.")
 
-    results=[]
+    results = []
 
     for test in todo:
         binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
@@ -614,8 +623,8 @@ def read_tool_reports(rootdir, toolname):
         expected = test['expect']
 
         resulting_categorization = categorize(tool=tools[toolname], toolname=toolname,
-                                                  test=test, test_id=test_id, logs_dir=args.logs_dir,
-                                                  )
+                                              test=test, test_id=test_id, logs_dir=args.logs_dir,
+                                              )
         resulting_categorization["test_id"] = test_id
         resulting_categorization["category"] = test["category"]
         results.append(resulting_categorization)
@@ -623,18 +632,23 @@ def read_tool_reports(rootdir, toolname):
     df = pd.DataFrame(results)
 
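+    # TP_* variants grade a report at increasing scrutiny: base = any error
+    # reported, class = correct error class, line = correct source line,
+    # class_line = both, and the *_no_*_noise variants additionally require
+    # the report to be free of noisy class/line information.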
     df["TP_base"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & (df["CE"] == False) & (df["RE"] == False)
-    df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & (df["CE"] == False) & (df["RE"] == False)
-    df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False)
+    df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & (
+                df["CE"] == False) & (df["RE"] == False)
+    df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & (
+                df["CE"] == False) & (df["RE"] == False)
     df["TP_class_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & df[
         "correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False)
     df["TP_class_line_no_class_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
-        "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & (df["CE"] == False) & (df["RE"] == False)
+        "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & (
+                                                     df["CE"] == False) & (df["RE"] == False)
     df["TP_class_line_no_line_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
-        "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & (df["RE"] == False)
+        "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & (
+                                                    df["RE"] == False)
 
     return df
 
-def cmd_csv(rootdir, toolnames,print_to_console=False):
+
+def cmd_csv(rootdir, toolnames, print_to_console=False):
     here = os.getcwd()
     os.chdir(rootdir)
     outpath = f'{rootdir}/csv/'
@@ -654,7 +668,8 @@ def cmd_csv(rootdir, toolnames,print_to_console=False):
             print(f"=== {toolname} ===")
 
         # Output for each type of TP
-        for (colname) in ["base", "class", "line", "class_line", "class_line_no_line_noise", "class_line_no_line_noise", "class_line_no_class_noise"]:
+        for colname in ["base", "class", "line", "class_line", "class_line_no_line_noise",
+                        "class_line_no_class_noise"]:
             df["TP"] = df[f"TP_{colname}"]
             df_classified = classify_tests(df)
             df_classified.to_csv(f'{outpath}/{toolname}_{colname}_full.csv', index=False)
@@ -662,8 +677,12 @@ def cmd_csv(rootdir, toolnames,print_to_console=False):
             df_result.to_csv(f'{outpath}/{toolname}_{colname}.csv', index=True)
             if print_to_console:
                 print(f"\n{colname}:")
-                print(df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]])
-                df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(f'{outpath}/{toolname}_{colname}.tex')
+                print(df_result[
+                          ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
+                           "precision", "f1", "overallaccuracy"]])
+                df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
+                           "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(
+                    f'{outpath}/{toolname}_{colname}.tex')
 
         df_noise_per_tool = df.groupby("category").sum()
         df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
@@ -672,7 +691,7 @@ def cmd_csv(rootdir, toolnames,print_to_console=False):
         if print_to_console:
             print("overall_noise")
             print(df_noise_per_tool["noise_ratio"])
-        df_overall_noise_ratio[toolname] =df_noise_per_tool["noise_ratio"]
+        df_overall_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
 
         df_copy = df.copy()
         df_copy.loc[df_copy['ERROR_EXPECTED'] == False, ['num_noise_class_line', 'num_error_reports']] = 0
@@ -727,7 +746,7 @@ def plot_helpfulness(df, outpath, toolname):
         mpatches.Patch(color=colors[3], label='not helpful report')
     ]
     ax.legend(handles=handles, ncol=2, loc='center left', bbox_to_anchor=(0.05, -0.3))
-    #ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports")
+    # ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports")
     ax.set_xlabel("Percentage of error reports")
     ax.set_ylabel("MPI feature")
     plt.tight_layout()
@@ -1475,6 +1494,77 @@ def cmd_latex(rootdir, toolnames):
     os.chdir(here)
 
 
+def get_overview_plot(data, outpath, scrutiny="base"):
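+    """Plot a 2x2 overview (P2P, Collective, RMA, Total) of stacked
+    TP/TN/FP/FN/RE/CE counts, one bar per tool, saved as overview_per_cat.pdf.
+
+    data maps toolname -> raw report DataFrame from read_tool_reports();
+    scrutiny selects which TP_* column is used for classification.
+    """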
+    assert len(data) > 0
+    # get the column names = all the metrics that are calculated
+    df_first_tool = next(iter(data.values()))
+    df_first_tool["TP"] = df_first_tool[f"TP_{scrutiny}"]
+    df_first_tool = classify_tests(df_first_tool)
+    df_first_tool = aggregate_metrics_per_category(df_first_tool)
+    cols = df_first_tool.columns
+
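+    # one summary row per tool for each MPI feature category, plus the total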
+    df_coll = pd.DataFrame(columns=cols)
+    df_rma = pd.DataFrame(columns=cols)
+    df_p2p = pd.DataFrame(columns=cols)
+    df_total = pd.DataFrame(columns=cols)
+
+    for toolname, df in data.items():
+        df["TP"] = df[f"TP_{scrutiny}"]
+        df = classify_tests(df)
+        df = aggregate_metrics_per_category(df)
+        df_coll.loc[toolname] = df.loc["COLL"]
+        df_rma.loc[toolname] = df.loc["RMA"]
+        df_p2p.loc[toolname] = df.loc["P2P"]
+        df_total.loc[toolname] = df.loc["ALL"]
+
+    SMALL_SIZE = 20
+    MEDIUM_SIZE = 22
+    BIGGER_SIZE = 24
+
+    plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
+    plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
+    plt.rc('axes', labelsize=SMALL_SIZE)  # fontsize of the x and y labels
+    plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
+    plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
+    plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
+    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
+
+    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
+
+    # colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb']
+    colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000']
+
+    ((ax1, ax2), (ax3, ax4)) = axs
+    df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False, color=colors)
+    ax1.set_title('P2P')
+    handles, labels = ax1.get_legend_handles_labels()
+
+    df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors)
+    ax2.set_title('Collective')
+    ax2.yaxis.tick_right()
+
+    df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors)
+    ax3.set_title('RMA')
+
+    df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors)
+    ax4.set_title('Total')
+    ax4.yaxis.tick_right()
+
+    for ax in [ax1, ax2, ax3, ax4]:
+        ax.set_ylabel('')
+        # Set the y-axis labels to uppercase
+        ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()])
+
+    fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05))
+
+    plt.tight_layout()
+
+    plt.savefig(os.path.join(outpath, "overview_per_cat.pdf"), bbox_inches="tight")
+
+
 ########################
 # cmd_plots(): what to do when '-c plots' is used (extract the statistics of this tool)
 ########################
@@ -1485,10 +1575,13 @@ def cmd_plots(rootdir, toolnames, ext="pdf"):
     os.makedirs('plots', exist_ok=True)
     outpath = f'{rootdir}/plots/'
 
+    collected_data = {}
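+    # keep each tool's reports so the combined overview figure can be drawn below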
     for toolname in toolnames:
         df = read_tool_reports(rootdir, toolname)
-        plot_helpfulness(df,outpath,toolname)
+        plot_helpfulness(df, outpath, toolname)
+        collected_data[toolname] = df
 
+    get_overview_plot(collected_data, outpath)
 
     os.chdir(here)
 
@@ -1600,7 +1693,7 @@ elif args.c == 'plots':
     if not plots_loaded:
-        print("[MBB] Error: Dependancies ('numpy' or 'matplotlib') are not available!")
+        print("[MBB] Error: Dependencies ('numpy' or 'matplotlib') are not available!")
         exit(-1)
-    extract_all_todo_from_logdir(args.x, args.logs_dir)
+    extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
     if args.x == 'mpirun':
         # toolnames=['itac', 'simgrid', 'must', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker']
         toolnames = ['itac', 'must', 'parcoach']
diff --git a/scripts/result_plot.py b/scripts/result_plot.py
deleted file mode 100644
index b62b5ae835dba524d862eecc5ef9e8001285faf3..0000000000000000000000000000000000000000
--- a/scripts/result_plot.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# script to generate the evaluation results plot shown in the paper
-
-import pandas as pd
-import os
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-sns.set_theme()
-sns.set_style("whitegrid")
-
-# input path
-input_path = "/home/tim/mpi-bugbench/logs/mpi-bugbench-results/logs-20240723-151721/csv"
-# output path
-plot_path = "/home/tim/paper/2024_eurompi_mpi-bugbench-paper/media"
-
-df_itac = pd.read_csv(os.path.join(input_path, "itac_base.csv"), index_col=0)
-df_parcoach = pd.read_csv(os.path.join(input_path, "parcoach_base.csv"), index_col=0)
-df_must = pd.read_csv(os.path.join(input_path, "must_base.csv"), index_col=0)
-
-df_coll = pd.DataFrame(columns=df_itac.columns)
-df_coll.loc["MUST"] = df_must.loc["COLL"]
-df_coll.loc["ITAC"] = df_itac.loc["COLL"]
-df_coll.loc["PARCOACH"] = df_parcoach.loc["COLL"]
-
-df_p2p = pd.DataFrame(columns=df_itac.columns)
-df_p2p.loc["MUST"] = df_must.loc["P2P"]
-df_p2p.loc["ITAC"] = df_itac.loc["P2P"]
-df_p2p.loc["PARCOACH"] = df_parcoach.loc["P2P"]
-
-df_rma = pd.DataFrame(columns=df_itac.columns)
-df_rma.loc["MUST"] = df_must.loc["RMA"]
-df_rma.loc["ITAC"] = df_itac.loc["RMA"]
-df_rma.loc["PARCOACH"] = df_parcoach.loc["RMA"]
-
-df_total = pd.DataFrame(columns=df_itac.columns)
-df_total.loc["MUST"] = df_must.loc["ALL"]
-df_total.loc["ITAC"] = df_itac.loc["ALL"]
-df_total.loc["PARCOACH"] = df_parcoach.loc["ALL"]
-
-SMALL_SIZE = 20
-MEDIUM_SIZE = 22
-BIGGER_SIZE = 24
-
-plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
-plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
-plt.rc('axes', labelsize=SMALL_SIZE)  # fontsize of the x and y labels
-plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
-plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
-plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
-
-fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))  #
-
-#colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb']
-colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000']
-colors_2 = {'TP': '#117733', 'TN': '#88CCEE', 'FP': '#CC6677', 'FN': '#AA4499', 'RE': '#DDCC77', 'CE': '#DDDDDD'}
-
-hatches= ["","",".",".","",""]
-
-((ax1, ax2), (ax3, ax4)) = axs
-df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False,color=colors)
-ax1.set_title('P2P')
-handles, labels = ax1.get_legend_handles_labels()
-
-df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors)
-ax2.set_title('Collective')
-ax2.yaxis.tick_right()
-# Set the y-axis labels to uppercase
-ax2.set_yticklabels([label.get_text().upper() for label in ax2.get_yticklabels()])
-
-df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors)
-ax3.set_title('RMA')
-
-df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors)
-ax4.set_title('Total')
-ax4.yaxis.tick_right()
-
-for ax in [ax1, ax2, ax3, ax4]:
-    ax.set_ylabel('')
-    # Set the y-axis labels to uppercase
-    ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()])
-
-fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05), )
-
-plt.tight_layout()
-
-plt.savefig(os.path.join(plot_path, "results_per_cat.pdf"), bbox_inches="tight")