diff --git a/MBB.py b/MBB.py index a25cad7d711f522744ac5bdd9aed2a391258fb3f..41798de7a36382634bc44564717fc102d2711220 100755 --- a/MBB.py +++ b/MBB.py @@ -561,26 +561,32 @@ iframe { os.chdir(here) + # expects a df with at least ["ERROR_EXPECTED","any_error_reported","TP","category"] # classifies as FN,FP,TN,... def classify_tests(df_in): df = df_in[["test_id", "ERROR_EXPECTED", "any_error_reported", "category", "CE", "RE", "TP"]].copy() - - df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & (df["RE"] == False) - df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & (df["RE"] == False) + + df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & ( + df["RE"] == False) + df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & ( + df["RE"] == False) df["FP"] = (((df["ERROR_EXPECTED"] == False) & df["any_error_reported"]) | # a true false positive # or a case where a not-helpful report is produced - ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & (df["CE"] == False) & (df["RE"] == False) + ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & ( + df["CE"] == False) & (df["RE"] == False) # so that this information is available per category df["ERROR_NOT_EXPECTED"] = (df["ERROR_EXPECTED"] == False) # every case is exactely one of this - assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len(df) + assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len( + df) assert df["ERROR_EXPECTED"].sum() + df["ERROR_NOT_EXPECTED"].sum() == len(df) return df + # aggregate metrics and calculate precision recall F1 based on this def aggregate_metrics_per_category(df_in): total_tests = len(df_in) @@ -595,7 +601,10 @@ def aggregate_metrics_per_category(df_in): df["conclusiveness"] = 1 - ((df["CE"] + df["RE"]) / total_tests) df["f1"] = (df["TP"] + df["TP"]) / (df["TP"] + df["TP"] + df["FP"] + df["FN"]) - return df[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]] + return df[ + ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", + "overallaccuracy"]] + def read_tool_reports(rootdir, toolname): if not toolname in tools: @@ -604,7 +613,7 @@ def read_tool_reports(rootdir, toolname): if not os.path.exists(f'{args.logs_dir}/{toolname}'): raise Exception(f"Not found Logs for {toolname}.") - results=[] + results = [] for test in todo: binary = re.sub(r'\.c', '', os.path.basename(test['filename'])) @@ -614,8 +623,8 @@ def read_tool_reports(rootdir, toolname): expected = test['expect'] resulting_categorization = categorize(tool=tools[toolname], toolname=toolname, - test=test, test_id=test_id, logs_dir=args.logs_dir, - ) + test=test, test_id=test_id, logs_dir=args.logs_dir, + ) resulting_categorization["test_id"] = test_id resulting_categorization["category"] = test["category"] results.append(resulting_categorization) @@ -623,18 +632,23 @@ def read_tool_reports(rootdir, toolname): df = pd.DataFrame(results) df["TP_base"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & (df["CE"] == False) & (df["RE"] == False) - df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & (df["CE"] == False) & (df["RE"] == False) - df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False) + df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & ( + df["CE"] == False) & (df["RE"] == False) + df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & ( + df["CE"] == False) & (df["RE"] == False) df["TP_class_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & df[ "correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False) df["TP_class_line_no_class_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[ - "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & (df["CE"] == False) & (df["RE"] == False) + "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & ( + df["CE"] == False) & (df["RE"] == False) df["TP_class_line_no_line_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[ - "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & (df["RE"] == False) + "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & ( + df["RE"] == False) return df -def cmd_csv(rootdir, toolnames,print_to_console=False): + +def cmd_csv(rootdir, toolnames, print_to_console=False): here = os.getcwd() os.chdir(rootdir) outpath = f'{rootdir}/csv/' @@ -654,7 +668,8 @@ def cmd_csv(rootdir, toolnames,print_to_console=False): print(f"=== {toolname} ===") # Output for each type of TP - for (colname) in ["base", "class", "line", "class_line", "class_line_no_line_noise", "class_line_no_line_noise", "class_line_no_class_noise"]: + for (colname) in ["base", "class", "line", "class_line", "class_line_no_line_noise", "class_line_no_line_noise", + "class_line_no_class_noise"]: df["TP"] = df[f"TP_{colname}"] df_classified = classify_tests(df) df_classified.to_csv(f'{outpath}/{toolname}_{colname}_full.csv', index=False) @@ -662,8 +677,12 @@ def cmd_csv(rootdir, toolnames,print_to_console=False): df_result.to_csv(f'{outpath}/{toolname}_{colname}.csv', index=True) if print_to_console: print(f"\n{colname}:") - print(df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]]) - df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(f'{outpath}/{toolname}_{colname}.tex') + print(df_result[ + ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", + "precision", "f1", "overallaccuracy"]]) + df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", + "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex( + f'{outpath}/{toolname}_{colname}.tex') df_noise_per_tool = df.groupby("category").sum() df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0) @@ -672,7 +691,7 @@ def cmd_csv(rootdir, toolnames,print_to_console=False): if print_to_console: print("overall_noise") print(df_noise_per_tool["noise_ratio"]) - df_overall_noise_ratio[toolname] =df_noise_per_tool["noise_ratio"] + df_overall_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"] df_copy = df.copy() df_copy.loc[df_copy['ERROR_EXPECTED'] == False, ['num_noise_class_line', 'num_error_reports']] = 0 @@ -727,7 +746,7 @@ def plot_helpfulness(df, outpath, toolname): mpatches.Patch(color=colors[3], label='not helpful report') ] ax.legend(handles=handles, ncol=2, loc='center left', bbox_to_anchor=(0.05, -0.3)) - #ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports") + # ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports") ax.set_xlabel("Percentage of error reports") ax.set_ylabel("MPI feature") plt.tight_layout() @@ -1475,6 +1494,77 @@ def cmd_latex(rootdir, toolnames): os.chdir(here) +def get_overview_plot(data, outpath,scrutiny="base"): + assert len(data) > 0 + # get the column names = all the metrics tht are calculated + df_first_tool= next(iter(data.values())) + df_first_tool["TP"] = df_first_tool[f"TP_{scrutiny}"] + df_first_tool = classify_tests(df_first_tool) + df_first_tool = aggregate_metrics_per_category(df_first_tool) + cols = df_first_tool.columns + + df_coll = pd.DataFrame(columns=cols) + df_rma= pd.DataFrame(columns=cols) + df_p2p = pd.DataFrame(columns=cols) + df_total = pd.DataFrame(columns=cols) + + for toolname, df in data.items(): + df["TP"] = df[f"TP_{scrutiny}"] + df = classify_tests(df) + df = aggregate_metrics_per_category(df) + df_coll.loc[toolname] = df.loc["COLL"] + df_rma.loc[toolname] = df.loc["RMA"] + df_p2p.loc[toolname] = df.loc["P2P"] + df_total.loc[toolname] = df.loc["ALL"] + + + SMALL_SIZE = 20 + MEDIUM_SIZE = 22 + BIGGER_SIZE = 24 + + plt.rc('font', size=SMALL_SIZE) # controls default text sizes + plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title + plt.rc('axes', labelsize=SMALL_SIZE) # fontsize of the x and y labels + plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels + plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels + plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize + plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title + + fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9)) # + + # colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb'] + colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000'] + + ((ax1, ax2), (ax3, ax4)) = axs + df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False, color=colors) + ax1.set_title('P2P') + handles, labels = ax1.get_legend_handles_labels() + + df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors) + ax2.set_title('Collective') + ax2.yaxis.tick_right() + # Set the y-axis labels to uppercase + ax2.set_yticklabels([label.get_text().upper() for label in ax2.get_yticklabels()]) + + df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors) + ax3.set_title('RMA') + + df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors) + ax4.set_title('Total') + ax4.yaxis.tick_right() + + for ax in [ax1, ax2, ax3, ax4]: + ax.set_ylabel('') + # Set the y-axis labels to uppercase + ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()]) + + fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05), ) + + plt.tight_layout() + + plt.savefig(os.path.join(outpath, "overview_per_cat.pdf"), bbox_inches="tight") + + ######################## # cmd_plots(): what to do when '-c plots' is used (extract the statistics of this tool) ######################## @@ -1485,10 +1575,13 @@ def cmd_plots(rootdir, toolnames, ext="pdf"): os.makedirs('plots', exist_ok=True) outpath = f'{rootdir}/plots/' + collected_data = {} for toolname in toolnames: df = read_tool_reports(rootdir, toolname) - plot_helpfulness(df,outpath,toolname) + plot_helpfulness(df, outpath, toolname) + collected_data[toolname]=df + get_overview_plot(collected_data,outpath) os.chdir(here) @@ -1600,7 +1693,7 @@ elif args.c == 'plots': if not plots_loaded: print("[MBB] Error: Dependancies ('numpy' or 'matplotlib') are not available!") exit(-1) - extract_all_todo_from_logdir(args.x, args.logs_dir) + extract_all_todo_from_logdir(arg_tools[0], args.logs_dir) if args.x == 'mpirun': # toolnames=['itac', 'simgrid', 'must', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker'] toolnames = ['itac', 'must', 'parcoach'] diff --git a/scripts/result_plot.py b/scripts/result_plot.py deleted file mode 100644 index b62b5ae835dba524d862eecc5ef9e8001285faf3..0000000000000000000000000000000000000000 --- a/scripts/result_plot.py +++ /dev/null @@ -1,88 +0,0 @@ -# script to generate the evaluation results plot shown in the paper - -import pandas as pd -import os - -import matplotlib.pyplot as plt -import seaborn as sns - -sns.set_theme() -sns.set_style("whitegrid") - -# input path -input_path = "/home/tim/mpi-bugbench/logs/mpi-bugbench-results/logs-20240723-151721/csv" -# output path -plot_path = "/home/tim/paper/2024_eurompi_mpi-bugbench-paper/media" - -df_itac = pd.read_csv(os.path.join(input_path, "itac_base.csv"), index_col=0) -df_parcoach = pd.read_csv(os.path.join(input_path, "parcoach_base.csv"), index_col=0) -df_must = pd.read_csv(os.path.join(input_path, "must_base.csv"), index_col=0) - -df_coll = pd.DataFrame(columns=df_itac.columns) -df_coll.loc["MUST"] = df_must.loc["COLL"] -df_coll.loc["ITAC"] = df_itac.loc["COLL"] -df_coll.loc["PARCOACH"] = df_parcoach.loc["COLL"] - -df_p2p = pd.DataFrame(columns=df_itac.columns) -df_p2p.loc["MUST"] = df_must.loc["P2P"] -df_p2p.loc["ITAC"] = df_itac.loc["P2P"] -df_p2p.loc["PARCOACH"] = df_parcoach.loc["P2P"] - -df_rma = pd.DataFrame(columns=df_itac.columns) -df_rma.loc["MUST"] = df_must.loc["RMA"] -df_rma.loc["ITAC"] = df_itac.loc["RMA"] -df_rma.loc["PARCOACH"] = df_parcoach.loc["RMA"] - -df_total = pd.DataFrame(columns=df_itac.columns) -df_total.loc["MUST"] = df_must.loc["ALL"] -df_total.loc["ITAC"] = df_itac.loc["ALL"] -df_total.loc["PARCOACH"] = df_parcoach.loc["ALL"] - -SMALL_SIZE = 20 -MEDIUM_SIZE = 22 -BIGGER_SIZE = 24 - -plt.rc('font', size=SMALL_SIZE) # controls default text sizes -plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title -plt.rc('axes', labelsize=SMALL_SIZE) # fontsize of the x and y labels -plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels -plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels -plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize -plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title - -fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9)) # - -#colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb'] -colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000'] -colors_2 = {'TP': '#117733', 'TN': '#88CCEE', 'FP': '#CC6677', 'FN': '#AA4499', 'RE': '#DDCC77', 'CE': '#DDDDDD'} - -hatches= ["","",".",".","",""] - -((ax1, ax2), (ax3, ax4)) = axs -df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False,color=colors) -ax1.set_title('P2P') -handles, labels = ax1.get_legend_handles_labels() - -df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors) -ax2.set_title('Collective') -ax2.yaxis.tick_right() -# Set the y-axis labels to uppercase -ax2.set_yticklabels([label.get_text().upper() for label in ax2.get_yticklabels()]) - -df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors) -ax3.set_title('RMA') - -df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors) -ax4.set_title('Total') -ax4.yaxis.tick_right() - -for ax in [ax1, ax2, ax3, ax4]: - ax.set_ylabel('') - # Set the y-axis labels to uppercase - ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()]) - -fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05), ) - -plt.tight_layout() - -plt.savefig(os.path.join(plot_path, "results_per_cat.pdf"), bbox_inches="tight")