From 4e2e7a60c7d0bf82751b3437f468fcc739460fc2 Mon Sep 17 00:00:00 2001
From: Tim Jammer <tim.jammer@tu-darmstadt.de>
Date: Wed, 31 Jul 2024 10:39:55 +0200
Subject: [PATCH] Change noise table output to CSV
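
The per-tool noise ratios are now collected into two DataFrames and
written to csv/noise.csv and csv/overall_noise_including_unexpected.csv
instead of only being printed. Console output is kept behind a new
print_to_console flag on cmd_csv (off by default). The tool-selection
preamble is dropped: its used_toolnames result was never read, since
the loop below already iterates the given toolnames directly.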

---
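
Notes: a minimal sketch of reading the new output back, assuming pandas
is available and cmd_csv has already been run (the column set depends on
which tools were evaluated):

    import pandas as pd

    # noise.csv holds one row per category (plus an "ALL" row) and one
    # column per tool; the first CSV column is the category index.
    df = pd.read_csv("csv/noise.csv", index_col=0)
    print(df.loc["ALL"])  # overall noise ratio for each tool
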
 MBI.py | 60 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/MBI.py b/MBI.py
index 48900c88e..6ffd7dbf6 100755
--- a/MBI.py
+++ b/MBI.py
@@ -638,36 +638,24 @@ def cmd_helpfulness(rootdir, toolnames):
     pass
 
 
-def cmd_csv(rootdir, toolnames):
+def cmd_csv(rootdir, toolnames, print_to_console=False):
     here = os.getcwd()
     os.chdir(rootdir)
-    used_toolnames = []
     outpath = f'{rootdir}/csv/'
 
     # Create directory for output if not present
     pathlib.Path(outpath).mkdir(parents=True, exist_ok=True)
 
-
-    # select the tools for which we have some results
-    print("Produce the stats for:", end='')
-    for toolname in toolnames:
-        if not toolname in tools:
-            raise Exception(f"Tool {toolname} does not seem to be a valid name.")
-
-        if os.path.exists(f'{args.logs_dir}/{toolname}'):
-            used_toolnames.append(toolname)
-            print(f' {toolname}', end="")
-
-    print(".")
-
-
+    df_noise_ratio = pd.DataFrame(columns=toolnames)
+    df_overall_noise_ratio = pd.DataFrame(columns=toolnames)
 
     pd.set_option('display.max_columns', 14)
 
     for toolname in toolnames:
         df = read_tool_reports(rootdir, toolname)
         df.to_csv(f'{outpath}/{toolname}_raw.csv', index=False)
-        print(f"=== {toolname} ===")
+        if print_to_console:
+            print(f"=== {toolname} ===")
 
         # Output for each type of TP
         for (colname) in ["base", "class", "line", "class_line", "class_line_no_line_noise", "class_line_no_class_noise"]:
@@ -676,27 +664,35 @@ def cmd_csv(rootdir, toolnames):
             df_classified.to_csv(f'{outpath}/{toolname}_{colname}_full.csv', index=False)
             df_result = aggregate_metrics_per_category(df_classified)
             df_result.to_csv(f'{outpath}/{toolname}_{colname}.csv', index=True)
-            print(f"\n{colname}:")
-            print(df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]])
-            df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(f'{outpath}/{toolname}_{colname}.tex')
+            if print_to_console:
+                print(f"\n{colname}:")
+                print(df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]])
+                df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(f'{outpath}/{toolname}_{colname}.tex')
 
         plot_helpfulness(df, outpath, toolname)
 
-        df_plot = df.groupby("category").sum()
-        df_plot.loc["ALL"] = df_plot.sum(axis=0)
-        df_plot.drop("other", axis=0, inplace=True)
-        df_plot["noise_ratio"] = df_plot["num_noise_line"] / df_plot["num_error_reports"]
-        print("overall_noise")
-        print(df_plot["noise_ratio"])
+        df_noise_per_tool = df.groupby("category").sum()
+        df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
+        df_noise_per_tool.drop("other", axis=0, inplace=True)
+        df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
+        if print_to_console:
+            print("overall_noise")
+            print(df_noise_per_tool["noise_ratio"])
+        df_overall_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
 
         df_copy = df.copy()
         df_copy.loc[df_copy['ERROR_EXPECTED'] == False, ['num_noise_class_line', 'num_error_reports']] = 0
-        df_plot = df_copy.groupby("category").sum()
-        df_plot.loc["ALL"] = df_plot.sum(axis=0)
-        df_plot.drop("other", axis=0, inplace=True)
-        df_plot["noise_ratio"] = df_plot["num_noise_line"] / df_plot["num_error_reports"]
-        print("noise_in_cases_where_errors_are_present")
-        print(df_plot[["noise_ratio", "num_noise_class_line", "num_error_reports"]])
+        df_noise_per_tool = df_copy.groupby("category").sum()
+        df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
+        df_noise_per_tool.drop("other", axis=0, inplace=True)
+        df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
+        if print_to_console:
+            print("noise_in_cases_where_errors_are_present")
+            print(df_noise_per_tool[["noise_ratio", "num_noise_class_line", "num_error_reports"]])
+        df_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
+
+    df_noise_ratio.to_csv(f'{outpath}/noise.csv')
+    df_overall_noise_ratio.to_csv(f'{outpath}/overall_noise_including_unexpected.csv')
 
 
 def plot_helpfulness(df, outpath, toolname):
-- 
GitLab