    #! /usr/bin/python3
    
    # autopep8 -i --max-line-length 130 MBB.py
    
    import shutil
    import os
    import signal
    import sys
    import stat
    import re
    import argparse
    import time
    import glob
    import subprocess
    import statistics
    import multiprocessing as mp
    import pathlib
    import logging
    import pandas as pd
    
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    
    from scripts.MBButils import categorize
    
    mpl.rcParams['hatch.linewidth'] = 4.5  # hatch linewidth
    
    # Add our lib directory to the PYTHONPATH, and load our utility libraries
    sys.path.append(f'{os.path.dirname(os.path.abspath(__file__))}/scripts')
    
    from MBButils import *
    from LaTeXutils import *
    
    # Plots need big dependencies like numpy and matplotlib, so just skip
    # the import if they are not available.
    plots_loaded = False
    try:
        from tools.gen_plots_radar import *
    
        plots_loaded = True
    except ImportError:
        print("[MBB] Warning: ImportError for the plots module.")
    
    import tools.parcoach
    import tools.simgrid
    import tools.smpi  # SimGrid without MC
    import tools.smpivg  # SimGrid with valgrind instead of MC
    import tools.must
    import tools.mpisv
    import tools.hermes
    import tools.isp
    
    itac_loaded = False
    try:
        import tools.itac
    
        itac_loaded = True
    except ImportError:
        print(
            "[MBB] Warning: ITAC module cannot be loaded because of an ImportError (that's OK if you did not plan to use it).")
    import tools.civl
    import tools.aislinn
    import tools.mpi_checker
    
    tools = {'aislinn': tools.aislinn.Tool(), 'civl': tools.civl.Tool(), 'hermes': tools.hermes.Tool(),
             'isp': tools.isp.Tool(), 'mpisv': tools.mpisv.Tool(),
             'itac': tools.itac.Tool() if itac_loaded else None,
             'must': tools.must.V18(),  # 'must17': tools.must.V17(), # This one is deprecated, and no RC release right now
             'simgrid': tools.simgrid.Tool(), 'simgrid-3.27': tools.simgrid.v3_27(), 'simgrid-3.28': tools.simgrid.v3_28(),
             'simgrid-3.29': tools.simgrid.v3_29(), 'simgrid-3.30': tools.simgrid.v3_30(),
             'simgrid-3.31': tools.simgrid.v3_31(), 'simgrid-3.32': tools.simgrid.v3_32(),
             'smpi': tools.smpi.Tool(), 'smpivg': tools.smpivg.Tool(), 'parcoach': tools.parcoach.Tool(),
             'mpi-checker': tools.mpi_checker.Tool()}
    
    # Some scripts may fail if error messages get translated
    os.environ["LC_ALL"] = "C"
    
    # BufferLength/BufferOverlap
    # RMA concurrency errors (local and distributed)
    
    ########################
    # Extract the TODOs from the codes
    ########################
    todo = []
    
    
    def extract_all_todo(batch):
        """Extract the TODOs from all existing files, applying the batching request"""
        if os.path.exists(f"/MBB/scripts/{args.gencodes}/"):  # Docker run
            filenames = glob.glob(f"/MBB/scripts/{args.gencodes}/**/*.c")
        elif os.path.exists(f"{args.gencodes}/"):  # Gitlab-ci run
            filenames = glob.glob(f"{os.getcwd()}/{args.gencodes}/*.c")  # our code expects absolute paths
        elif os.path.exists(f"../../{args.gencodes}/"):  # Local runs
            filenames = glob.glob(f"{os.getcwd()}/../../{args.gencodes}/*.c")  # our code expects absolute paths
        else:
            subprocess.run("ls ../..", shell=True)
            raise Exception(
                f"Cannot find the input codes (cwd: {os.getcwd()}). Did you run the original_MBI_generators before running the tests?")
        # Choose the files that will be used by this runner, depending on the -b argument
        match = re.match(r'(\d+)/(\d+)', batch)
        if not match:
            raise Exception(
                f"The parameter to the batch option ({batch}) is invalid. It must be of the form 'N/M', with N and M integers.")
        pos = int(match.group(1))
        runner_count = int(match.group(2))
        assert pos > 0
        assert pos <= runner_count
        batch = int(len(filenames) / runner_count) + 1
        min_rank = batch * (pos - 1)
        max_rank = (batch * pos) - 1
        print(f'Handling files from #{min_rank} to #{max_rank}, out of {len(filenames)} in {os.getcwd()}')
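        # Illustration (hypothetical numbers): with 100 files and -b 2/4, batch is 26,
        # so this runner handles the sorted files #26 to #51 (inclusive).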
    
        global todo
        filenames = sorted(filenames)
        for filename in filenames[min_rank:max_rank + 1]:  # max_rank is inclusive, as printed above
            todo = todo + parse_one_code(filename)
        if pos == runner_count and pos != 1:  # The last runner starts from the end of the array to ease dynamic splitting
            todo = list(reversed(todo))
    
    
    def extract_all_todo_from_logdir(tool, logdir):
        """Extract the TODOs from the given logdir"""
        if os.path.exists(logdir):
            filenames = glob.glob(f"{logdir}/{tool}/*.c")
        else:
            raise Exception(
                f"Cannot find the input codes ({logdir}). Did you run the original_MBI_generators before running the tests?")
    
        global todo
        filenames = sorted(filenames)
        for filename in filenames:
            todo = todo + parse_one_code(filename)
        print(todo)
    
    
    ########################
    # cmd_gencodes(): what to do when '-c generate -lev <1|2|>' is used (Generating the codes)
    ########################
    
    def cmd_gencodes(level):
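        """Generate the benchmark codes at the requested level (used by '-c generate')."""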
        here = os.getcwd()
        if os.path.exists("/MBB/scripts/errors/"):  
            #level = 2 
            print(f"Generate level {level}")
            subprocess.run(f'tar -xf real_world_data.csv.tar.gz', shell=True, check=True)
            subprocess.run(f'cd scripts && python3 generate.py --generator_dir errors --level {level} --real_world_data ../output.csv --remove_previous_generation_results', shell=True, check=True)
        else:
            raise Exception("Cannot find the generators. Please report that bug.")
        os.chdir(here)
    
    
    
       # if os.path.exists("/MBI/scripts/original_MBI_generators/CollArgGenerator.py"):  # Docker run
       #     print("Docker run")
       #     generators = glob.glob("/MBI/scripts/original_MBI_generators/*Generator.py")
       #     dir = "/MBI/gencodes"
       # elif os.path.exists("../../scripts/original_MBI_generators/CollArgGenerator.py"):  # Local run, from logs dir
       #     print("Local run, from tools' logs dir")
       #     generators = glob.glob(f"{os.getcwd()}/../../scripts/original_MBI_generators/*Generator.py")
       #     dir = "../../gencodes/"
       # elif os.path.exists("scripts/original_MBI_generators/CollArgGenerator.py"):  # Local run, from main dir
       #     print("Local run, from MBI main dir")
       #     generators = glob.glob(f"{os.getcwd()}/scripts/original_MBI_generators/*Generator.py")
       #     dir = "gencodes/"
       # else:
       #     raise Exception("Cannot find the codes' original_MBI_generators. Please report that bug.")
       # subprocess.run(f"rm -rf {dir} ; mkdir {dir}", shell=True, check=True)
       # here = os.getcwd()
       # os.chdir(dir)
       # print(f"Generate the codes (in {os.getcwd()}): ", end='')
       # for generator in generators:
       #     m = re.match("^.*?/([^/]*)Generator.py$", generator)
       #     if m:
       #         print(m.group(1), end=", ")
       #     else:
       #         print(generator, end=", ")
       #     subprocess.run(f'../scripts/ensure_python3 {generator}', shell=True, check=True)
       # print("\nTest count: ", end='')
       # sys.stdout.flush()
       # subprocess.run("ls *.c|wc -l", shell=True, check=True)
       # subprocess.run("for n in *.c ; do cat -n $n > $n.txt ; done", shell=True, check=True)
       # os.chdir(here)
    
    
    ########################
    # cmd_build(): what to do when '-c build' is used (building the tool, discarding the cache)
    ########################
    def cmd_build(rootdir, toolname):
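        """Build the given tool from scratch, ignoring any cached build (used by '-c build')."""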
        # Basic verification
        tools[toolname].ensure_image()
    
        # Build the tool if needed
        tools[toolname].build(rootdir=rootdir, cached=False)
    
    
    ########################
    # cmd_run(): what to do when '-c run' is used (running the tests)
    ########################
    def cmd_run(rootdir, toolname, batchinfo):
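        """Run all tests of the current batch with the given tool, using a pool of worker processes (used by '-c run')."""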
        # Go to the tool's logs directory if needed
        rootdir = os.path.dirname(os.path.abspath(__file__))
        os.makedirs(f'{rootdir}/{args.logs_dir}/{toolname}', exist_ok=True)
        os.chdir(f'{rootdir}/{args.logs_dir}/{toolname}')
        print(f"Run tool {toolname} from {os.getcwd()} (batch {batchinfo}).")
    
        tools[toolname].set_rootdir(rootdir)
    
        # Basic verification
        tools[toolname].ensure_image()
    
        # Build the tool if needed
        tools[toolname].build(rootdir=rootdir)
    
        # build list of test executions for run function
        work_items = []
        for number, test in enumerate(todo):
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            work_items.append(
                (test['cmd'], test['filename'], binary, test['id'], number, args.timeout, batchinfo, args.loglevel))
    
        with mp.Pool(args.nworkers) as pool:
            pool.starmap(tools[toolname].run, work_items)
        # for test in todo:
        #     binary = re.sub('\.c', '', os.path.basename(test['filename']))
    
        #     print(f"\nTest #{count} out of {len(todo)}: '{binary}_{test['id']} '", end="... ")
        #     count += 1
        #     sys.stdout.flush()
    
        #     p = mp.Process(target=tools[toolname].run, args=(test['cmd'], test['filename'], binary, test['id'], args.timeout, batchinfo))
        #     with mp.Pool(5) as pool:
        #         pool.starmap()
        #     p.start()
        #     sys.stdout.flush()
        #     p.join(args.timeout+60)
        #     if p.is_alive():
        #         print("HARD TIMEOUT! The child process failed to timeout by itself. Sorry for the output.")
        #         p.terminate()
    
        tools[toolname].teardown()
    
    
    ########################
    # cmd_html(): what to do when '-c html' is used (extract the statistics of this tool)
    ########################
    def percent(num, den, compl=False, one=False, digits=4):
        """Returns the ratio of num/den as a percentage, rounded to N digits only (default: 4). If one=True, then return a ratio of 1 with 4 digits"""
        if den == 0:
            return "(error)"
        elif compl:  # Complementary
            res = round(100 - num / den * 100, digits - 2)
        else:
            res = round(num / den * 100, 2)
        if int(res) == 100:
            return "1" if one else "100"
        return round(res / 100, digits) if one else res
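    # Illustrative values: percent(857, 1000) -> 85.7; percent(857, 1000, one=True) -> 0.857;
    # percent(1, 4, compl=True) -> 75.0; percent(1, 0) -> "(error)"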
    
    
    def bold_if(val, target):
        """Returns the value as a bold LaTeX string if it equals the target, or unchanged otherwise."""
        if str(val) == str(target):
            return f'{{\\bf {val}}}'
        return str(val)
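    # Illustrative values: bold_if(5, 5) -> '{\bf 5}'; bold_if(4, 5) -> '4'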
    
    
    def seconds2human(secs):
        """Returns the amount of seconds in human-friendly way"""
        days = int(secs // 86400)
        hours = int((secs - days * 86400) // 3600)
        minutes = int((secs - days * 86400 - hours * 3600) // 60)
        seconds = secs - days * 86400 - hours * 3600 - minutes * 60
        return (f"{days} days, " if days else "") + (f"{hours} hours, " if hours else "") + (
            f"{minutes} minutes, " if minutes else "") + (f"{int(seconds * 100) / 100} seconds" if seconds else "")
    
    
    def cmd_html(rootdir, toolnames=[]):
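        """Generate index.html and summary.html in rootdir from the cached logs of the given tools, and print per-tool statistics (used by '-c html')."""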
        here = os.getcwd()
        os.chdir(rootdir)
        results = {}
        total_elapsed = {}
        used_toolnames = []
        for toolname in toolnames:
            if toolname not in tools:
                raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
            if os.path.exists(f'{args.logs_dir}/{toolname}'):
                used_toolnames.append(toolname)
                # To compute statistics on the performance of this tool
                results[toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [], 'TRUE_NEG': [],
                                     'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': []}
    
                # To compute timing statistics
                total_elapsed[toolname] = 0
    
        ########################
        # Analyse each test, grouped by expectation, and all tools for a given test
        ########################
        with open(f"{rootdir}/index.html", "w") as outHTML:
            outHTML.write("""
    <html><head><title>MBI results</title></head>
    <style>
    iframe {
      resize: both;
      overflow: auto;
    }
    </style>
    <body>
    <iframe width="100%" height="45%" src="summary.html"></iframe>
    <iframe width="100%" height="55%" name="MBB_details"></iframe>
    </body></html>
    """)
    
        with open(f"{rootdir}/summary.html", "w") as outHTML:
            outHTML.write(f"<html><head><title>MBB outcomes for all tests</title></head>\n")
            outHTML.write("""
    <style>
    .tooltip {
      position: relative;
      display: inline-block;
      border-bottom: 1px dotted black; /* If you want dots under the hoverable text */
    }
    
    .tooltip .tooltiptext {
      visibility: hidden;
      width: 120px;
      background-color: #555;
      color: #fff;
      text-align: center;
      border-radius: 6px;
      padding: 5px 0;
      position: absolute;
      z-index: 1;
      bottom: 125%;
      left: 50%;
      margin-left: -60px;
      opacity: 0;
      transition: opacity 0.3s;
    }
    
    .tooltip .tooltiptext::after {
      content: "";
      position: absolute;
      top: 100%;
      left: 50%;
      margin-left: -5px;
      border-width: 5px;
      border-style: solid;
      border-color: #555 transparent transparent transparent;
    }
    
    .tooltip:hover .tooltiptext {
      visibility: visible;
      opacity: 1;
    }
    </style>
    <body>
    """)
    
            # Generate the table of contents
            previous_detail = ''  # To open a new section for each possible detailed outcome
            outHTML.write("<h2>Table of contents</h2>\n<ul>\n")
            for test in sorted(todo,
                               key=lambda t: f"{possible_details[t['detail']]}|{t['detail']}|{t['filename']}|{t['id']}"):
                if previous_detail != possible_details[test['detail']]:
                    if previous_detail != '':  # Close the previous item, if we are not generating the first one
                        outHTML.write(f" </li>\n")
                    previous_detail = possible_details[test['detail']]
                    if test['detail'] != 'OK':
                        outHTML.write(
                            f" <li><a href='#{possible_details[test['detail']]}'>{displayed_name[possible_details[test['detail']]]}</a> (scope: {error_scope[possible_details[test['detail']]]})\n")
                    else:
                        outHTML.write(f" <li><a href='#OK'>{displayed_name[possible_details[test['detail']]]}</a>\n")
    
            outHTML.write("  </ul>\n <li><a href='#metrics'>Summary metrics</a></li></ul>\n")
    
            # Generate the actual content
            previous_detail = ''  # To open a new section for each possible detailed outcome
            testcount = 0  # To repeat the table header every 25 lines
            for test in sorted(todo,
                               key=lambda t: f"{possible_details[t['detail']]}|{t['detail']}|{t['filename']}|{t['id']}"):
                testcount += 1
                if previous_detail != possible_details[test['detail']] or testcount == 25:
                    if testcount != 25:  # Write the expected outcome only once, not every 25 tests
                        if previous_detail != '':  # Close the previous table, if we are not generating the first one
                            outHTML.write(f"</table>\n")
                        previous_detail = possible_details[test['detail']]
                        if test['detail'] != 'OK':
                            outHTML.write(
                                f"  <a name='{possible_details[test['detail']]}'/><h3>{displayed_name[possible_details[test['detail']]]} errors (scope: {error_scope[possible_details[test['detail']]]})</h3>\n")
                        else:
                            outHTML.write(f"  <a name='OK'/><h3>Correct codes</h3>\n")
    
                        outHTML.write('  <table border=1>\n')
                    testcount = 0
                    outHTML.write("   <tr><td>Test</td>")
                    for toolname in used_toolnames:
                        outHTML.write(f"<td>&nbsp;{displayed_name[toolname]}&nbsp;</td>")
                    outHTML.write(f"</tr>\n")
                outHTML.write(f"     <tr>")
    
                binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
                ID = test['id']
                test_id = f"{binary}_{ID}"
                expected = test['expect']
    
                outHTML.write(
                    f"<td><a href='{test['filename']}' target='MBB_details'>{binary}</a>&nbsp;<a href='{test['filename']}'><img title='Download source' src='img/html.svg' height='24' /></a>")
                if ID != 0:
                    outHTML.write(f' (test {ID + 1}) ')
                outHTML.write("</td>")
    
                for toolname in used_toolnames:
                    (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                              test_id=test_id, logs_dir=args.logs_dir,
                                                                              expected=expected, autoclean=True)
    
                    results[toolname][res_category].append(f"{test_id} expected {test['detail']}, outcome: {diagnostic}")
                    outHTML.write(
                        f"<td align='center'><a href='{args.logs_dir}/{toolname}/{test_id}.txt' target='MBB_details'><img title='{displayed_name[toolname]} {diagnostic} (returned {outcome})' src='img/{res_category}.svg' width='24' /></a> ({outcome})")
                    extra = None
    
                    report = []
                    for root, dirs, files in os.walk(f"{args.logs_dir}/{toolname}/{test_id}"):
                        if "index.html" in files:
                            report.append(os.path.join(root, "index.html"))
    
                    if len(report) > 0:
                        extra = f'{args.logs_dir}/' + report[0].split(f'{args.logs_dir}/')[1]
                    if os.path.exists(f'{args.logs_dir}/{toolname}/{test_id}.html'):
                        extra = f'{args.logs_dir}/{toolname}/{test_id}.html'
                    if os.path.exists(f'{args.logs_dir}/{toolname}/{test_id}-klee-out'):  # MPI-SV
                        extra = f'{args.logs_dir}/{toolname}/{test_id}-klee-out'
    
                    if extra is not None:
                        outHTML.write(
                            f"&nbsp;<a href='{extra}' target='MBB_details'><img title='more info' src='img/html.svg' height='24' /></a>")
                    outHTML.write("</td>")
    
                    if res_category != 'timeout' and elapsed is not None:
                        total_elapsed[toolname] += float(elapsed)
    
                    if len(used_toolnames) == 1:
                        print(f"Test '{test_id}' result: {res_category}: {diagnostic}. Elapsed: {elapsed} sec")
    
                    np = re.search(r"(?:-np) [0-9]+", test['cmd'])
                    np = int(re.sub(r"-np ", "", np.group(0)))
    
                outHTML.write(f"</tr>\n")
            outHTML.write(f"</table>\n")
    
            # Display summary metrics for each tool
            def tool_stats(toolname):
                return (
                    len(results[toolname]['TRUE_POS']), len(results[toolname]['TRUE_NEG']),
                    len(results[toolname]['FALSE_POS']),
                    len(results[toolname]['FALSE_NEG']), len(results[toolname]['unimplemented']),
                    len(results[toolname]['failure']), len(results[toolname]['timeout']), len(results[toolname]['other']))
    
            outHTML.write("\n<a name='metrics'/><h2>Metrics</h2><table border=1>\n<tr><td/>\n")
            for toolname in used_toolnames:
                outHTML.write(f"<td>{displayed_name[toolname]}</td>")
    
            outHTML.write("</tr>\n<tr><td>API coverage</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                total = TP + TN + FP + FN + nTout + nPort + nFail + nNocc
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(nPort, total, compl=True)}% <span class='tooltiptext'>{nPort} unimplemented calls, {nNocc} inconclusive runs out of {total}</span></div></td>")
    
            outHTML.write("</tr>\n<tr><td>Robustness</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                totalPort = TP + TN + FP + FN + nTout + nFail
                outHTML.write(
                    f"<td><div class='tooltip'>{percent((nTout + nFail), (totalPort), compl=True)}% <span class='tooltiptext'>{nTout} timeouts, {nFail} failures out of {totalPort}</span></div></td>")
    
            outHTML.write("</tr>\n<tr><td>Recall</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TP, (TP + FN))}% <span class='tooltiptext'>found {TP} errors out of {TP + FN}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Specificity</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TN, (TN + FP))}%  <span class='tooltiptext'>recognized {TN} correct codes out of {TN + FP}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Precision</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TP, (TP + FP))}% <span class='tooltiptext'>{TP} error diagnostics are correct out of {TP + FP}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Accuracy</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent((TP + TN), (TP + TN + FP + FN))}% <span class='tooltiptext'>{TP + TN} correct diagnostics in total, out of {TP + TN + FP + FN} diagnostics</span></div></td>")
            outHTML.write("</tr></table>")
            outHTML.write(
                "<p>Hover over the values for details. API coverage issues, timeouts and failures are not considered when computing the other metrics, thus differences in the total amount of tests.</p>")
    
            # Add the generated radar plots
            if plots_loaded:
                for toolname in used_toolnames:
                    outHTML.write(
                        f'<img src="plots/ext_radar_all_{toolname}.svg" alt="Radar plot for all error types for the {displayed_name[toolname]} tool." />')
    
            outHTML.write(f"</body></html>\n")
    
        ########################
        # Per tool statistics summary
        ########################
        for toolname in used_toolnames:
            TP = len(results[toolname]['TRUE_POS'])
            TN = len(results[toolname]['TRUE_NEG'])
            FP = len(results[toolname]['FALSE_POS'])
            FN = len(results[toolname]['FALSE_NEG'])
            nPort = len(results[toolname]['unimplemented'])
            nFail = len(results[toolname]['failure'])
            other = len(results[toolname]['other'])
            nTout = len(results[toolname]['timeout'])
            passed = TP + TN
            total = passed + FP + FN + nTout + nPort + nFail + other
    
            print(f"XXXXXXXXX Final results for {toolname}")
            if FP > 0:
                print(f"XXX {FP} false positives")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['FALSE_POS']:
                        print(f"  {p}")
            if FN > 0:
                print(f"XXX {FN} false negatives")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['FALSE_NEG']:
                        print(f"  {p}")
            if nTout > 0:
                print(f"XXX {nTout} timeouts")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['timeout']:
                        print(f"  {p}")
            if nPort > 0:
                print(f"XXX {nPort} API coverage issues")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['unimplemented']:
                        print(f"  {p}")
            if nFail > 0:
                print(f"XXX {nFail} tool failures")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['failure']:
                        print(f"  {p}")
            if other > 0:
                print(f"XXX {nFail} inconclusive runs (output parsing failure)")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['other']:
                        print(f"  {p}")
    
            print(f"\nXXXX Summary for {toolname} XXXX  {passed} test{'' if passed == 1 else 's'} passed (out of {total})")
            print(f"\nFP = {FP}  FN = {FN}  TP = {TP}  TN = {TN}")
            print(f"\nCE = {nPort}  TO = {nTout}  RE = {nFail}")
            print(f"API coverage: {percent(nPort, total, compl=True)}% ({nPort} tests failed out of {total})")
            print(
                f"Robustness: {percent((nTout + nFail), (total - nPort), compl=True)}% ({nTout} timeouts and {nFail} failures out of {total - nPort})\n")
    
            print(f"Recall: {percent(TP, (TP + FN))}% (found {TP} errors out of {TP + FN})")
            print(f"Specificity: {percent(TN, (TN + FP))}% (recognized {TN} correct codes out of {TN + FP})")
            print(f"Precision: {percent(TP, (TP + FP))}% ({TP} diagnostic of error are correct out of {TP + FP})")
            print(
                f"Accuracy: {percent((TP + TN), (TP + TN + FP + FN))}% ({TP + TN} correct diagnostics in total, out of {TP + TN + FP + FN} diagnostics)")
            print(
                f"\nTotal time of {toolname} for all tests (not counting the timeouts): {seconds2human(total_elapsed[toolname])} ({total_elapsed[toolname]} seconds)")
    
        os.chdir(here)
    
    
    # expects a df with at least ["ERROR_EXPECTED","any_error_reported","TP","category"]
    # classifies as FN,FP,TN,...
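    # For instance, a row with ERROR_EXPECTED=True, any_error_reported=False, CE=False and RE=False
    # is counted as a false negative (FN).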
    def classify_tests(df_in):
        df = df_in[["test_id", "ERROR_EXPECTED", "any_error_reported", "category", "CE", "RE", "TP"]].copy()
    
        df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
                    df["RE"] == False)
        df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
                    df["RE"] == False)
        df["FP"] = (((df["ERROR_EXPECTED"] == False) & df["any_error_reported"]) |  # a true false positive
                    # or a case where a not-helpful report is produced
                    ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & (
                               df["CE"] == False) & (df["RE"] == False)
    
        # so that this information is available per category
        df["ERROR_NOT_EXPECTED"] = (df["ERROR_EXPECTED"] == False)
    
        # every case is exactly one of these
        assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len(
            df)
        assert df["ERROR_EXPECTED"].sum() + df["ERROR_NOT_EXPECTED"].sum() == len(df)
    
        return df
    
    
    # aggregate metrics and calculate precision, recall and F1 based on this
    def aggregate_metrics_per_category(df_in):
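        """Sum the per-test booleans per category (plus an ALL row) and derive coverage, conclusiveness, specificity, recall, precision, F1 and overall accuracy."""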
        total_tests = len(df_in)
        df = df_in.groupby(["category"]).sum()
        df.loc["ALL"] = df.sum(axis=0)
    
        df["recall"] = df["TP"] / (df["ERROR_EXPECTED"])
        df["precision"] = df["TP"] / (df["TP"] + df["FP"])
        df["specificity"] = df["TN"] / (df["ERROR_NOT_EXPECTED"])
        df["overallaccuracy"] = (df["TP"] + df["TN"]) / total_tests
        df["coverage"] = 1 - (df["CE"]) / total_tests
        df["conclusiveness"] = 1 - ((df["CE"] + df["RE"]) / total_tests)
        df["f1"] = (df["TP"] + df["TP"]) / (df["TP"] + df["TP"] + df["FP"] + df["FN"])
    
        return df[
            ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1",
             "overallaccuracy"]]
    
    
    def read_tool_reports(rootdir, toolname):
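        """Categorize the cached logs of one tool into a DataFrame with one row per test, adding the TP_* columns for the different notions of a helpful report."""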
        if toolname not in tools:
            raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
        if not os.path.exists(f'{args.logs_dir}/{toolname}'):
            raise Exception(f"Not found Logs for {toolname}.")
    
        results = []
    
        for test in todo:
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            ID = test['id']
            test_category = test['category']
            test_id = f"{binary}_{ID}"
            expected = test['expect']
    
            resulting_categorization = categorize(tool=tools[toolname], toolname=toolname,
                                                  test=test, test_id=test_id, logs_dir=args.logs_dir,
                                                  )
            resulting_categorization["test_id"] = test_id
            resulting_categorization["category"] = test["category"]
            results.append(resulting_categorization)
    
        df = pd.DataFrame(results)
    
        df["TP_base"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & (df["CE"] == False) & (df["RE"] == False)
        df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & (
                    df["CE"] == False) & (df["RE"] == False)
        df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & (
                    df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & df[
            "correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line_no_class_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
            "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & (
                                                         df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line_no_line_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
            "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & (
                                                        df["RE"] == False)
    
        return df
    
    
    def cmd_csv(rootdir, toolnames, print_to_console=False):
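        """Write the per-tool raw results, the classified/aggregated metrics (CSV and LaTeX) and the noise ratios under <rootdir>/csv/."""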
        here = os.getcwd()
        os.chdir(rootdir)
        outpath = f'{rootdir}/csv/'
    
        # Create directory for output if not present
        pathlib.Path(outpath).mkdir(parents=True, exist_ok=True)
    
        df_noise_ratio = pd.DataFrame(columns=toolnames)
        df_overall_noise_ratio = pd.DataFrame(columns=toolnames)
    
        pd.set_option('display.max_columns', 14)
    
        for toolname in toolnames:
            df = read_tool_reports(rootdir, toolname)
            df.to_csv(f'{outpath}/{toolname}_raw.csv', index=False)
            if print_to_console:
                print(f"=== {toolname} ===")
    
            # Output for each type of TP
            for colname in ["base", "class", "line", "class_line", "class_line_no_line_noise",
                            "class_line_no_class_noise"]:
                df["TP"] = df[f"TP_{colname}"]
                df_classified = classify_tests(df)
                df_classified.to_csv(f'{outpath}/{toolname}_{colname}_full.csv', index=False)
                df_result = aggregate_metrics_per_category(df_classified)
                df_result.to_csv(f'{outpath}/{toolname}_{colname}.csv', index=True)
                if print_to_console:
                    print(f"\n{colname}:")
                    print(df_result[
                              ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
                               "precision", "f1", "overallaccuracy"]])
                    df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
                               "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(
                        f'{outpath}/{toolname}_{colname}.tex')
    
            df_noise_per_tool = df.groupby("category").sum()
            df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
            df_noise_per_tool.drop("other", axis=0, inplace=True)
            df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
            if print_to_console:
                print("overall_noise")
                print(df_noise_per_tool["noise_ratio"])
            df_overall_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
    
            df_copy = df.copy()
            df_copy.loc[df_copy['ERROR_EXPECTED'] == False, ['num_noise_class_line', 'num_error_reports']] = 0
            df_noise_per_tool = df_copy.groupby("category").sum()
            df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
            df_noise_per_tool.drop("other", axis=0, inplace=True)
            df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
            if print_to_console:
                print("noise_in_cases_where_errors_are_present")
                print(df_noise_per_tool[["noise_ratio", "num_noise_class_line", "num_error_reports"]])
            df_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
    
        df_noise_ratio.to_csv(f'{outpath}/noise.csv')
        df_overall_noise_ratio.to_csv(f'{outpath}/overall_noise_including_unexpected.csv')
    
    
    def plot_helpfulness(df, outpath, toolname):
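        """Plot, per MPI feature category, the share of helpful error reports (correct class and/or line) as a stacked horizontal bar chart saved to <outpath>/helpfulness_<toolname>_plot.pdf."""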
        SMALL_SIZE = 16
        MEDIUM_SIZE = 16
        BIGGER_SIZE = 16
    
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=BIGGER_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
        plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
        df_plot = df.groupby("category").sum()
        df_plot.loc["ALL"] = df_plot.sum(axis=0)
        df_plot.drop("other", axis=0, inplace=True)
        df_plot["TP_class"] = df_plot["TP_class"] - df_plot["TP_class_line"]
        df_plot["TP_line"] = df_plot["TP_line"] - df_plot["TP_class_line"]
        df_plot["TP_base"] = df_plot["TP_base"] - df_plot["TP_class_line"] - df_plot["TP_class"] - df_plot["TP_line"]
        colors = ['#88CCEE', '#88CCEE', '#44AA99', '#EE6677']
        # colors = ['#66CCEE', 'yellow', '#228833', '#EE6677']
        fig, ax = plt.subplots(1, 1, figsize=(9, 6))
        df_plot = df_plot[["TP_class", "TP_class_line", "TP_line", "TP_base"]]
        df_plot[["TP_class", "TP_class_line", "TP_line", "TP_base"]].div(df_plot.sum(axis=1), axis=0).plot.barh(
            stacked=True, color=colors, ax=ax, legend=False)
        # Customize bars
        bars = ax.patches
        for i in [4, 5, 6, 7]:
            bars[i].set_hatch("//")
            bars[i].set_edgecolor(colors[2])
        # Create custom legend handles
        handles = [
            mpatches.Patch(color=colors[0], label="correct error class"),
            mpatches.Patch(facecolor=colors[1], edgecolor=colors[2], hatch='//', label='correct class and line'),
            mpatches.Patch(color=colors[2], label='correct source line'),
            mpatches.Patch(color=colors[3], label='not helpful report')
        ]
        ax.legend(handles=handles, ncol=2, loc='center left', bbox_to_anchor=(0.05, -0.3))
        # ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports")
        ax.set_xlabel("Percentage of error reports")
        ax.set_ylabel("MPI feature")
        plt.tight_layout()
        plt.savefig(f'{outpath}/helpfulness_{toolname}_plot.pdf')
    
    
    def cmd_latex(rootdir, toolnames):
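        """Generate the LaTeX tables (per error category and per tool) under <rootdir>/latex/ from the cached results."""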
        here = os.getcwd()
        os.chdir(rootdir)
        results = {}
        total_elapsed = {}
        used_toolnames = []
    
        # select the tools for which we have some results
        print("Produce the stats for:", end='')
        for toolname in toolnames:
            if toolname not in tools:
                raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
            if os.path.exists(f'{args.logs_dir}/{toolname}'):
                used_toolnames.append(toolname)
                print(f' {toolname}', end="")
    
                # To compute timing statistics
                total_elapsed[toolname] = 0
        print(".")
    
        test_categories = ['COLL', 'P2P', 'RMA', 'other']
    
        # Initialize the data structure to gather all results
        results = {'total': {}, 'error': {}}
        timing = {'total': {}, 'error': {}}
    
        for test_category in test_categories:
            results[test_category] = {}
            timing[test_category] = {}
    
        for error in error_scope:
            results[error] = {}
            timing[error] = {}
            for toolname in used_toolnames:
                results[error][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [], 'TRUE_NEG': [],
                                            'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': []}
                results['total'][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                              'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [], 'error': [],
                                              'OK': []}
                results['error'][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                              'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [], 'error': [],
                                              'OK': []}
                timing[error][toolname] = []
                timing['total'][toolname] = []
                timing['error'][toolname] = []
                for test_category in test_categories:
                    results[test_category][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                                        'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [],
                                                        'error': [], 'OK': []}
                    timing[test_category][toolname] = []
    
        # Get all data from the caches
        for test in todo:
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            ID = test['id']
            test_category = test['category']
            test_id = f"{binary}_{ID}"
            expected = test['expect']
    
            for toolname in used_toolnames:
                (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                          test_id=test_id, logs_dir=args.logs_dir,
                                                                          expected=expected)
                ##(res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname, test_id=test_id, expected=expected)
                error = possible_details[test['detail']]
                results[error][toolname][res_category].append(test_id)
                results['total'][toolname][res_category].append(test_id)
                results[test_category][toolname][res_category].append(test_id)
                timing[error][toolname].append(float(elapsed))
                timing['total'][toolname].append(float(elapsed))
                timing[test_category][toolname].append(float(elapsed))
                if expected == 'OK':
                    results['total'][toolname]['OK'].append(test_id)
                    results[test_category][toolname]['OK'].append(test_id)
                else:
                    results['total'][toolname]['error'].append(test_id)
                    results[test_category][toolname]['error'].append(test_id)
                    results['error'][toolname][res_category].append(test_id)
                    timing['error'][toolname].append(float(elapsed))
    
        # Create directory for output if not present
        pathlib.Path(f'{rootdir}/latex/').mkdir(parents=True, exist_ok=True)
    
        # Produce the results per tool and per category
        with open(f'{rootdir}/latex/results-per-category-landscape.tex', 'w') as outfile:
            outfile.write('\\setlength\\tabcolsep{3pt} % default value: 6pt\n')
            outfile.write("\\begin{tabular}{|l|*{" + str(len(used_toolnames)) + "}{c|c|c|c||}}\n")
            outfile.write("\\cline{2-" + str(len(used_toolnames) * 4 + 1) + "}\n")
            # First title line: Tool names
            outfile.write("  \\multicolumn{1}{c|}{}")
            for t in used_toolnames:
                outfile.write("& \\multicolumn{4}{c||}{" + displayed_name[t] + "}")
            outfile.write("\\\\\n")
            outfile.write("\\cline{2-" + str(len(used_toolnames) * 4 + 1) + "}\n")
            # Second title line: TP&TN&FP&FN per tool
            outfile.write("  \\multicolumn{1}{c|}{}")
            for t in used_toolnames:
                outfile.write(
                    "& \\rotatebox{90}{Build error~~} &\\rotatebox{90}{Failure} & \\rotatebox{90}{Incorrect} & \\rotatebox{90}{Correct~~} ")
            outfile.write("\\\\\\hline\n")
    
            for error in error_scope:
                if error == 'FOK':
                    outfile.write("\\hline\n")
                outfile.write(displayed_name[error])
                for toolname in used_toolnames:
                    port = len(results[error][toolname]['unimplemented'])
                    othr = len(results[error][toolname]['other'])
                    fail = len(results[error][toolname]['failure'])
                    tout = len(results[error][toolname]['timeout'])
                    good = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                    bad = len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                    outfile.write(f"&{port + othr} & {fail + tout} &{bad}&{good}")
                    # results[error][toolname] = {'failure':[], 'timeout':[], 'unimplemented':[], 'other':[], 'TRUE_NEG':[], 'TRUE_POS':[], 'FALSE_NEG':[], 'FALSE_POS':[]}
                outfile.write("\\\\\\hline\n")
            outfile.write("\\hline\n \\textbf{Total}")
            for toolname in used_toolnames:
                port = othr = fail = tout = good = bad = 0
                for error in error_scope:
                    port += len(results[error][toolname]['unimplemented'])
                    othr += len(results[error][toolname]['other'])
                    fail += len(results[error][toolname]['failure'])
                    tout += len(results[error][toolname]['timeout'])
                    good += len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                    bad += len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                outfile.write(f"&{port + othr} & {fail + tout} &{bad}&{good}")
            outfile.write("\\\\\\hline\n")
    
            # Finish the table
            outfile.write("\\end{tabular}\n")
            outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        # Produce the results per tool and per category
        with open(f'{rootdir}/latex/results-per-category-portrait.tex', 'w') as outfile:
            outfile.write('\\setlength\\tabcolsep{1.5pt} % default value: 6pt\n')
            # To split the table in two lines, do this: for errors in [['FOK','AInvalidParam','BResLeak','BReqLifecycle','BLocalConcurrency'], ['CMatch','DRace','DMatch','DGlobalConcurrency','EBufferingHazard']]:
            for errors in [
                ['FOK', 'AInvalidParam', 'BResLeak', 'BReqLifecycle', 'BLocalConcurrency', 'CMatch', 'DRace', 'DMatch',
                 'DGlobalConcurrency']]:
                outfile.write("\\begin{tabular}{|l@{}|*{" + str(
                    len(errors) - 1) + "}{c|c|c|c||} c|c|c|c|}\n")  # last column not in multiplier (len-1 used) to not have || at the end
                outfile.write(f"\\cline{{2-{len(errors) * 4 + 1}}}\n")
                # First title line: error categories
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in errors:
                    sep = '|' if error == errors[-1] else '||'  # Use || as a separator, unless that's the last column
                    outfile.write(f"&\\multicolumn{{4}}{{c{sep}}}{{{displayed_name[error].split(' ')[0]}}}")
                outfile.write("\\\\\n  \\multicolumn{1}{c|}{}")
                for error in errors:
                    sep = '|' if error == errors[-1] else '||'  # Use || as a separator, unless that's the last column
                    outfile.write(f"&\\multicolumn{{4}}{{c{sep}}}{{{displayed_name[error].split(' ')[1]}}}")
                outfile.write(f"\\\\\\cline{{2-{len(errors) * 4 + 1}}}\n")
                outfile.write("\\multicolumn{1}{c|}{}")
                for error in errors:
                    outfile.write(
                        "& \\rotatebox{90}{Build error~~} & \\rotatebox{90}{Runtime error} &")  # \\rotatebox{90}{Timeout~~}&
                    if error == 'FOK':
                        outfile.write(
                            " \\rotatebox{90}{False \\textbf{Positive}} & \\rotatebox{90}{True \\textbf{Negative}~~} \n")
                    else:
                        outfile.write(" \\rotatebox{90}{False Negative} & \\rotatebox{90}{True Positive~} \n")
                outfile.write("\\\\\\hline\n")
    
                # Find the best tool
                best = {}
                for error in errors:
                    best[error] = 0
                    for toolname in used_toolnames:
                        val = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                        if val > best[error]:
                            best[error] = val
                    # print(f"Best for {error} has {best[error]}")
    
                # display all tools
                for toolname in used_toolnames:
                    outfile.write(f'{displayed_name[toolname]}')
                    for error in errors:
                        port = len(results[error][toolname]['unimplemented'])
                        othr = len(results[error][toolname]['other'])
                        fail = len(results[error][toolname]['failure'])
                        tout = len(results[error][toolname]['timeout'])
                        good = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                        bad = len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                        if good == best[error]:  # The best tool is displayed in bold
                            outfile.write(f"&{{\\bf {port}}}&{{\\bf {tout + othr + fail}}}&{{\\bf {bad}}}&{{\\bf {good}}}")
                        else:
                            outfile.write(f"&{port}&{tout + othr + fail}&{bad}&{good}")
                    outfile.write("\\\\\\hline\n")
    
                outfile.write("\\hline\\textit{Ideal tool}")
                for error in errors:
                    toolname = used_toolnames[0]
                    total = len(results[error][toolname]['unimplemented']) + len(results[error][toolname]['other']) + len(
                        results[error][toolname]['failure'])
                    total += len(results[error][toolname]['timeout']) + len(results[error][toolname]['TRUE_POS']) + len(
                        results[error][toolname]['TRUE_NEG'])
                    total += len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
    
                    outfile.write(f"& \\textit{{0}} &\\textit{{0}} & \\textit{{0}} & \\textit{total} \n")
                outfile.write("\\\\\\hline\n")
    
                # Finish the table
                outfile.write("\\end{tabular}\n\n\\medskip\n")
            outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        for test_category in ['total'] + test_categories:
            # Produce the landscape results+metrics per tool for each test category
            with open(f'{rootdir}/latex/results-{test_category}.tex', 'w') as outfile:
                outfile.write('\\setlength\\tabcolsep{2pt} % default value: 6pt\n')
                outfile.write('\\begin{tabular}{|l|*{3}{c|}|*{4}{c|}|*{2}{c|}|*{4}{c|}|c|}\\hline\n')
                outfile.write(
                    '  \\multirow{2}{*}{ \\textbf{Tool}} &  \\multicolumn{3}{c||}{Errors} &\\multicolumn{4}{c||}{Results}&\\multicolumn{2}{c||}{Robustness} &\\multicolumn{4}{c||}{Usefulness}&\\textbf{Overall}\\\\\\cline{2-14}\n')
                outfile.write(
                    '& \\textbf{CE}&\\textbf{TO}&\\textbf{RE}  & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} &\\textbf{Coverage} & \\textbf{Conclusiveness} & \\textbf{Specificity}&\\textbf{Recall}& \\textbf{Precision}& \\textbf{F1 Score}    & \\textbf{accuracy}\\\\\\hline \n')
    
                # Search the best values
                best = {'TP': 0, 'TN': 0, 'FP': 999999, 'FN': 9999999, 'coverage': 0, 'completion': 0, 'specificity': 0,
                        'recall': 0, 'precision': 0, 'F1': 0, 'accuracy': 0}
                for toolname in used_toolnames:
                    TP = len(results[test_category][toolname]['TRUE_POS'])
                    TN = len(results[test_category][toolname]['TRUE_NEG'])
                    FN = len(results[test_category][toolname]['FALSE_NEG'])
                    FP = len(results[test_category][toolname]['FALSE_POS'])
                    if TP > best['TP']:
                        best['TP'] = TP
                    if TN > best['TN']:
                        best['TN'] = TN
                    if FP < best['FP']:
                        best['FP'] = FP
                    if FN < best['FN']:
                        best['FN'] = FN
    
                    port = len(results[test_category][toolname]['unimplemented'])
                    fail = len(results[test_category][toolname]['failure'])
                    othr = len(results[test_category][toolname]['other'])
                    tout = len(results[test_category][toolname]['timeout'])
                    total = TP + TN + FP + FN + port + fail + othr + tout
                    if (TN + FP) != 0 and TP + FN != 0 and TP + FP != 0:
                        coverage = float(percent(port, total, compl=True, one=True, digits=2))
                        if coverage > best['coverage']:
                            best['coverage'] = coverage
                        completion = float(percent((port + fail + othr + tout), (total), compl=True, one=True, digits=2))
                        if completion > best['completion']:
                            best['completion'] = completion
                        specificity = float(percent(TN, (TN + FP), one=True, digits=2))
                        if specificity > best['specificity']:
                            best['specificity'] = specificity
                        recall = float(percent(TP, (TP + FN), one=True, digits=2))
                        if recall > best['recall']:
                            best['recall'] = recall
                        precision = float(percent(TP, (TP + FP), one=True, digits=2))
                        if precision > best['precision']:
                            best['precision'] = precision
    
                        # Recompute precision & recall without rounding, to match the value computed when displaying the result
                        precision = TP / (TP + FP)
                        recall = TP / (TP + FN)
                        F1 = percent(2 * precision * recall, (precision + recall), one=True, digits=2)
                        if F1 > best['F1']:
                            best['F1'] = F1
                        accuracy = percent(TP + TN, (TP + TN + FP + FN + port + fail + othr + tout), one=True, digits=2)
                        if accuracy > best['accuracy']:
                            best['accuracy'] = accuracy
                    else:
                        print(
                            f"WARNING: {toolname} not considered as a best score: TN+FP={TP + FP} TP+FN={TP + FN} TP+FP={TP + FP}")
    
                for key in best:  # Cleanup the data to ensure that the equality test matches in bold_if()
                    if best[key] == 1.0:
                        best[key] = "1"
                print(f"best coverage: {best['coverage']}")
                print(f"best: {best}")
    
                for toolname in used_toolnames:
                    outfile.write(f'{displayed_name[toolname]}&\n')
    
                    port = len(results[test_category][toolname]['unimplemented'])
                    fail = len(results[test_category][toolname]['failure'])
                    othr = len(results[test_category][toolname]['other'])
                    tout = len(results[test_category][toolname]['timeout'])
                    TP = len(results[test_category][toolname]['TRUE_POS'])
                    TN = len(results[test_category][toolname]['TRUE_NEG'])
                    FN = len(results[test_category][toolname]['FALSE_NEG'])
                    FP = len(results[test_category][toolname]['FALSE_POS'])
    
                    total = TP + TN + FP + FN + port + fail + othr + tout
    
                    outfile.write(f"{bold_if(port, 0)}&{bold_if(tout, 0)}&{bold_if(fail + othr, 0)}")
                    outfile.write(
                        f"&{bold_if(TP, best['TP'])}&{bold_if(TN, best['TN'])}&{bold_if(FP, best['FP'])}&{bold_if(FN, best['FN'])}&")
    
                    # Coverage & Completion
                    coverage = percent(port, total, compl=True, one=True, digits=2)
                    completion = percent((port + fail + othr + tout), (total), compl=True, one=True, digits=2)
                    outfile.write(f"{bold_if(coverage, best['coverage'])} &{bold_if(completion, best['completion'])}&")
                    # Specificity: recognized {TN} correct codes out of {TN+FP}
                    specificity = percent(TN, (TN + FP), one=True, digits=2)
                    outfile.write(f'{bold_if(specificity, best["specificity"])}&')
                    # Recall: found {TP} errors out of {TP+FN}; Precision: {TP} error diagnostics are correct out of {TP+FP}
                    recall = percent(TP, (TP + FN), one=True, digits=2)
                    precision = percent(TP, (TP + FP), one=True, digits=2)
                    outfile.write(f'{bold_if(recall, best["recall"])} & {bold_if(precision, best["precision"])} &')
                    # F1 Score
                    if TP + FP > 0 and TP + FN > 0:
                        precision = TP / (TP + FP)
                        recall = TP / (TP + FN)
                        F1 = percent(2 * precision * recall, (precision + recall), one=True, digits=2)
                        outfile.write(f'{bold_if(F1, best["F1"])}&')
                    else:
                        outfile.write('(error)&')
                    # Accuracy: {TP+TN} correct diagnostics out of all {TP+TN+FP+FN+port+fail+othr+tout} tests
                    accuracy = percent(TP + TN, (TP + TN + FP + FN + port + fail + othr + tout), one=True, digits=2)
                    outfile.write(f'{bold_if(accuracy, best["accuracy"])}')
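                    # Illustrative sanity check with hypothetical counts: TP=40, TN=45, FP=10, FN=5 and no
                    # unimplemented/failed/timed-out runs gives precision = 40/50 = 0.80, recall = 40/45 ≈ 0.89,
                    # F1 = 2*0.80*0.89/(0.80+0.89) ≈ 0.84 and accuracy = (40+45)/100 = 0.85.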
    
                    outfile.write(f'\\\\\\hline\n')
                outfile.write(f'\\hline\n')
    
                outfile.write('\\textit{Ideal tool}&\\textit{0}&\\textit{0}&\\textit{0}&')
                outfile.write(
                    f"\\textit{{{len(results['total'][toolname]['error'])}}}&\\textit{{{len(results['total'][toolname]['OK'])}}}&\\textit{{0}}&\\textit{{0}}&")
                outfile.write(
                    "\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1} \\\\\\hline\n")
    
                outfile.write('\\end{tabular}\n')
                outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        # Produce the table with the metrics per tool per category (not used, as we put everything on one line only)
        with open(f'{rootdir}/latex/results-metrics.tex', 'w') as outfile:
            outfile.write('\\begin{tabular}{|l|*{7}{c|}}\\hline\n')
            outfile.write(
                '  \\multirow{2}{*}{ \\textbf{Tool}} &  \\multicolumn{2}{c|}{Robustness} &\\multicolumn{4}{c|}{Usefulness}&\\textbf{Overall}\\\\\\cline{2-7}\n')
    
            outfile.write(
                '  & \\textbf{Coverage} & \\textbf{Conclusiveness} & \\textbf{Specificity} & \\textbf{Recall} & \\textbf{Precision} & \\textbf{F1 Score} & \\textbf{Accuracy} \\\\\\hline \n')
    
            for toolname in used_toolnames:
                outfile.write(f'{displayed_name[toolname]}&\n')
    
                nPort = len(results['total'][toolname]['unimplemented'])
                nFail = len(results['total'][toolname]['failure']) + len(results['total'][toolname]['other'])
                nTout = len(results['total'][toolname]['timeout'])
                TP = len(results['total'][toolname]['TRUE_POS'])
                TN = len(results['total'][toolname]['TRUE_NEG'])
                FN = len(results['total'][toolname]['FALSE_NEG'])
                FP = len(results['total'][toolname]['FALSE_POS'])
    
                total = TP + TN + FP + FN + nTout + nPort + nFail
    
                # Coverage & Completion
                outfile.write(
                    f'{percent(nPort, total, compl=True, one=True)} &{percent((nTout + nFail + nPort), (total), compl=True, one=True)}&')
                # Specificity: recognized {TN} correct codes out of {TN+FP}
                outfile.write(f'{percent(TN, (TN + FP), one=True)}&')
                # Recall: found {TP} errors out of {TP+FN}; Precision: {TP} error diagnostics are correct out of {TP+FP}
                outfile.write(f'{percent(TP, (TP + FN), one=True)} & {percent(TP, (TP + FP), one=True)} &')
                # F1 Score
                if TP + FP > 0 and TP + FN > 0:
                    precision = TP / (TP + FP)
                    recall = TP / (TP + FN)
                    outfile.write(f'{percent(2 * precision * recall, (precision + recall), one=True)}&')
                else:
                    outfile.write('(error)&')
                # Accuracy: {TP+TN} correct diagnostics out of all {TP+TN+FP+FN+nTout+nFail+nPort} tests
                outfile.write(f'{percent(TP + TN, (TP + TN + FP + FN + nTout + nFail + nPort), one=True)}')
                outfile.write(f'\\\\\\hline\n')
    
            outfile.write(
                "\\hline\n\\textit{Ideal tool}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}\\\\\\hline\n")
    
            outfile.write('\\end{tabular}\n')
    
        # Produce the timing results
        with open(f'{rootdir}/latex/results-timings.tex', 'w') as outfile:
            outfile.write(f"\\begin{{tabular}}{{|c|c|*{{{len(used_toolnames)}}}{{c|}}}}\n")
            outfile.write(f"\\cline{{3-{len(used_toolnames) + 2}}}\n")
            # First title line: Tool names
            outfile.write("  \\multicolumn{2}{c|}{}")
            for t in used_toolnames:
                outfile.write(f"& {displayed_name[t]}")
            outfile.write(f"\\\\\\hline\n")
    
            def show_line(key, display_name):
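                """Write a three-row block (mean time, standard deviation, number of timeouts) for the given timing key, one column per tool."""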
                outfile.write(f"\\multirow{{3}}{{*}}{{{display_name}}} & Mean time ")
                for toolname in used_toolnames:
                    if len(timing[key][toolname]) > 1:
                        mean = statistics.mean(timing[key][toolname])
                        outfile.write(f"&{round(mean, 2)}")
                    else:
                        outfile.write("&(error)")
                        print(f"Error while computing the mean of timing[{key}][{toolname}] (needs at least one value)")
                outfile.write(f"\\\\\\cline{{2-{len(used_toolnames) + 2}}}\n")
    
                outfile.write(f"& StdDev ")
                for toolname in used_toolnames:
                    if len(timing[key][toolname]) > 2:
                        stdev = statistics.stdev(timing[key][toolname])
                        outfile.write(f"&{round(stdev, 2)}")
                    else:
                        outfile.write("&(error)")
                        print(
                            f"Error while computing the variance of timing[{key}][{toolname}] (needs at least two values)")
                outfile.write(f"\\\\\\cline{{2-{len(used_toolnames) + 2}}}\n")
    
                outfile.write(f" & \\# timout ")
                for toolname in used_toolnames:
                    tout = len(results[key][toolname]['timeout'])
                    if tout == 0:
                        tout = '-'
                    outfile.write(f"&{tout}")
                outfile.write("\\\\\\hline\n")
    
            for error in error_scope:
                if error == 'FOK':
                    outfile.write('\\hline\n')
                    show_line('error', '\\textit{All incorrect tests}')
                    title = '\\textit{All correct tests}'
                else:
                    title = f"\\makecell{{{displayed_name[error]} \\\\ ({error_scope[error]})}}"
    
                show_line(error, title)
            outfile.write('\\hline\n')
            show_line('total', '\\textbf{All tests}')
    
            outfile.write(f"\\multicolumn{{2}}{{|c|}}{{\\textbf{{Total time}}}} ")
            for toolname in used_toolnames:
                secs = sum(timing['total'][toolname])
                days = int(secs // 86400)
                hours = int((secs - days * 86400) // 3600)
                minutes = int((secs - days * 86400 - hours * 3600) // 60)
                seconds = secs - days * 86400 - hours * 3600 - minutes * 60
                outfile.write("&")
                if days > 0:
                    outfile.write(f"{days}d")
                if days > 0 or hours > 0:
                    outfile.write(f"{hours}h")
                if days > 0 or hours > 0 or minutes > 0:
                    outfile.write(f"{minutes}m")
                outfile.write(f"{int(seconds)}s")
            outfile.write(f"\\\\\\hline\n")
    
            # Last line: Tool names again
            outfile.write("  \\multicolumn{2}{c|}{}")
            for t in used_toolnames:
                outfile.write(f"& {displayed_name[t]}")
            outfile.write(f"\\\\\\cline{{3-{len(used_toolnames) + 2}}}\n")
    
            outfile.write(f"\\end{{tabular}}\n")
    
        with open(f'{rootdir}/latex/files-count.tex', 'w') as outfile:
            files_results = categorize_all_files(tools[used_toolnames[0]], used_toolnames[0], todo, args.logs_dir)
    
            error_types = {}
            error_types_tests = {}
            for error in error_scope:
                error_types[error] = 0
                error_types_tests[error] = 0
    
            # Count the number of codes per expected type of result
            for f in files_results:
                error_types[possible_details[files_results[f]['detail']]] += 1
    
            for t in todo:
                error_types_tests[possible_details[t['detail']]] += 1
    
            outfile.write("\\begin{tabular}{|l|c|c|}\n")
            outfile.write("  \\hline\n")
            outfile.write("  \\textbf{Error category} & \\textbf{Number of codes} & \\textbf{Number of tests}\\\\\n")
            outfile.write("  \\hline\n")
            for et in error_types:
                if et in ['BLocalConcurrency', 'DRace', 'DGlobalConcurrency',
                          'EBufferingHazard', 'InputHazard']:
                    outfile.write(
                        f"  \\textbf{{{displayed_name[et]}}} & \\textbf{{{error_types[et]}}}& \\textbf{{{error_types_tests[et]}}} \\\\\n")
                else:
                    outfile.write(f"  \\textit{{{displayed_name[et]}}} & {error_types[et]} & {error_types_tests[et]}\\\\\n")
    
            outfile.write("  \\hline\n")
            outfile.write(f"  \\textbf{{Total}} & {len(files_results)} & {len(todo)}\\\\\n")
            outfile.write("  \\hline\n")
            outfile.write("\\end{tabular}\n")
    
        def resultsPerCategory(suffix, hazard=False):
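            """Write latex/nd-results-per-category-portrait-<suffix>.tex; hazard=True selects the non-deterministic error categories."""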
            category = ['FOK', 'AInvalidParam', 'BResLeak', 'DMatch', 'CMatch', 'BReqLifecycle', 'BEpochLifecycle']
            if hazard:
                category = ['BLocalConcurrency', 'DGlobalConcurrency', 'DRace', 'EBufferingHazard', 'InputHazard']
    
            with open(f'{rootdir}/latex/nd-results-per-category-portrait-{suffix}.tex', 'w') as outfile:
                # files_results = categorize_all_files(tools[used_toolnames[0]], used_toolnames[0], todo)
                ext_results = {}
                best = {}
    
                # Put FOK at the first position
                last = ''
                for e in category:
                    last = e
                    best[e] = {
                        'TP': 0, 'TN': 0, 'FP': 99999, 'FN': 99999,
                        'SE': 99999,
                        'accp': 0, 'accm': 0
                    }
    
                for toolname in used_toolnames:
                    ext_results[toolname] = {}
    
                    files_results = categorize_all_files(tools[toolname], toolname, todo, args.logs_dir)
                    for error in category:
                        ext_results[toolname][error] = {
                            'TP': [], 'TN': [], 'FP': [], 'FN': [],
                            'CE': [], 'TO': [], 'RE': [], 'O': [], 'SE': [],
                            'accp': 0, 'accm': 0,
                            'total': 0
                        }
    
                        for f in files_results:
                            if possible_details[files_results[f]['detail']] == error:
                                ext_results[toolname][error][files_results[f]['result']].append(f)
                                ext_results[toolname][error]['total'] += 1
    
                        total = ext_results[toolname][error]['total']
                        # accp = round((len(ext_results[toolname][error]['TP']) + len(ext_results[toolname][error]['TN']) + len(ext_results[toolname][error]['TP'])) / total, 2)
                        # accm = round((len(ext_results[toolname][error]['TP']) + len(ext_results[toolname][error]['TN'])) / total, 2)
    
                        ext_results[toolname][error]['accp'] = 0  # accp
                        ext_results[toolname][error]['accm'] = 0  # accm
    
                    for error in category:
                        err = (len(ext_results[toolname][error]['CE'])
                               + len(ext_results[toolname][error]['TO'])
                               + len(ext_results[toolname][error]['RE'])
                               + len(ext_results[toolname][error]['O'])
                               + len(ext_results[toolname][error]['SE']))
    
                        if best[error]['SE'] > err:
                            best[error]['SE'] = err
    
                        for res in ['FP', 'FN']:
                            if best[error][res] > len(ext_results[toolname][error][res]):
                                best[error][res] = len(ext_results[toolname][error][res])
    
                        for res in ['TP', 'TN']:
                            if best[error][res] < len(ext_results[toolname][error][res]):
                                best[error][res] = len(ext_results[toolname][error][res])
    
                        for res in ['accp', 'accm']:
                            if best[error][res] < ext_results[toolname][error][res]:
                                best[error][res] = ext_results[toolname][error][res]
    
                ncol = 4 if not hazard else 6
                align = 'c|c|c|c|' if not hazard else 'c|c|c|c|c|c|'
    
                outfile.write("\\setlength\\tabcolsep{1.5pt}\n")
                outfile.write(f"\\begin{{tabular}}{{|l|*{{{len(category) - 1}}}{{ {align} |}} {align}}}\n")
                outfile.write(f"\\cline{{2- {(len(category) * ncol) + 1} }}\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    if error == last:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c|}}")
                    else:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c||}}")
    
                    outfile.write(f"{{\\it {displayed_name[error].split()[0]}}}")
    
                outfile.write("\\\\\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    if error == last:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c|}}")
                    else:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c||}}")
    
                    outfile.write(f"{{\\it {displayed_name[error].split()[1]}}}")
    
                outfile.write("\\\\\n")
                outfile.write(f"\\cline{{2- {(len(category) * ncol) + 1} }}\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    outfile.write(" & \\rotatebox{90}{SE}")
                    if error == "FOK":
                        outfile.write(" & \\rotatebox{90}{{\\bf TN}}")
                        if hazard:
                            outfile.write(" & \\rotatebox{90}{{\\bf FP}}")
                        outfile.write(" & \\rotatebox{90}{{\\bf FP}}")
                    else:
                        outfile.write(" & \\rotatebox{90}{TP}")
                        if hazard:
                            outfile.write(" & \\rotatebox{90}{TP}")
                        outfile.write(" & \\rotatebox{90}{FN}")
    
                    if hazard:
                        outfile.write(" & \\rotatebox{90}{Accuracy\\textsuperscript{+}}")
                        outfile.write(" & \\rotatebox{90}{Accuracy\\textsuperscript{-}}")
                    else:
                        outfile.write(" & \\rotatebox{90}{Accuracy}")
    
                outfile.write("\\\\\\hline\n")
    
                for toolname in used_toolnames:
                    outfile.write(f"{displayed_name[toolname]}")
    
                    for error in category:
                        disp_err = (len(ext_results[toolname][error]['CE'])
                                    + len(ext_results[toolname][error]['TO'])
                                    + len(ext_results[toolname][error]['RE'])
                                    + len(ext_results[toolname][error]['O'])
                                    + len(ext_results[toolname][error]['SE']))
    
                        if disp_err == best[error]['SE']:
                            outfile.write(f"& {{\\bf {disp_err}}}")
                        else:
                            outfile.write(f"& {disp_err}")
    
                        def format_if_best(res):
                            # Bold the count when it equals the best value across tools for this category
                            count = len(ext_results[toolname][error][res])
                            return f" & {{\\bf {count}}}" if best[error][res] == count else f" & {count}"

                        def format_if_best_2(res):
                            # Clamp the score to [0, 1] for display; bold it when it equals the best value
                            value = ext_results[toolname][error][res]
                            shown = 1 if value >= 1.0 else 0 if value <= 0.0 else value
                            return f" & {{\\bf {shown} }}" if best[error][res] == value else f" & {shown}"
    
                        if error == "FOK":
                            outfile.write(format_if_best('TN'))
                            if hazard:
                                outfile.write(format_if_best('FP'))
                            outfile.write(format_if_best('FP'))
                        else:
                            outfile.write(format_if_best('TP'))
                            if hazard:
                                outfile.write(format_if_best('TP'))
                            outfile.write(format_if_best('FN'))
    
                        if hazard:
                            outfile.write(format_if_best_2('accp'))
                            outfile.write(format_if_best_2('accm'))
                        else:
                            outfile.write(format_if_best_2('accp'))
    
                    outfile.write("\\\\\\hline\n")
    
                outfile.write("\\textit{Ideal tool}")
    
                for error in category:
                    outfile.write(" & \\textit{0}")
                    outfile.write(f" & \\textit{{ {ext_results[toolname][error]['total']} }}")
                    if hazard:
                        outfile.write(" & \\textit{0}")
                    outfile.write(" & \\textit{0}")
                    outfile.write(" & \\textit{1}")
                    if hazard:
                        outfile.write(" & \\textit{1}")
    
                outfile.write("\\\\\\hline\n")
    
                outfile.write("\\end{tabular}\n")
                outfile.write("\\setlength\\tabcolsep{6pt}")
    
        resultsPerCategory('deter', hazard=False)
        resultsPerCategory('ndeter', hazard=True)
    
        with open(f'{rootdir}/latex/reclassified-result.tex', 'w') as outfile:
            reclassified = {}
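            # A test is counted as "reclassified" when is_correct_diagnostic() judges that the tool's reported
            # diagnostic does not match the error expected for that test.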
    
            category = ['FOK']
            last = ''
            for e in error_scope:
                if e != 'FOK':
                    category.append(e)
                    last = e
            category.append('total')
    
            for toolname in used_toolnames:
                reclassified[toolname] = {}
    
                for e in category:
                    reclassified[toolname][e] = []
    
                for test in todo:
                    binary = re.sub('\\.c', '', os.path.basename(test['filename']))
                    ID = test['id']
                    test_id = f"{binary}_{ID}"
    
                    (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                              test_id=test_id, logs_dir=args.logs_dir,
                                                                              expected=test['expect'], autoclean=False)
    
                    if not tools[toolname].is_correct_diagnostic(test_id, res_category, test['expect'], test['detail']):
                        reclassified[toolname][possible_details[test['detail']]].append(test_id)
                        reclassified[toolname]['total'].append(test_id)
    
            outfile.write("\\begin{tabular}{|l|")
            for e in category:
                outfile.write("c|")
            outfile.write("}\n")
            outfile.write("  \\hline\n")
    
            # Column title
            outfile.write("  ")
            for e in category:
                if e != 'total':
                    outfile.write(f" &\\textit{{ {displayed_name[e].split()[0]} }}")
                else:
                    outfile.write(" & ")
    
            outfile.write("  \\\\\n")
    
            outfile.write("  \\textbf{Tools}")
            for e in category:
                if e != 'total':
                    outfile.write(f" &\\textit{{ {displayed_name[e].split()[1]} }}")
                else:
                    outfile.write(" & \\textbf{Total}")
    
            outfile.write("\\\\\n")
            outfile.write("  \\hline\n")
    
            # Results
            for toolname in used_toolnames:
                outfile.write(f"  {displayed_name[toolname]}")
                for e in category:
                    res = len(reclassified[toolname][e])
                    if res > 0:
                        outfile.write(f" & \\textbf{{ {res} }}")
                    else:
                        outfile.write(f" & {res}")
    
                outfile.write("  \\\\\\hline\n")
    
            outfile.write("\\end{tabular}\n")
    
        files = get_C_files_from_dir(f"{rootdir}/scripts/gencodes/")
    
        generate_errors(files, f"{rootdir}/latex/errors.tex")
        generate_labels(files, f"{rootdir}/latex/labels.tex")
        generate_features(files, f"{rootdir}/latex/features.tex")
    
        os.chdir(here)
    
    
    def get_overview_plot(data, outpath, scrutiny="base"):
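        """Plot, per tool, the stacked outcome counts (TP/TN/FP/FN/RE/CE) for the P2P, Collective, RMA and
        Total categories, and save the figure as overview_per_cat.pdf in outpath."""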
        assert len(data) > 0
        # Get the column names, i.e. all the metrics that are calculated
        df_first_tool = next(iter(data.values()))
        df_first_tool["TP"] = df_first_tool[f"TP_{scrutiny}"]
        df_first_tool = classify_tests(df_first_tool)
        df_first_tool = aggregate_metrics_per_category(df_first_tool)
        cols = df_first_tool.columns
    
        df_coll = pd.DataFrame(columns=cols)
        df_rma = pd.DataFrame(columns=cols)
        df_p2p = pd.DataFrame(columns=cols)
        df_total = pd.DataFrame(columns=cols)
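        # These hold one row per tool: outcome counts split by communication category (P2P, collective, RMA) plus the overall total.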
    
        for toolname, df in data.items():
            df["TP"] = df[f"TP_{scrutiny}"]
            df = classify_tests(df)
            df = aggregate_metrics_per_category(df)
            df_coll.loc[toolname] = df.loc["COLL"]
            df_rma.loc[toolname] = df.loc["RMA"]
            df_p2p.loc[toolname] = df.loc["P2P"]
            df_total.loc[toolname] = df.loc["ALL"]
    
    
        SMALL_SIZE = 20
        MEDIUM_SIZE = 22
        BIGGER_SIZE = 24
    
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=SMALL_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
        plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
        fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
    
        # colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb']
        colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000']
    
        ((ax1, ax2), (ax3, ax4)) = axs
        df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False, color=colors)
        ax1.set_title('P2P')
        handles, labels = ax1.get_legend_handles_labels()
    
        df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors)
        ax2.set_title('Collective')
        ax2.yaxis.tick_right()
        # Set the y-axis labels to uppercase
        ax2.set_yticklabels([label.get_text().upper() for label in ax2.get_yticklabels()])
    
        df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors)
        ax3.set_title('RMA')
    
        df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors)
        ax4.set_title('Total')
        ax4.yaxis.tick_right()
    
        for ax in [ax1, ax2, ax3, ax4]:
            ax.set_ylabel('')
            # Set the y-axis labels to uppercase
            ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()])
    
        fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05))
    
        plt.tight_layout()
    
        plt.savefig(os.path.join(outpath, "overview_per_cat.pdf"), bbox_inches="tight")
    
    
    ########################
    # cmd_plots(): what to do when '-c plots' is used (extract the statistics of this tool)
    ########################
    
    def cmd_plots(rootdir, toolnames, ext="pdf"):
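        """Produce the plot images from the cached tool reports: one helpfulness plot per tool plus the overview plot."""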
        here = os.getcwd()
        os.chdir(rootdir)
        os.makedirs('plots', exist_ok=True)
        outpath = f'{rootdir}/plots/'
    
        collected_data = {}
        for toolname in toolnames:
            df = read_tool_reports(rootdir, toolname)
            plot_helpfulness(df, outpath, toolname)
            collected_data[toolname] = df
    
        get_overview_plot(collected_data, outpath)
    
        os.chdir(here)
    
    
    ########################
    # Main script argument parsing
    ########################
    
    parser = argparse.ArgumentParser(
        description='This runner provides a bridge between an MPI compiler/executor and a test written with the MPI bugs collection header, and compares the actual result to the expected one.')
    
    parser.add_argument('-c', metavar='cmd', default='all',
                        help="The command you want to execute. By default, 'all' runs the 'run' and 'html' commands in sequence. Other choices:\n"
                             "  generate: redo all the test codes.\n"
                             "  build: build the selected tools.\n"
                             "  run: run the tests on all codes.\n"
                             "  csv: produce the CSV output, using the cached values from a previous 'run'.\n"
                             "  latex: produce the LaTeX tables we need for the article, using the cached values from a previous 'run'.\n"
                             "  html: produce the HTML statistics, using the cached values from a previous 'run'.\n"
                             "  plots: produce the plot images, using the cached values from a previous 'run'.\n")
    
    parser.add_argument('-x', metavar='tool', default='mpirun',
                        help='the tool(s) you want at execution time: a comma-separated list among [aislinn, civl, hermes, isp, itac, mpisv, must, simgrid, smpi, smpivg, parcoach, mpi-checker]')
    
    parser.add_argument('-t', '--timeout', metavar='int', default=300, type=int,
                        help='timeout value at execution time, given in seconds (default: %(default)s)')
    
    parser.add_argument('-n', '--nworkers', metavar='int', default=1, type=int,
                        help='size of the pool of workers that execute the tests in parallel (default: 1)')
    
    parser.add_argument('-l', '--logs-dir', metavar='path', default="logs", type=pathlib.Path,
                        help='path to output directory of logs (default: $PWD/logs)')
    
    parser.add_argument('-g', '--gencodes', metavar='path', default="gencodes", type=pathlib.Path,
                        help='path to directory of source files (default: gencodes)')
    
    parser.add_argument('-lev', '--level', metavar='int', default=2, type=int,
                        help='Generation level to generate codes (default: 2)')
    
    parser.add_argument('-b', metavar='batch', default='1/1',
                        help="Limits the test executions to the batch #N out of M batches (Syntax: 'N/M'). To get 3 runners, use 1/3 2/3 3/3")
    
    parser.add_argument('-f', metavar='format', default='pdf',
                        help="Format of output images [pdf, svg, png, ...] (only for 'plots' command)")
    
    parser.add_argument('-v', '--verbose', action="store_const", dest="loglevel", const=logging.DEBUG, default=logging.INFO)
    
    args = parser.parse_args()
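    # Illustrative invocations:
    #   python3 MBB.py -c generate                     # regenerate the test codes
    #   python3 MBB.py -c run -x must -t 300 -n 4      # run all tests with MUST (4 workers, 300s timeout)
    #   python3 MBB.py -c html -x must,itac,parcoach   # build the HTML report from cached logs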
    rootdir = os.path.dirname(os.path.abspath(__file__))
    
    # Parameter checking: Did we get a valid tool to use?
    arg_tools = []
    if args.c == 'all' or args.c == 'run':
        if args.x == 'mpirun':
            raise Exception(
                "No tool was provided, please retry with -x parameter. (see -h for further information on usage)")
        elif args.x in tools:
            arg_tools = [args.x]
        elif ',' in args.x:
            for x in args.x.split(','):
                if x not in tools:
                    raise Exception(f"The tool parameter you provided ({x}) is either incorect or not yet implemented.")
                arg_tools.append(x)
        else:
            raise Exception(f"The tool parameter you provided ({args.x}) is either incorect or not yet implemented.")
    elif ',' in args.x:
        for x in args.x.split(','):
            if x not in tools:
                raise Exception(f"The tool parameter you provided ({x}) is either incorect or not yet implemented.")
        arg_tools = args.x.split(',')
    else:
        arg_tools = [args.x]
    
    print(f'arg_tools: {arg_tools}')
    
    if args.c == 'all':
        extract_all_todo(args.b)
        cmd_run(rootdir=rootdir, toolname=args.x, batchinfo=args.b)
        cmd_html(rootdir, toolnames=arg_tools)
    elif args.c == 'generate':
        if args.level:
            cmd_gencodes(level=args.level)
        else:
            cmd_gencodes(level=2)
    elif args.c == 'build':
        for t in arg_tools:
            cmd_build(rootdir=rootdir, toolname=t)
    elif args.c == 'run':
        extract_all_todo(args.b)
        for t in arg_tools:
            cmd_run(rootdir=rootdir, toolname=t, batchinfo=args.b)
    elif args.c == 'latex':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        # 'smpi','smpivg' are not shown in the paper
        # cmd_latex(rootdir, toolnames=['aislinn', 'civl', 'isp','itac', 'simgrid', 'mpisv', 'must', 'hermes', 'parcoach', 'mpi-checker'])
        cmd_latex(rootdir, toolnames=['itac', 'must', 'parcoach'])
    elif args.c == 'csv':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if arg_tools:
            cmd_csv(rootdir, toolnames=arg_tools)
        else:
            cmd_csv(rootdir, toolnames=['itac', 'must', 'parcoach'])
    elif args.c == 'html':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if args.x == 'mpirun':
            # toolnames=['itac', 'simgrid','must', 'smpi', 'smpivg', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker']
            toolnames = ['itac', 'must', 'parcoach']
        else:
            toolnames = arg_tools
        # Build SVG plots
        # if plots_loaded:
        #     cmd_plots(rootdir, toolnames=toolnames, ext="svg")
        # Build HTML page
        cmd_html(rootdir, toolnames=toolnames)
    elif args.c == 'plots':
        if not plots_loaded:
            print("[MBB] Error: Dependancies ('numpy' or 'matplotlib') are not available!")
            exit(-1)
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if args.x == 'mpirun':
            # toolnames=['itac', 'simgrid', 'must', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker']
            toolnames = ['itac', 'must', 'parcoach']
        else:
            toolnames = arg_tools
    
        cmd_plots(rootdir, toolnames=toolnames, ext=args.f)
    else:
        print(
            # We should remove latex and plots and update generate
            f"Invalid command '{args.c}'. Please choose one of 'all', 'generate', 'build', 'run', 'csv', 'html', 'latex' or 'plots'")
        sys.exit(1)