    #! /usr/bin/python3
    
    # autopep8 -i --max-line-length 130 MBB.py
    
    import shutil
    import os
    import signal
    import sys
    import stat
    import re
    import argparse
    import time
    import glob
    import subprocess
    import statistics
    import multiprocessing as mp
    import pathlib
    import logging
    import pandas as pd
    
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    
    from scripts.MBButils import categorize
    
    mpl.rcParams['hatch.linewidth'] = 4.5  # hatch linewidth
    
    # Add our lib directory to the PYTHONPATH, and load our utility libraries
    sys.path.append(f'{os.path.dirname(os.path.abspath(__file__))}/scripts')
    
    from MBButils import *
    from LaTeXutils import *
    
    # Plots need big dependencies like numpy and matplotlib, so just skip
    # the import if they are not available.
    plots_loaded = False
    try:
        from tools.gen_plots_radar import *
    
        plots_loaded = True
    except ImportError:
        print("[MBB] Warning: ImportError for the plots module.")
    
    import tools.parcoach
    import tools.simgrid
    import tools.smpi  # SimGrid without MC
    import tools.smpivg  # SimGrid with valgrind instead of MC
    import tools.must
    import tools.mpisv
    import tools.hermes
    import tools.isp
    
    itac_loaded = False
    try:
        import tools.itac
    
        itac_loaded = True
    except ImportError:
        print(
            "[MBB] Warning: ITAC module cannot be loaded because of an ImportError (that's OK if you did not plan to use it).")
    import tools.civl
    import tools.aislinn
    import tools.mpi_checker
    
    tools = {'aislinn': tools.aislinn.Tool(), 'civl': tools.civl.Tool(), 'hermes': tools.hermes.Tool(),
             'isp': tools.isp.Tool(), 'mpisv': tools.mpisv.Tool(),
             'itac': tools.itac.Tool() if itac_loaded else None,
             'must': tools.must.V18(),  # 'must17': tools.must.V17(), # This one is deprecated, and no RC release right now
             'simgrid': tools.simgrid.Tool(), 'simgrid-3.27': tools.simgrid.v3_27(), 'simgrid-3.28': tools.simgrid.v3_28(),
             'simgrid-3.29': tools.simgrid.v3_29(), 'simgrid-3.30': tools.simgrid.v3_30(),
             'simgrid-3.31': tools.simgrid.v3_31(), 'simgrid-3.32': tools.simgrid.v3_32(),
             'smpi': tools.smpi.Tool(), 'smpivg': tools.smpivg.Tool(), 'parcoach': tools.parcoach.Tool(),
             'mpi-checker': tools.mpi_checker.Tool()}
    
    # Some scripts may fail if error messages get translated
    os.environ["LC_ALL"] = "C"
    
    # BufferLength/BufferOverlap
    # RMA concurrency errors (local and distributed)
    
    ########################
    # Extract the TODOs from the codes
    ########################
    todo = []
    
    
    def extract_all_todo(batch):
        """Extract the TODOs from all existing files, applying the batching request"""
        if os.path.exists(f"/MBB/scripts/{args.gencodes}/"):  # Docker run
            filenames = glob.glob(f"/MBB/scripts/{args.gencodes}/**/*.c")
        elif os.path.exists(f"{args.gencodes}/"):  # Gitlab-ci run
            filenames = glob.glob(f"{os.getcwd()}/{args.gencodes}/*.c")  # our code expects absolute paths
        elif os.path.exists(f"../../{args.gencodes}/"):  # Local runs
            filenames = glob.glob(f"{os.getcwd()}/../../{args.gencodes}/*.c")  # our code expects absolute paths
        else:
            subprocess.run("ls ../..", shell=True)
            raise Exception(
                f"Cannot find the input codes (cwd: {os.getcwd()}). Did you run the original_MBI_generators before running the tests?")
        # Choose the files that will be used by this runner, depending on the -b argument
        match = re.match(r'(\d+)/(\d+)', batch)
        if not match:
            raise Exception(
                f"The parameter to the batch option ({batch}) is invalid. It must be of the form 'N/M', with N and M integers.")
        pos = int(match.group(1))
        runner_count = int(match.group(2))
        assert pos > 0
        assert pos <= runner_count
        batch = int(len(filenames) / runner_count) + 1
        min_rank = batch * (pos - 1)
        max_rank = (batch * pos) - 1
        print(f'Handling files from #{min_rank} to #{max_rank}, out of {len(filenames)} in {os.getcwd()}')
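        # Illustration (hypothetical numbers): with 100 files and -b 2/4, batch is 26,
        # so this runner handles the sorted files #26 to #51 (inclusive).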
    
        global todo
        filenames = sorted(filenames)
        for filename in filenames[min_rank:max_rank + 1]:  # max_rank is inclusive, as printed above
            todo = todo + parse_one_code(filename)
        if pos == runner_count and pos != 1:  # The last runner starts from the end of the array to ease dynamic splitting
            todo = list(reversed(todo))
    
    
    def extract_all_todo_from_logdir(tool, logdir):
        """Extract the TODOs from the given logdir"""
        if os.path.exists(logdir):
            filenames = glob.glob(f"{logdir}/{tool}/*.c")
        else:
            raise Exception(
                f"Cannot find the input codes ({logdir}). Did you run the original_MBI_generators before running the tests?")
    
        global todo
        filenames = sorted(filenames)
        for filename in filenames:
            todo = todo + parse_one_code(filename)
        print(todo)
    
    
    ########################
    # cmd_gencodes(): what to do when '-c generate -lev <1|2|>' is used (Generating the codes)
    ########################
    
    def cmd_gencodes(level):
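        """Generate the benchmark codes at the requested level (used by '-c generate')."""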
        here = os.getcwd()
        if os.path.exists("/MBB/scripts/errors/"):  
            #level = 2 
            print(f"Generate level {level}")
            subprocess.run(f'tar -xf real_world_data.csv.tar.gz', shell=True, check=True)
            subprocess.run(f'cd scripts && python3 generate.py --generator_dir errors --level {level} --real_world_data ../output.csv --remove_previous_generation_results', shell=True, check=True)
        else:
            raise Exception("Cannot find the generators. Please report that bug.")
        os.chdir(here)
    
    
    
       # if os.path.exists("/MBI/scripts/original_MBI_generators/CollArgGenerator.py"):  # Docker run
       #     print("Docker run")
       #     generators = glob.glob("/MBI/scripts/original_MBI_generators/*Generator.py")
       #     dir = "/MBI/gencodes"
       # elif os.path.exists("../../scripts/original_MBI_generators/CollArgGenerator.py"):  # Local run, from logs dir
       #     print("Local run, from tools' logs dir")
       #     generators = glob.glob(f"{os.getcwd()}/../../scripts/original_MBI_generators/*Generator.py")
       #     dir = "../../gencodes/"
       # elif os.path.exists("scripts/original_MBI_generators/CollArgGenerator.py"):  # Local run, from main dir
       #     print("Local run, from MBI main dir")
       #     generators = glob.glob(f"{os.getcwd()}/scripts/original_MBI_generators/*Generator.py")
       #     dir = "gencodes/"
       # else:
       #     raise Exception("Cannot find the codes' original_MBI_generators. Please report that bug.")
       # subprocess.run(f"rm -rf {dir} ; mkdir {dir}", shell=True, check=True)
       # here = os.getcwd()
       # os.chdir(dir)
       # print(f"Generate the codes (in {os.getcwd()}): ", end='')
       # for generator in generators:
       #     m = re.match("^.*?/([^/]*)Generator.py$", generator)
       #     if m:
       #         print(m.group(1), end=", ")
       #     else:
       #         print(generator, end=", ")
       #     subprocess.run(f'../scripts/ensure_python3 {generator}', shell=True, check=True)
       # print("\nTest count: ", end='')
       # sys.stdout.flush()
       # subprocess.run("ls *.c|wc -l", shell=True, check=True)
       # subprocess.run("for n in *.c ; do cat -n $n > $n.txt ; done", shell=True, check=True)
       # os.chdir(here)
    
    
    ########################
    # cmd_build(): what to do when '-c build' is used (building the tool, discarding the cache)
    ########################
    def cmd_build(rootdir, toolname):
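        """Build the given tool from scratch, ignoring any cached build (used by '-c build')."""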
        # Basic verification
        tools[toolname].ensure_image()
    
        # Build the tool if needed
        tools[toolname].build(rootdir=rootdir, cached=False)
    
    
    ########################
    # cmd_run(): what to do when '-c run' is used (running the tests)
    ########################
    def cmd_run(rootdir, toolname, batchinfo):
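        """Run all tests of the current batch with the given tool, using a pool of worker processes (used by '-c run')."""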
        # Go to the tool's logs directory if needed
        rootdir = os.path.dirname(os.path.abspath(__file__))
        os.makedirs(f'{rootdir}/{args.logs_dir}/{toolname}', exist_ok=True)
        os.chdir(f'{rootdir}/{args.logs_dir}/{toolname}')
        print(f"Run tool {toolname} from {os.getcwd()} (batch {batchinfo}).")
    
        tools[toolname].set_rootdir(rootdir)
    
        # Basic verification
        tools[toolname].ensure_image()
    
        # Build the tool if needed
        tools[toolname].build(rootdir=rootdir)
    
        # build list of test executions for run function
        work_items = []
        for number, test in enumerate(todo):
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            work_items.append(
                (test['cmd'], test['filename'], binary, test['id'], number, args.timeout, batchinfo, args.loglevel))
    
        with mp.Pool(args.nworkers) as pool:
            pool.starmap(tools[toolname].run, work_items)
        # for test in todo:
        #     binary = re.sub('\.c', '', os.path.basename(test['filename']))
    
        #     print(f"\nTest #{count} out of {len(todo)}: '{binary}_{test['id']} '", end="... ")
        #     count += 1
        #     sys.stdout.flush()
    
        #     p = mp.Process(target=tools[toolname].run, args=(test['cmd'], test['filename'], binary, test['id'], args.timeout, batchinfo))
        #     with mp.Pool(5) as pool:
        #         pool.starmap()
        #     p.start()
        #     sys.stdout.flush()
        #     p.join(args.timeout+60)
        #     if p.is_alive():
        #         print("HARD TIMEOUT! The child process failed to timeout by itself. Sorry for the output.")
        #         p.terminate()
    
        tools[toolname].teardown()
    
    
    ########################
    # cmd_html(): what to do when '-c html' is used (extract the statistics of this tool)
    ########################
    def percent(num, den, compl=False, one=False, digits=4):
        """Returns the ratio of num/den as a percentage, rounded to N digits only (default: 4). If one=True, then return a ratio of 1 with 4 digits"""
        if den == 0:
            return "(error)"
        elif compl:  # Complementary
            res = round(100 - num / den * 100, digits - 2)
        else:
            res = round(num / den * 100, 2)
        if int(res) == 100:
            return "1" if one else "100"
        return round(res / 100, digits) if one else res
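    # Illustrative values: percent(857, 1000) -> 85.7; percent(857, 1000, one=True) -> 0.857;
    # percent(1, 4, compl=True) -> 75.0; percent(1, 0) -> "(error)"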
    
    
    def bold_if(val, target):
        """Returns the value as a bold LaTeX string if it equals the target, or unchanged otherwise."""
        if str(val) == str(target):
            return f'{{\\bf {val}}}'
        return str(val)
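    # Illustrative values: bold_if(5, 5) -> '{\bf 5}'; bold_if(4, 5) -> '4'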
    
    
    def seconds2human(secs):
        """Returns the amount of seconds in human-friendly way"""
        days = int(secs // 86400)
        hours = int((secs - days * 86400) // 3600)
        minutes = int((secs - days * 86400 - hours * 3600) // 60)
        seconds = secs - days * 86400 - hours * 3600 - minutes * 60
        return (f"{days} days, " if days else "") + (f"{hours} hours, " if hours else "") + (
            f"{minutes} minutes, " if minutes else "") + (f"{int(seconds * 100) / 100} seconds" if seconds else "")
    
    
    def cmd_html(rootdir, toolnames=[]):
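        """Generate index.html and summary.html in rootdir from the cached logs of the given tools, and print per-tool statistics (used by '-c html')."""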
        here = os.getcwd()
        os.chdir(rootdir)
        results = {}
        total_elapsed = {}
        used_toolnames = []
        for toolname in toolnames:
            if toolname not in tools:
                raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
            if os.path.exists(f'{args.logs_dir}/{toolname}'):
                used_toolnames.append(toolname)
                # To compute statistics on the performance of this tool
                results[toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [], 'TRUE_NEG': [],
                                     'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': []}
    
                # To compute timing statistics
                total_elapsed[toolname] = 0
    
        ########################
        # Analyse each test, grouped by expectation, and all tools for a given test
        ########################
        with open(f"{rootdir}/index.html", "w") as outHTML:
            outHTML.write("""
    <html><head><title>MBI results</title></head>
    <style>
    iframe {
      resize: both;
      overflow: auto;
    }
    </style>
    <body>
    <iframe width="100%" height="45%" src="summary.html"></iframe>
    <iframe width="100%" height="55%" name="MBB_details"></iframe>
    </body></html>
    """)
    
        with open(f"{rootdir}/summary.html", "w") as outHTML:
            outHTML.write(f"<html><head><title>MBB outcomes for all tests</title></head>\n")
            outHTML.write("""
    <style>
    .tooltip {
      position: relative;
      display: inline-block;
      border-bottom: 1px dotted black; /* If you want dots under the hoverable text */
    }
    
    .tooltip .tooltiptext {
      visibility: hidden;
      width: 120px;
      background-color: #555;
      color: #fff;
      text-align: center;
      border-radius: 6px;
      padding: 5px 0;
      position: absolute;
      z-index: 1;
      bottom: 125%;
      left: 50%;
      margin-left: -60px;
      opacity: 0;
      transition: opacity 0.3s;
    }
    
    .tooltip .tooltiptext::after {
      content: "";
      position: absolute;
      top: 100%;
      left: 50%;
      margin-left: -5px;
      border-width: 5px;
      border-style: solid;
      border-color: #555 transparent transparent transparent;
    }
    
    .tooltip:hover .tooltiptext {
      visibility: visible;
      opacity: 1;
    }
    </style>
    <body>
    """)
    
            # Generate the table of contents
            previous_detail = ''  # To open a new section for each possible detailed outcome
            outHTML.write("<h2>Table of contents</h2>\n<ul>\n")
            for test in sorted(todo,
                               key=lambda t: f"{possible_details[t['detail']]}|{t['detail']}|{t['filename']}|{t['id']}"):
                if previous_detail != possible_details[test['detail']]:
                    if previous_detail != '':  # Close the previous item, if we are not generating the first one
                        outHTML.write(f" </li>\n")
                    previous_detail = possible_details[test['detail']]
                    if test['detail'] != 'OK':
                        outHTML.write(
                            f" <li><a href='#{possible_details[test['detail']]}'>{displayed_name[possible_details[test['detail']]]}</a> (scope: {error_scope[possible_details[test['detail']]]})\n")
                    else:
                        outHTML.write(f" <li><a href='#OK'>{displayed_name[possible_details[test['detail']]]}</a>\n")
    
            outHTML.write("  </ul>\n <li><a href='#metrics'>Summary metrics</a></li></ul>\n")
    
            # Generate the actual content
            previous_detail = ''  # To open a new section for each possible detailed outcome
            testcount = 0  # To repeat the table header every 25 lines
            for test in sorted(todo,
                               key=lambda t: f"{possible_details[t['detail']]}|{t['detail']}|{t['filename']}|{t['id']}"):
                testcount += 1
                if previous_detail != possible_details[test['detail']] or testcount == 25:
                    if testcount != 25:  # Write the expected outcome only once, not every 25 tests
                        if previous_detail != '':  # Close the previous table, if we are not generating the first one
                            outHTML.write(f"</table>\n")
                        previous_detail = possible_details[test['detail']]
                        if test['detail'] != 'OK':
                            outHTML.write(
                                f"  <a name='{possible_details[test['detail']]}'/><h3>{displayed_name[possible_details[test['detail']]]} errors (scope: {error_scope[possible_details[test['detail']]]})</h3>\n")
                        else:
                            outHTML.write(f"  <a name='OK'/><h3>Correct codes</h3>\n")
    
                        outHTML.write('  <table border=1>\n')
                    testcount = 0
                    outHTML.write("   <tr><td>Test</td>")
                    for toolname in used_toolnames:
                        outHTML.write(f"<td>&nbsp;{displayed_name[toolname]}&nbsp;</td>")
                    outHTML.write(f"</tr>\n")
                outHTML.write(f"     <tr>")
    
                binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
                ID = test['id']
                test_id = f"{binary}_{ID}"
                expected = test['expect']
    
                outHTML.write(
                    f"<td><a href='{test['filename']}' target='MBB_details'>{binary}</a>&nbsp;<a href='{test['filename']}'><img title='Download source' src='img/html.svg' height='24' /></a>")
                if ID != 0:
                    outHTML.write(f' (test {ID + 1}) ')
                outHTML.write("</td>")
    
                for toolname in used_toolnames:
                    (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                              test_id=test_id, logs_dir=args.logs_dir,
                                                                              expected=expected, autoclean=True)
    
                    results[toolname][res_category].append(f"{test_id} expected {test['detail']}, outcome: {diagnostic}")
                    outHTML.write(
                        f"<td align='center'><a href='{args.logs_dir}/{toolname}/{test_id}.txt' target='MBB_details'><img title='{displayed_name[toolname]} {diagnostic} (returned {outcome})' src='img/{res_category}.svg' width='24' /></a> ({outcome})")
                    extra = None
    
                    report = []
                    for root, dirs, files in os.walk(f"{args.logs_dir}/{toolname}/{test_id}"):
                        if "index.html" in files:
                            report.append(os.path.join(root, "index.html"))
    
                    if len(report) > 0:
                        extra = f'{args.logs_dir}/' + report[0].split(f'{args.logs_dir}/')[1]
                    if os.path.exists(f'{args.logs_dir}/{toolname}/{test_id}.html'):
                        extra = f'{args.logs_dir}/{toolname}/{test_id}.html'
                    if os.path.exists(f'{args.logs_dir}/{toolname}/{test_id}-klee-out'):  # MPI-SV
                        extra = f'{args.logs_dir}/{toolname}/{test_id}-klee-out'
    
                    if extra is not None:
                        outHTML.write(
                            f"&nbsp;<a href='{extra}' target='MBB_details'><img title='more info' src='img/html.svg' height='24' /></a>")
                    outHTML.write("</td>")
    
                    if res_category != 'timeout' and elapsed is not None:
                        total_elapsed[toolname] += float(elapsed)
    
                    if len(used_toolnames) == 1:
                        print(f"Test '{test_id}' result: {res_category}: {diagnostic}. Elapsed: {elapsed} sec")
    
                    np = re.search(r"(?:-np) [0-9]+", test['cmd'])
                    np = int(re.sub(r"-np ", "", np.group(0)))
    
                outHTML.write(f"</tr>\n")
            outHTML.write(f"</table>\n")
    
            # Display summary metrics for each tool
            def tool_stats(toolname):
                return (
                    len(results[toolname]['TRUE_POS']), len(results[toolname]['TRUE_NEG']),
                    len(results[toolname]['FALSE_POS']),
                    len(results[toolname]['FALSE_NEG']), len(results[toolname]['unimplemented']),
                    len(results[toolname]['failure']), len(results[toolname]['timeout']), len(results[toolname]['other']))
    
            outHTML.write("\n<a name='metrics'/><h2>Metrics</h2><table border=1>\n<tr><td/>\n")
            for toolname in used_toolnames:
                outHTML.write(f"<td>{displayed_name[toolname]}</td>")
    
            outHTML.write("</tr>\n<tr><td>API coverage</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                total = TP + TN + FP + FN + nTout + nPort + nFail + nNocc
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(nPort, total, compl=True)}% <span class='tooltiptext'>{nPort} unimplemented calls, {nNocc} inconclusive runs out of {total}</span></div></td>")
    
            outHTML.write("</tr>\n<tr><td>Robustness</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                totalPort = TP + TN + FP + FN + nTout + nFail
                outHTML.write(
                    f"<td><div class='tooltip'>{percent((nTout + nFail), (totalPort), compl=True)}% <span class='tooltiptext'>{nTout} timeouts, {nFail} failures out of {totalPort}</span></div></td>")
    
            outHTML.write("</tr>\n<tr><td>Recall</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TP, (TP + FN))}% <span class='tooltiptext'>found {TP} errors out of {TP + FN}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Specificity</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TN, (TN + FP))}%  <span class='tooltiptext'>recognized {TN} correct codes out of {TN + FP}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Precision</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent(TP, (TP + FP))}% <span class='tooltiptext'>{TP} error diagnostics are correct out of {TP + FP}</span></div></td>")
            outHTML.write("</tr>\n<tr><td>Accuracy</td>")
            for toolname in used_toolnames:
                (TP, TN, FP, FN, nPort, nFail, nTout, nNocc) = tool_stats(toolname)
                outHTML.write(
                    f"<td><div class='tooltip'>{percent((TP + TN), (TP + TN + FP + FN))}% <span class='tooltiptext'>{TP + TN} correct diagnostics in total, out of {TP + TN + FP + FN} diagnostics</span></div></td>")
            outHTML.write("</tr></table>")
            outHTML.write(
                "<p>Hover over the values for details. API coverage issues, timeouts and failures are not considered when computing the other metrics, thus differences in the total amount of tests.</p>")
    
            # Add the generated radar plots
            if plots_loaded:
                for toolname in used_toolnames:
                    outHTML.write(
                        f'<img src="plots/ext_radar_all_{toolname}.svg" alt="Radar plot for all error types for the {displayed_name[toolname]} tool." />')
    
            outHTML.write(f"</body></html>\n")
    
        ########################
        # Per tool statistics summary
        ########################
        for toolname in used_toolnames:
            TP = len(results[toolname]['TRUE_POS'])
            TN = len(results[toolname]['TRUE_NEG'])
            FP = len(results[toolname]['FALSE_POS'])
            FN = len(results[toolname]['FALSE_NEG'])
            nPort = len(results[toolname]['unimplemented'])
            nFail = len(results[toolname]['failure'])
            other = len(results[toolname]['other'])
            nTout = len(results[toolname]['timeout'])
            passed = TP + TN
            total = passed + FP + FN + nTout + nPort + nFail + other
    
            print(f"XXXXXXXXX Final results for {toolname}")
            if FP > 0:
                print(f"XXX {FP} false positives")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['FALSE_POS']:
                        print(f"  {p}")
            if FN > 0:
                print(f"XXX {FN} false negatives")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['FALSE_NEG']:
                        print(f"  {p}")
            if nTout > 0:
                print(f"XXX {nTout} timeouts")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['timeout']:
                        print(f"  {p}")
            if nPort > 0:
                print(f"XXX {nPort} API coverage issues")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['unimplemented']:
                        print(f"  {p}")
            if nFail > 0:
                print(f"XXX {nFail} tool failures")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['failure']:
                        print(f"  {p}")
            if other > 0:
                print(f"XXX {nFail} inconclusive runs (output parsing failure)")
                if len(used_toolnames) == 1:
                    for p in results[toolname]['other']:
                        print(f"  {p}")
    
            print(f"\nXXXX Summary for {toolname} XXXX  {passed} test{'' if passed == 1 else 's'} passed (out of {total})")
            print(f"\nFP = {FP}  FN = {FN}  TP = {TP}  TN = {TN}")
            print(f"\nCE = {nPort}  TO = {nTout}  RE = {nFail}")
            print(f"API coverage: {percent(nPort, total, compl=True)}% ({nPort} tests failed out of {total})")
            print(
                f"Robustness: {percent((nTout + nFail), (total - nPort), compl=True)}% ({nTout} timeouts and {nFail} failures out of {total - nPort})\n")
    
            print(f"Recall: {percent(TP, (TP + FN))}% (found {TP} errors out of {TP + FN})")
            print(f"Specificity: {percent(TN, (TN + FP))}% (recognized {TN} correct codes out of {TN + FP})")
            print(f"Precision: {percent(TP, (TP + FP))}% ({TP} diagnostic of error are correct out of {TP + FP})")
            print(
                f"Accuracy: {percent((TP + TN), (TP + TN + FP + FN))}% ({TP + TN} correct diagnostics in total, out of {TP + TN + FP + FN} diagnostics)")
            print(
                f"\nTotal time of {toolname} for all tests (not counting the timeouts): {seconds2human(total_elapsed[toolname])} ({total_elapsed[toolname]} seconds)")
    
        os.chdir(here)
    
    
    # expects a df with at least ["ERROR_EXPECTED","any_error_reported","TP","category"]
    # classifies as FN,FP,TN,...
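    # For instance, a row with ERROR_EXPECTED=True, any_error_reported=False, CE=False and RE=False
    # is counted as a false negative (FN).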
    def classify_tests(df_in):
        df = df_in[["test_id", "ERROR_EXPECTED", "any_error_reported", "category", "CE", "RE", "TP"]].copy()
    
        df["TN"] = (df["ERROR_EXPECTED"] == False) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
                    df["RE"] == False)
        df["FN"] = (df["ERROR_EXPECTED"] == True) & (df["any_error_reported"] == False) & (df["CE"] == False) & (
                    df["RE"] == False)
        df["FP"] = (((df["ERROR_EXPECTED"] == False) & df["any_error_reported"]) |  # a true false positive
                    # or a case where a not-helpful report is produced
                    ((df["ERROR_EXPECTED"] == True) & df["any_error_reported"] & (df["TP"] == False))) & (
                               df["CE"] == False) & (df["RE"] == False)
    
        # so that this information is available per category
        df["ERROR_NOT_EXPECTED"] = (df["ERROR_EXPECTED"] == False)
    
        # every case is exactly one of these
        assert df["TP"].sum() + df["FP"].sum() + df["TN"].sum() + df["FN"].sum() + df["CE"].sum() + df["RE"].sum() == len(
            df)
        assert df["ERROR_EXPECTED"].sum() + df["ERROR_NOT_EXPECTED"].sum() == len(df)
    
        return df
    
    
    # aggregate metrics and calculate precision, recall and F1 based on this
    def aggregate_metrics_per_category(df_in):
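        """Sum the per-test booleans per category (plus an ALL row) and derive coverage, conclusiveness, specificity, recall, precision, F1 and overall accuracy."""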
        total_tests = len(df_in)
        df = df_in.groupby(["category"]).sum()
        df.loc["ALL"] = df.sum(axis=0)
    
        df["recall"] = df["TP"] / (df["ERROR_EXPECTED"])
        df["precision"] = df["TP"] / (df["TP"] + df["FP"])
        df["specificity"] = df["TN"] / (df["ERROR_NOT_EXPECTED"])
        df["overallaccuracy"] = (df["TP"] + df["TN"]) / total_tests
        df["coverage"] = 1 - (df["CE"]) / total_tests
        df["conclusiveness"] = 1 - ((df["CE"] + df["RE"]) / total_tests)
        df["f1"] = (df["TP"] + df["TP"]) / (df["TP"] + df["TP"] + df["FP"] + df["FN"])
    
        return df[
            ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall", "precision", "f1",
             "overallaccuracy"]]
    
    
    def read_tool_reports(rootdir, toolname):
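        """Categorize the cached logs of one tool into a DataFrame with one row per test, adding the TP_* columns for the different notions of a helpful report."""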
        if toolname not in tools:
            raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
        if not os.path.exists(f'{args.logs_dir}/{toolname}'):
            raise Exception(f"Not found Logs for {toolname}.")
    
        results = []
    
        for test in todo:
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            ID = test['id']
            test_category = test['category']
            test_id = f"{binary}_{ID}"
            expected = test['expect']
    
            resulting_categorization = categorize(tool=tools[toolname], toolname=toolname,
                                                  test=test, test_id=test_id, logs_dir=args.logs_dir,
                                                  )
            resulting_categorization["test_id"] = test_id
            resulting_categorization["category"] = test["category"]
            results.append(resulting_categorization)
    
        df = pd.DataFrame(results)
    
        df["TP_base"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & (df["CE"] == False) & (df["RE"] == False)
        df["TP_class"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & (
                    df["CE"] == False) & (df["RE"] == False)
        df["TP_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_line_reported"] & (
                    df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df["correct_class_reported"] & df[
            "correct_line_reported"] & (df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line_no_class_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
            "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_class"]) & (
                                                         df["CE"] == False) & (df["RE"] == False)
        df["TP_class_line_no_line_noise"] = df["ERROR_EXPECTED"] & df["any_error_reported"] & df[
            "correct_class_reported"] & df["correct_line_reported"] & (~df["contains_noise_line"]) & (df["CE"] == False) & (
                                                        df["RE"] == False)
    
        return df
    
    
    def cmd_csv(rootdir, toolnames, print_to_console=False):
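        """Write the per-tool raw results, the classified/aggregated metrics (CSV and LaTeX) and the noise ratios under <rootdir>/csv/."""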
        here = os.getcwd()
        os.chdir(rootdir)
        outpath = f'{rootdir}/csv/'
    
        # Create directory for output if not present
        pathlib.Path(outpath).mkdir(parents=True, exist_ok=True)
    
        df_noise_ratio = pd.DataFrame(columns=toolnames)
        df_overall_noise_ratio = pd.DataFrame(columns=toolnames)
    
        pd.set_option('display.max_columns', 14)
    
        for toolname in toolnames:
            df = read_tool_reports(rootdir, toolname)
            df.to_csv(f'{outpath}/{toolname}_raw.csv', index=False)
            if print_to_console:
                print(f"=== {toolname} ===")
    
            # Output for each type of TP
            for colname in ["base", "class", "line", "class_line", "class_line_no_line_noise",
                            "class_line_no_class_noise"]:
                df["TP"] = df[f"TP_{colname}"]
                df_classified = classify_tests(df)
                df_classified.to_csv(f'{outpath}/{toolname}_{colname}_full.csv', index=False)
                df_result = aggregate_metrics_per_category(df_classified)
                df_result.to_csv(f'{outpath}/{toolname}_{colname}.csv', index=True)
                if print_to_console:
                    print(f"\n{colname}:")
                    print(df_result[
                              ["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
                               "precision", "f1", "overallaccuracy"]])
                    df_result[["CE", "RE", "TP", "TN", "FP", "FN", "coverage", "conclusiveness", "specificity", "recall",
                               "precision", "f1", "overallaccuracy"]].style.format(precision=2).to_latex(
                        f'{outpath}/{toolname}_{colname}.tex')
    
            df_noise_per_tool = df.groupby("category").sum()
            df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
            df_noise_per_tool.drop("other", axis=0, inplace=True)
            df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
            if print_to_console:
                print("overall_noise")
                print(df_noise_per_tool["noise_ratio"])
            df_overall_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
    
            df_copy = df.copy()
            df_copy.loc[df_copy['ERROR_EXPECTED'] == False, ['num_noise_class_line', 'num_error_reports']] = 0
            df_noise_per_tool = df_copy.groupby("category").sum()
            df_noise_per_tool.loc["ALL"] = df_noise_per_tool.sum(axis=0)
            df_noise_per_tool.drop("other", axis=0, inplace=True)
            df_noise_per_tool["noise_ratio"] = df_noise_per_tool["num_noise_line"] / df_noise_per_tool["num_error_reports"]
            if print_to_console:
                print("noise_in_cases_where_errors_are_present")
                print(df_noise_per_tool[["noise_ratio", "num_noise_class_line", "num_error_reports"]])
            df_noise_ratio[toolname] = df_noise_per_tool["noise_ratio"]
    
        df_noise_ratio.to_csv(f'{outpath}/noise.csv')
        df_overall_noise_ratio.to_csv(f'{outpath}/overall_noise_including_unexpected.csv')
    
    
    def plot_helpfulness(df, outpath, toolname):
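        """Plot, per MPI feature category, the share of helpful error reports (correct class and/or line) as a stacked horizontal bar chart saved to <outpath>/helpfulness_<toolname>_plot.pdf."""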
        SMALL_SIZE = 16
        MEDIUM_SIZE = 16
        BIGGER_SIZE = 16
    
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=BIGGER_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
        plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
        df_plot = df.groupby("category").sum()
        df_plot.loc["ALL"] = df_plot.sum(axis=0)
        df_plot.drop("other", axis=0, inplace=True)
        df_plot["TP_class"] = df_plot["TP_class"] - df_plot["TP_class_line"]
        df_plot["TP_line"] = df_plot["TP_line"] - df_plot["TP_class_line"]
        df_plot["TP_base"] = df_plot["TP_base"] - df_plot["TP_class_line"] - df_plot["TP_class"] - df_plot["TP_line"]
        colors = ['#88CCEE', '#88CCEE', '#44AA99', '#EE6677']
        # colors = ['#66CCEE', 'yellow', '#228833', '#EE6677']
        fig, ax = plt.subplots(1, 1, figsize=(9, 6))
        df_plot = df_plot[["TP_class", "TP_class_line", "TP_line", "TP_base"]]
        df_plot[["TP_class", "TP_class_line", "TP_line", "TP_base"]].div(df_plot.sum(axis=1), axis=0).plot.barh(
            stacked=True, color=colors, ax=ax, legend=False)
        # Customize bars
        bars = ax.patches
        for i in [4, 5, 6, 7]:
            bars[i].set_hatch("//")
            bars[i].set_edgecolor(colors[2])
        # Create custom legend handles
        handles = [
            mpatches.Patch(color=colors[0], label="correct error class"),
            mpatches.Patch(facecolor=colors[1], edgecolor=colors[2], hatch='//', label='correct class and line'),
            mpatches.Patch(color=colors[2], label='correct source line'),
            mpatches.Patch(color=colors[3], label='not helpful report')
        ]
        ax.legend(handles=handles, ncol=2, loc='center left', bbox_to_anchor=(0.05, -0.3))
        # ax.set_title(f"Helpfulness of {toolname.upper()} Error Reports")
        ax.set_xlabel("Percentage of error reports")
        ax.set_ylabel("MPI feature")
        plt.tight_layout()
        plt.savefig(f'{outpath}/helpfulness_{toolname}_plot.pdf')
    
    
    def cmd_latex(rootdir, toolnames):
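        """Generate the LaTeX tables (per error category and per tool) under <rootdir>/latex/ from the cached results."""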
        here = os.getcwd()
        os.chdir(rootdir)
        results = {}
        total_elapsed = {}
        used_toolnames = []
    
        # select the tools for which we have some results
        print("Produce the stats for:", end='')
        for toolname in toolnames:
            if toolname not in tools:
                raise Exception(f"Tool {toolname} does not seem to be a valid name.")
    
            if os.path.exists(f'{args.logs_dir}/{toolname}'):
                used_toolnames.append(toolname)
                print(f' {toolname}', end="")
    
                # To compute timing statistics
                total_elapsed[toolname] = 0
        print(".")
    
        test_categories = ['COLL', 'P2P', 'RMA', 'other']
    
        # Initialize the data structure to gather all results
        results = {'total': {}, 'error': {}}
        timing = {'total': {}, 'error': {}}
    
        for test_category in test_categories:
            results[test_category] = {}
            timing[test_category] = {}
    
        for error in error_scope:
            results[error] = {}
            timing[error] = {}
            for toolname in used_toolnames:
                results[error][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [], 'TRUE_NEG': [],
                                            'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': []}
                results['total'][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                              'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [], 'error': [],
                                              'OK': []}
                results['error'][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                              'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [], 'error': [],
                                              'OK': []}
                timing[error][toolname] = []
                timing['total'][toolname] = []
                timing['error'][toolname] = []
                for test_category in test_categories:
                    results[test_category][toolname] = {'failure': [], 'timeout': [], 'unimplemented': [], 'other': [],
                                                        'TRUE_NEG': [], 'TRUE_POS': [], 'FALSE_NEG': [], 'FALSE_POS': [],
                                                        'error': [], 'OK': []}
                    timing[test_category][toolname] = []
    
        # Get all data from the caches
        for test in todo:
            binary = re.sub(r'\.c', '', os.path.basename(test['filename']))
            ID = test['id']
            test_category = test['category']
            test_id = f"{binary}_{ID}"
            expected = test['expect']
    
            for toolname in used_toolnames:
                (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                          test_id=test_id, logs_dir=args.logs_dir,
                                                                          expected=expected)
                ##(res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname, test_id=test_id, expected=expected)
                error = possible_details[test['detail']]
                results[error][toolname][res_category].append(test_id)
                results['total'][toolname][res_category].append(test_id)
                results[test_category][toolname][res_category].append(test_id)
                timing[error][toolname].append(float(elapsed))
                timing['total'][toolname].append(float(elapsed))
                timing[test_category][toolname].append(float(elapsed))
                if expected == 'OK':
                    results['total'][toolname]['OK'].append(test_id)
                    results[test_category][toolname]['OK'].append(test_id)
                else:
                    results['total'][toolname]['error'].append(test_id)
                    results[test_category][toolname]['error'].append(test_id)
                    results['error'][toolname][res_category].append(test_id)
                    timing['error'][toolname].append(float(elapsed))
    
        # Create directory for output if not present
        pathlib.Path(f'{rootdir}/latex/').mkdir(parents=True, exist_ok=True)
    
        # Produce the results per tool and per category
        with open(f'{rootdir}/latex/results-per-category-landscape.tex', 'w') as outfile:
            outfile.write('\\setlength\\tabcolsep{3pt} % default value: 6pt\n')
            outfile.write("\\begin{tabular}{|l|*{" + str(len(used_toolnames)) + "}{c|c|c|c||}}\n")
            outfile.write("\\cline{2-" + str(len(used_toolnames) * 4 + 1) + "}\n")
            # First title line: Tool names
            outfile.write("  \\multicolumn{1}{c|}{}")
            for t in used_toolnames:
                outfile.write("& \\multicolumn{4}{c||}{" + displayed_name[t] + "}")
            outfile.write("\\\\\n")
            outfile.write("\\cline{2-" + str(len(used_toolnames) * 4 + 1) + "}\n")
            # Second title line: TP&TN&FP&FN per tool
            outfile.write("  \\multicolumn{1}{c|}{}")
            for t in used_toolnames:
                outfile.write(
                    "& \\rotatebox{90}{Build error~~} &\\rotatebox{90}{Failure} & \\rotatebox{90}{Incorrect} & \\rotatebox{90}{Correct~~} ")
            outfile.write("\\\\\\hline\n")
    
            for error in error_scope:
                if error == 'FOK':
                    outfile.write("\\hline\n")
                outfile.write(displayed_name[error])
                for toolname in used_toolnames:
                    port = len(results[error][toolname]['unimplemented'])
                    othr = len(results[error][toolname]['other'])
                    fail = len(results[error][toolname]['failure'])
                    tout = len(results[error][toolname]['timeout'])
                    good = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                    bad = len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                    outfile.write(f"&{port + othr} & {fail + tout} &{bad}&{good}")
                    # results[error][toolname] = {'failure':[], 'timeout':[], 'unimplemented':[], 'other':[], 'TRUE_NEG':[], 'TRUE_POS':[], 'FALSE_NEG':[], 'FALSE_POS':[]}
                outfile.write("\\\\\\hline\n")
            outfile.write("\\hline\n \\textbf{Total}")
            for toolname in used_toolnames:
                port = othr = fail = tout = good = bad = 0
                for error in error_scope:
                    port += len(results[error][toolname]['unimplemented'])
                    othr += len(results[error][toolname]['other'])
                    fail += len(results[error][toolname]['failure'])
                    tout += len(results[error][toolname]['timeout'])
                    good += len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                    bad += len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                outfile.write(f"&{port + othr} & {fail + tout} &{bad}&{good}")
            outfile.write("\\\\\\hline\n")
    
            # Finish the table
            outfile.write("\\end{tabular}\n")
            outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        # Produce the results per tool and per category
        with open(f'{rootdir}/latex/results-per-category-portrait.tex', 'w') as outfile:
            outfile.write('\\setlength\\tabcolsep{1.5pt} % default value: 6pt\n')
            # To split the table in two lines, do this: for errors in [['FOK','AInvalidParam','BResLeak','BReqLifecycle','BLocalConcurrency'], ['CMatch','DRace','DMatch','DGlobalConcurrency','EBufferingHazard']]:
            for errors in [
                ['FOK', 'AInvalidParam', 'BResLeak', 'BReqLifecycle', 'BLocalConcurrency', 'CMatch', 'DRace', 'DMatch',
                 'DGlobalConcurrency']]:
                outfile.write("\\begin{tabular}{|l@{}|*{" + str(
                    len(errors) - 1) + "}{c|c|c|c||} c|c|c|c|}\n")  # last column not in multiplier (len-1 used) to not have || at the end
                outfile.write(f"\\cline{{2-{len(errors) * 4 + 1}}}\n")
                # First title line: error categories
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in errors:
                    sep = '|' if error == errors[-1] else '||'  # Use || as a separator, unless that's the last column
                    outfile.write(f"&\\multicolumn{{4}}{{c{sep}}}{{{displayed_name[error].split(' ')[0]}}}")
                outfile.write("\\\\\n  \\multicolumn{1}{c|}{}")
                for error in errors:
                    sep = '|' if error == errors[-1] else '||'  # Use || as a separator, unless that's the last column
                    outfile.write(f"&\\multicolumn{{4}}{{c{sep}}}{{{displayed_name[error].split(' ')[1]}}}")
                outfile.write(f"\\\\\\cline{{2-{len(errors) * 4 + 1}}}\n")
                outfile.write("\\multicolumn{1}{c|}{}")
                for error in errors:
                    outfile.write(
                        "& \\rotatebox{90}{Build error~~} & \\rotatebox{90}{Runtime error} &")  # \\rotatebox{90}{Timeout~~}&
                    if error == 'FOK':
                        outfile.write(
                            " \\rotatebox{90}{False \\textbf{Positive}} & \\rotatebox{90}{True \\textbf{Negative}~~} \n")
                    else:
                        outfile.write(" \\rotatebox{90}{False Negative} & \\rotatebox{90}{True Positive~} \n")
                outfile.write("\\\\\\hline\n")
    
                # Find the best tool
                best = {}
                for error in errors:
                    best[error] = 0
                    for toolname in used_toolnames:
                        val = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                        if val > best[error]:
                            best[error] = val
                    # print(f"Best for {error} has {best[error]}")
    
                # display all tools
                for toolname in used_toolnames:
                    outfile.write(f'{displayed_name[toolname]}')
                    for error in errors:
                        port = len(results[error][toolname]['unimplemented'])
                        othr = len(results[error][toolname]['other'])
                        fail = len(results[error][toolname]['failure'])
                        tout = len(results[error][toolname]['timeout'])
                        good = len(results[error][toolname]['TRUE_POS']) + len(results[error][toolname]['TRUE_NEG'])
                        bad = len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
                        if good == best[error]:  # The best tool is displayed in bold
                            outfile.write(f"&{{\\bf {port}}}&{{\\bf {tout + othr + fail}}}&{{\\bf {bad}}}&{{\\bf {good}}}")
                        else:
                            outfile.write(f"&{port}&{tout + othr + fail}&{bad}&{good}")
                    outfile.write("\\\\\\hline\n")
    
                outfile.write("\\hline\\textit{Ideal tool}")
                for error in errors:
                    toolname = used_toolnames[0]
                    total = len(results[error][toolname]['unimplemented']) + len(results[error][toolname]['other']) + len(
                        results[error][toolname]['failure'])
                    total += len(results[error][toolname]['timeout']) + len(results[error][toolname]['TRUE_POS']) + len(
                        results[error][toolname]['TRUE_NEG'])
                    total += len(results[error][toolname]['FALSE_POS']) + len(results[error][toolname]['FALSE_NEG'])
    
                    outfile.write(f"& \\textit{{0}} &\\textit{{0}} & \\textit{{0}} & \\textit{total} \n")
                outfile.write("\\\\\\hline\n")
    
                # Finish the table
                outfile.write("\\end{tabular}\n\n\\medskip\n")
            outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        for test_category in ['total'] + test_categories:
            # Produce the landscape results+metrics per tool for each test category
            with open(f'{rootdir}/latex/results-{test_category}.tex', 'w') as outfile:
                outfile.write('\\setlength\\tabcolsep{2pt} % default value: 6pt\n')
                outfile.write('\\begin{tabular}{|l|*{3}{c|}|*{4}{c|}|*{2}{c|}|*{4}{c|}|c|}\\hline\n')
                outfile.write(
                    '  \\multirow{2}{*}{ \\textbf{Tool}} &  \\multicolumn{3}{c||}{Errors} &\\multicolumn{4}{c||}{Results}&\\multicolumn{2}{c||}{Robustness} &\\multicolumn{4}{c||}{Usefulness}&\\textbf{Overall}\\\\\\cline{2-14}\n')
                outfile.write(
                    '& \\textbf{CE}&\\textbf{TO}&\\textbf{RE}  & \\textbf{TP} & \\textbf{TN} & \\textbf{FP} & \\textbf{FN} &\\textbf{Coverage} & \\textbf{Conclusiveness} & \\textbf{Specificity}&\\textbf{Recall}& \\textbf{Precision}& \\textbf{F1 Score}    & \\textbf{accuracy}\\\\\\hline \n')
    
                # Search the best values
                best = {'TP': 0, 'TN': 0, 'FP': 999999, 'FN': 9999999, 'coverage': 0, 'completion': 0, 'specificity': 0,
                        'recall': 0, 'precision': 0, 'F1': 0, 'accuracy': 0}
                for toolname in used_toolnames:
                    TP = len(results[test_category][toolname]['TRUE_POS'])
                    TN = len(results[test_category][toolname]['TRUE_NEG'])
                    FN = len(results[test_category][toolname]['FALSE_NEG'])
                    FP = len(results[test_category][toolname]['FALSE_POS'])
                    if TP > best['TP']:
                        best['TP'] = TP
                    if TN > best['TN']:
                        best['TN'] = TN
                    if FP < best['FP']:
                        best['FP'] = FP
                    if FN < best['FN']:
                        best['FN'] = FN
    
                    port = len(results[test_category][toolname]['unimplemented'])
                    fail = len(results[test_category][toolname]['failure'])
                    othr = len(results[test_category][toolname]['other'])
                    tout = len(results[test_category][toolname]['timeout'])
                    total = TP + TN + FP + FN + port + fail + othr + tout
                    if (TN + FP) != 0 and TP + FN != 0 and TP + FP != 0:
                        coverage = float(percent(port, total, compl=True, one=True, digits=2))
                        if coverage > best['coverage']:
                            best['coverage'] = coverage
                        completion = float(percent((port + fail + othr + tout), (total), compl=True, one=True, digits=2))
                        if completion > best['completion']:
                            best['completion'] = completion
                        specificity = float(percent(TN, (TN + FP), one=True, digits=2))
                        if specificity > best['specificity']:
                            best['specificity'] = specificity
                        recall = float(percent(TP, (TP + FN), one=True, digits=2))
                        if recall > best['recall']:
                            best['recall'] = recall
                        precision = float(percent(TP, (TP + FP), one=True, digits=2))
                        if precision > best['precision']:
                            best['precision'] = precision
    
                        # Recompute precision & recall without rounding, to match the value computed when displaying the result
                        precision = TP / (TP + FP)
                        recall = TP / (TP + FN)
                        F1 = percent(2 * precision * recall, (precision + recall), one=True, digits=2)
                        if F1 > best['F1']:
                            best['F1'] = F1
                        accuracy = percent(TP + TN, (TP + TN + FP + FN + port + fail + othr + tout), one=True, digits=2)
                        if accuracy > best['accuracy']:
                            best['accuracy'] = accuracy
                    else:
                        print(
                            f"WARNING: {toolname} not considered as a best score: TN+FP={TP + FP} TP+FN={TP + FN} TP+FP={TP + FP}")
    
                for key in best:  # Cleanup the data to ensure that the equality test matches in bold_if()
                    if best[key] == 1.0:
                        best[key] = "1"
                print(f"best coverage: {best['coverage']}")
                print(f"best: {best}")
    
                for toolname in used_toolnames:
                    outfile.write(f'{displayed_name[toolname]}&\n')
    
                    port = len(results[test_category][toolname]['unimplemented'])
                    fail = len(results[test_category][toolname]['failure'])
                    othr = len(results[test_category][toolname]['other'])
                    tout = len(results[test_category][toolname]['timeout'])
                    TP = len(results[test_category][toolname]['TRUE_POS'])
                    TN = len(results[test_category][toolname]['TRUE_NEG'])
                    FN = len(results[test_category][toolname]['FALSE_NEG'])
                    FP = len(results[test_category][toolname]['FALSE_POS'])
    
                    total = TP + TN + FP + FN + port + fail + othr + tout
    
                    outfile.write(f"{bold_if(port, 0)}&{bold_if(tout, 0)}&{bold_if(fail + othr, 0)}")
                    outfile.write(
                        f"&{bold_if(TP, best['TP'])}&{bold_if(TN, best['TN'])}&{bold_if(FP, best['FP'])}&{bold_if(FN, best['FN'])}&")
    
                    # Coverage & Completion
                    coverage = percent(port, total, compl=True, one=True, digits=2)
                    completion = percent((port + fail + othr + tout), (total), compl=True, one=True, digits=2)
                    outfile.write(f"{bold_if(coverage, best['coverage'])} &{bold_if(completion, best['completion'])}&")
                    # Specificity: recognized {TN} correct codes out of {TN+FP}
                    specificity = percent(TN, (TN + FP), one=True, digits=2)
                    outfile.write(f'{bold_if(specificity, best["specificity"])}&')
                    # Recall: found {TP} errors out of {TP+FN}; Precision: {TP} error diagnostics are correct out of {TP+FP}
                    recall = percent(TP, (TP + FN), one=True, digits=2)
                    precision = percent(TP, (TP + FP), one=True, digits=2)
                    outfile.write(f'{bold_if(recall, best["recall"])} & {bold_if(precision, best["precision"])} &')
                    # F1 Score
                    if TP + FP > 0 and TP + FN > 0:
                        precision = TP / (TP + FP)
                        recall = TP / (TP + FN)
                        F1 = percent(2 * precision * recall, (precision + recall), one=True, digits=2)
                        outfile.write(f'{bold_if(F1, best["F1"])}&')
                    else:
                        outfile.write('(error)&')
                    # Accuracy: {TP+TN} correct diagnostics out of all {TP+TN+FP+FN+port+fail+othr+tout} tests
                    accuracy = percent(TP + TN, (TP + TN + FP + FN + port + fail + othr + tout), one=True, digits=2)
                    outfile.write(f'{bold_if(accuracy, best["accuracy"])}')
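                    # Illustrative sanity check with hypothetical counts: TP=40, TN=45, FP=10, FN=5 and no
                    # unimplemented/failed/timed-out runs gives precision = 40/50 = 0.80, recall = 40/45 ≈ 0.89,
                    # F1 = 2*0.80*0.89/(0.80+0.89) ≈ 0.84 and accuracy = (40+45)/100 = 0.85.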
    
                    outfile.write(f'\\\\\\hline\n')
                outfile.write(f'\\hline\n')
    
                outfile.write('\\textit{Ideal tool}&\\textit{0}&\\textit{0}&\\textit{0}&')
                outfile.write(
                    f"\\textit{{{len(results['total'][toolname]['error'])}}}&\\textit{{{len(results['total'][toolname]['OK'])}}}&\\textit{{0}}&\\textit{{0}}&")
                outfile.write(
                    "\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1} \\\\\\hline\n")
    
                outfile.write('\\end{tabular}\n')
                outfile.write('\\setlength\\tabcolsep{6pt} % Back to default value\n')
    
        # Produce the table with the metrics per tool per category (not used, as we put everything on one line only)
        with open(f'{rootdir}/latex/results-metrics.tex', 'w') as outfile:
            outfile.write('\\begin{tabular}{|l|*{7}{c|}}\\hline\n')
            outfile.write(
                '  \\multirow{2}{*}{ \\textbf{Tool}} &  \\multicolumn{2}{c|}{Robustness} &\\multicolumn{4}{c|}{Usefulness}&\\textbf{Overall}\\\\\\cline{2-7}\n')
    
            outfile.write(
                '  & \\textbf{Coverage} & \\textbf{Conclusiveness} & \\textbf{Specificity} & \\textbf{Recall} & \\textbf{Precision} & \\textbf{F1 Score} & \\textbf{Accuracy} \\\\\\hline \n')
    
            for toolname in used_toolnames:
                outfile.write(f'{displayed_name[toolname]}&\n')
    
                nPort = len(results['total'][toolname]['unimplemented'])
                nFail = len(results['total'][toolname]['failure']) + len(results['total'][toolname]['other'])
                nTout = len(results['total'][toolname]['timeout'])
                TP = len(results['total'][toolname]['TRUE_POS'])
                TN = len(results['total'][toolname]['TRUE_NEG'])
                FN = len(results['total'][toolname]['FALSE_NEG'])
                FP = len(results['total'][toolname]['FALSE_POS'])
    
                total = TP + TN + FP + FN + nTout + nPort + nFail
    
                # Coverage & Completion
                outfile.write(
                    f'{percent(nPort, total, compl=True, one=True)} &{percent((nTout + nFail + nPort), (total), compl=True, one=True)}&')
                # Specificity: recognized {TN} correct codes out of {TN+FP}
                outfile.write(f'{percent(TN, (TN + FP), one=True)}&')
                # Recall: found {TP} errors out of {TP+FN}; Precision: {TP} error diagnostics are correct out of {TP+FP}
                outfile.write(f'{percent(TP, (TP + FN), one=True)} & {percent(TP, (TP + FP), one=True)} &')
                # F1 Score
                if TP + FP > 0 and TP + FN > 0:
                    precision = TP / (TP + FP)
                    recall = TP / (TP + FN)
                    outfile.write(f'{percent(2 * precision * recall, (precision + recall), one=True)}&')
                else:
                    outfile.write('(error)&')
                # Accuracy: {TP+TN} correct diagnostics out of all {TP+TN+FP+FN+nTout+nFail+nPort} tests
                outfile.write(f'{percent(TP + TN, (TP + TN + FP + FN + nTout + nFail + nPort), one=True)}')
                outfile.write(f'\\\\\\hline\n')
    
            outfile.write(
                "\\hline\n\\textit{Ideal tool}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}&\\textit{1}\\\\\\hline\n")
    
            outfile.write('\\end{tabular}\n')
    
        # Produce the timing results
        with open(f'{rootdir}/latex/results-timings.tex', 'w') as outfile:
            outfile.write(f"\\begin{{tabular}}{{|c|c|*{{{len(used_toolnames)}}}{{c|}}}}\n")
            outfile.write(f"\\cline{{3-{len(used_toolnames) + 2}}}\n")
            # First title line: Tool names
            outfile.write("  \\multicolumn{2}{c|}{}")
            for t in used_toolnames:
                outfile.write(f"& {displayed_name[t]}")
            outfile.write(f"\\\\\\hline\n")
    
            def show_line(key, display_name):
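                """Write a three-row block (mean time, standard deviation, number of timeouts) for the given timing key, one column per tool."""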
                outfile.write(f"\\multirow{{3}}{{*}}{{{display_name}}} & Mean time ")
                for toolname in used_toolnames:
                    if len(timing[key][toolname]) > 1:
                        mean = statistics.mean(timing[key][toolname])
                        outfile.write(f"&{round(mean, 2)}")
                    else:
                        outfile.write("&(error)")
                        print(f"Error while computing the mean of timing[{key}][{toolname}] (needs at least one value)")
                outfile.write(f"\\\\\\cline{{2-{len(used_toolnames) + 2}}}\n")
    
                outfile.write(f"& StdDev ")
                for toolname in used_toolnames:
                    if len(timing[key][toolname]) > 2:
                        stdev = statistics.stdev(timing[key][toolname])
                        outfile.write(f"&{round(stdev, 2)}")
                    else:
                        outfile.write("&(error)")
                        print(
                            f"Error while computing the variance of timing[{key}][{toolname}] (needs at least two values)")
                outfile.write(f"\\\\\\cline{{2-{len(used_toolnames) + 2}}}\n")
    
                outfile.write(f" & \\# timout ")
                for toolname in used_toolnames:
                    tout = len(results[key][toolname]['timeout'])
                    if tout == 0:
                        tout = '-'
                    outfile.write(f"&{tout}")
                outfile.write("\\\\\\hline\n")
    
            for error in error_scope:
                if error == 'FOK':
                    outfile.write('\\hline\n')
                    show_line('error', '\\textit{All incorrect tests}')
                    title = '\\textit{All correct tests}'
                else:
                    title = f"\\makecell{{{displayed_name[error]} \\\\ ({error_scope[error]})}}"
    
                show_line(error, title)
            outfile.write('\\hline\n')
            show_line('total', '\\textbf{All tests}')
    
            outfile.write(f"\\multicolumn{{2}}{{|c|}}{{\\textbf{{Total time}}}} ")
            for toolname in used_toolnames:
                secs = sum(timing['total'][toolname])
                days = int(secs // 86400)
                hours = int((secs - days * 86400) // 3600)
                minutes = int((secs - days * 86400 - hours * 3600) // 60)
                seconds = secs - days * 86400 - hours * 3600 - minutes * 60
                outfile.write("&")
                if days > 0:
                    outfile.write(f"{days}d")
                if days > 0 or hours > 0:
                    outfile.write(f"{hours}h")
                if days > 0 or hours > 0 or minutes > 0:
                    outfile.write(f"{minutes}m")
                outfile.write(f"{int(seconds)}s")
            outfile.write(f"\\\\\\hline\n")
    
            # Last line: Tool names again
            outfile.write("  \\multicolumn{2}{c|}{}")
            for t in used_toolnames:
                outfile.write(f"& {displayed_name[t]}")
            outfile.write(f"\\\\\\cline{{3-{len(used_toolnames) + 2}}}\n")
    
            outfile.write(f"\\end{{tabular}}\n")
    
        with open(f'{rootdir}/latex/files-count.tex', 'w') as outfile:
            files_results = categorize_all_files(tools[used_toolnames[0]], used_toolnames[0], todo, args.logs_dir)
    
            error_types = {}
            error_types_tests = {}
            for error in error_scope:
                error_types[error] = 0
                error_types_tests[error] = 0
    
            # Count the number of codes per expected type of result
            for f in files_results:
                error_types[possible_details[files_results[f]['detail']]] += 1
    
            for t in todo:
                error_types_tests[possible_details[t['detail']]] += 1
    
            outfile.write("\\begin{tabular}{|l|c|c|}\n")
            outfile.write("  \\hline\n")
            outfile.write("  \\textbf{Error category} & \\textbf{Number of codes} & \\textbf{Number of tests}\\\\\n")
            outfile.write("  \\hline\n")
            for et in error_types:
                if et in ['BLocalConcurrency', 'DRace', 'DGlobalConcurrency',
                          'EBufferingHazard', 'InputHazard']:
                    outfile.write(
                        f"  \\textbf{{{displayed_name[et]}}} & \\textbf{{{error_types[et]}}}& \\textbf{{{error_types_tests[et]}}} \\\\\n")
                else:
                    outfile.write(f"  \\textit{{{displayed_name[et]}}} & {error_types[et]} & {error_types_tests[et]}\\\\\n")
    
            outfile.write("  \\hline\n")
            outfile.write(f"  \\textbf{{Total}} & {len(files_results)} & {len(todo)}\\\\\n")
            outfile.write("  \\hline\n")
            outfile.write("\\end{tabular}\n")
    
        def resultsPerCategory(suffix, hazard=False):
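            """Write latex/nd-results-per-category-portrait-<suffix>.tex; hazard=True selects the non-deterministic error categories."""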
            category = ['FOK', 'AInvalidParam', 'BResLeak', 'DMatch', 'CMatch', 'BReqLifecycle', 'BEpochLifecycle']
            if hazard:
                category = ['BLocalConcurrency', 'DGlobalConcurrency', 'DRace', 'EBufferingHazard', 'InputHazard']
    
            with open(f'{rootdir}/latex/nd-results-per-category-portrait-{suffix}.tex', 'w') as outfile:
                # files_results = categorize_all_files(tools[used_toolnames[0]], used_toolnames[0], todo)
                ext_results = {}
                best = {}
    
                # Put FOK at the first position
                last = ''
                for e in category:
                    last = e
                    best[e] = {
                        'TP': 0, 'TN': 0, 'FP': 99999, 'FN': 99999,
                        'SE': 99999,
                        'accp': 0, 'accm': 0
                    }
    
                for toolname in used_toolnames:
                    ext_results[toolname] = {}
    
                    files_results = categorize_all_files(tools[toolname], toolname, todo, args.logs_dir)
                    for error in category:
                        ext_results[toolname][error] = {
                            'TP': [], 'TN': [], 'FP': [], 'FN': [],
                            'CE': [], 'TO': [], 'RE': [], 'O': [], 'SE': [],
                            'accp': 0, 'accm': 0,
                            'total': 0
                        }
    
                        for f in files_results:
                            if possible_details[files_results[f]['detail']] == error:
                                ext_results[toolname][error][files_results[f]['result']].append(f)
                                ext_results[toolname][error]['total'] += 1
    
                        total = ext_results[toolname][error]['total']
                        # accp = round((len(ext_results[toolname][error]['TP']) + len(ext_results[toolname][error]['TN']) + len(ext_results[toolname][error]['TP'])) / total, 2)
                        # accm = round((len(ext_results[toolname][error]['TP']) + len(ext_results[toolname][error]['TN'])) / total, 2)
    
                        ext_results[toolname][error]['accp'] = 0  # accp
                        ext_results[toolname][error]['accm'] = 0  # accm
    
                    for error in category:
                        err = (len(ext_results[toolname][error]['CE'])
                               + len(ext_results[toolname][error]['TO'])
                               + len(ext_results[toolname][error]['RE'])
                               + len(ext_results[toolname][error]['O'])
                               + len(ext_results[toolname][error]['SE']))
    
                        if best[error]['SE'] > err:
                            best[error]['SE'] = err
    
                        for res in ['FP', 'FN']:
                            if best[error][res] > len(ext_results[toolname][error][res]):
                                best[error][res] = len(ext_results[toolname][error][res])
    
                        for res in ['TP', 'TN']:
                            if best[error][res] < len(ext_results[toolname][error][res]):
                                best[error][res] = len(ext_results[toolname][error][res])
    
                        for res in ['accp', 'accm']:
                            if best[error][res] < ext_results[toolname][error][res]:
                                best[error][res] = ext_results[toolname][error][res]
    
                ncol = 4 if not hazard else 6
                align = 'c|c|c|c|' if not hazard else 'c|c|c|c|c|c|'
    
                outfile.write("\\setlength\\tabcolsep{1.5pt}\n")
                outfile.write(f"\\begin{{tabular}}{{|l|*{{{len(category) - 1}}}{{ {align} |}} {align}}}\n")
                outfile.write(f"\\cline{{2- {(len(category) * ncol) + 1} }}\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    if error == last:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c|}}")
                    else:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c||}}")
    
                    outfile.write(f"{{\\it {displayed_name[error].split()[0]}}}")
    
                outfile.write("\\\\\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    if error == last:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c|}}")
                    else:
                        outfile.write(f" & \\multicolumn{{{ncol}}}{{c||}}")
    
                    outfile.write(f"{{\\it {displayed_name[error].split()[1]}}}")
    
                outfile.write("\\\\\n")
                outfile.write(f"\\cline{{2- {(len(category) * ncol) + 1} }}\n")
    
                outfile.write("  \\multicolumn{1}{c|}{}")
                for error in category:
                    outfile.write(" & \\rotatebox{90}{SE}")
                    if error == "FOK":
                        outfile.write(" & \\rotatebox{90}{{\\bf TN}}")
                        if hazard:
                            outfile.write(" & \\rotatebox{90}{{\\bf FP}}")
                        outfile.write(" & \\rotatebox{90}{{\\bf FP}}")
                    else:
                        outfile.write(" & \\rotatebox{90}{TP}")
                        if hazard:
                            outfile.write(" & \\rotatebox{90}{TP}")
                        outfile.write(" & \\rotatebox{90}{FN}")
    
                    if hazard:
                        outfile.write(" & \\rotatebox{90}{Accuracy\\textsuperscript{+}}")
                        outfile.write(" & \\rotatebox{90}{Accuracy\\textsuperscript{-}}")
                    else:
                        outfile.write(" & \\rotatebox{90}{Accuracy}")
    
                outfile.write("\\\\\\hline\n")
    
                for toolname in used_toolnames:
                    outfile.write(f"{displayed_name[toolname]}")
    
                    for error in category:
                        disp_err = (len(ext_results[toolname][error]['CE'])
                                    + len(ext_results[toolname][error]['TO'])
                                    + len(ext_results[toolname][error]['RE'])
                                    + len(ext_results[toolname][error]['O'])
                                    + len(ext_results[toolname][error]['SE']))
    
                        if disp_err == best[error]['SE']:
                            outfile.write(f"& {{\\bf {disp_err}}}")
                        else:
                            outfile.write(f"& {disp_err}")
    
                        def format_if_best(res):
                            # Bold the count when it equals the best value across tools for this category
                            count = len(ext_results[toolname][error][res])
                            return f" & {{\\bf {count}}}" if best[error][res] == count else f" & {count}"

                        def format_if_best_2(res):
                            # Clamp the score to [0, 1] for display; bold it when it equals the best value
                            value = ext_results[toolname][error][res]
                            shown = 1 if value >= 1.0 else 0 if value <= 0.0 else value
                            return f" & {{\\bf {shown} }}" if best[error][res] == value else f" & {shown}"
    
                        if error == "FOK":
                            outfile.write(format_if_best('TN'))
                            if hazard:
                                outfile.write(format_if_best('FP'))
                            outfile.write(format_if_best('FP'))
                        else:
                            outfile.write(format_if_best('TP'))
                            if hazard:
                                outfile.write(format_if_best('TP'))
                            outfile.write(format_if_best('FN'))
    
                        if hazard:
                            outfile.write(format_if_best_2('accp'))
                            outfile.write(format_if_best_2('accm'))
                        else:
                            outfile.write(format_if_best_2('accp'))
    
                    outfile.write("\\\\\\hline\n")
    
                outfile.write("\\textit{Ideal tool}")
    
                for error in category:
                    outfile.write(" & \\textit{0}")
                    outfile.write(f" & \\textit{{ {ext_results[toolname][error]['total']} }}")
                    if hazard:
                        outfile.write(" & \\textit{0}")
                    outfile.write(" & \\textit{0}")
                    outfile.write(" & \\textit{1}")
                    if hazard:
                        outfile.write(" & \\textit{1}")
    
                outfile.write("\\\\\\hline\n")
    
                outfile.write("\\end{tabular}\n")
                outfile.write("\\setlength\\tabcolsep{6pt}")
    
        resultsPerCategory('deter', hazard=False)
        resultsPerCategory('ndeter', hazard=True)
    
        with open(f'{rootdir}/latex/reclassified-result.tex', 'w') as outfile:
            reclassified = {}
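            # A test is counted as "reclassified" when is_correct_diagnostic() judges that the tool's reported
            # diagnostic does not match the error expected for that test.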
    
            category = ['FOK']
            last = ''
            for e in error_scope:
                if e != 'FOK':
                    category.append(e)
                    last = e
            category.append('total')
    
            for toolname in used_toolnames:
                reclassified[toolname] = {}
    
                for e in category:
                    reclassified[toolname][e] = []
    
                for test in todo:
                    binary = re.sub('\\.c', '', os.path.basename(test['filename']))
                    ID = test['id']
                    test_id = f"{binary}_{ID}"
    
                    (res_category, elapsed, diagnostic, outcome) = categorize(tool=tools[toolname], toolname=toolname,
                                                                              test_id=test_id, logs_dir=args.logs_dir,
                                                                              expected=test['expect'], autoclean=False)
    
                    if not tools[toolname].is_correct_diagnostic(test_id, res_category, test['expect'], test['detail']):
                        reclassified[toolname][possible_details[test['detail']]].append(test_id)
                        reclassified[toolname]['total'].append(test_id)
    
            outfile.write("\\begin{tabular}{|l|")
            for e in category:
                outfile.write("c|")
            outfile.write("}\n")
            outfile.write("  \\hline\n")
    
            # Column title
            outfile.write("  ")
            for e in category:
                if e != 'total':
                    outfile.write(f" &\\textit{{ {displayed_name[e].split()[0]} }}")
                else:
                    outfile.write(" & ")
    
            outfile.write("  \\\\\n")
    
            outfile.write("  \\textbf{Tools}")
            for e in category:
                if e != 'total':
                    outfile.write(f" &\\textit{{ {displayed_name[e].split()[1]} }}")
                else:
                    outfile.write(" & \\textbf{Total}")
    
            outfile.write("\\\\\n")
            outfile.write("  \\hline\n")
    
            # Results
            for toolname in used_toolnames:
                outfile.write(f"  {displayed_name[toolname]}")
                for e in category:
                    res = len(reclassified[toolname][e])
                    if res > 0:
                        outfile.write(f" & \\textbf{{ {res} }}")
                    else:
                        outfile.write(f" & {res}")
    
                outfile.write("  \\\\\\hline\n")
    
            outfile.write("\\end{tabular}\n")
    
        files = get_C_files_from_dir(f"{rootdir}/scripts/gencodes/")
    
        generate_errors(files, f"{rootdir}/latex/errors.tex")
        generate_labels(files, f"{rootdir}/latex/labels.tex")
        generate_features(files, f"{rootdir}/latex/features.tex")
    
        os.chdir(here)
    
    
    def get_overview_plot(data, outpath, scrutiny="base"):
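        """Plot, per tool, the stacked outcome counts (TP/TN/FP/FN/RE/CE) for the P2P, Collective, RMA and
        Total categories, and save the figure as overview_per_cat.pdf in outpath."""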
        assert len(data) > 0
        # Get the column names, i.e. all the metrics that are calculated
        df_first_tool = next(iter(data.values()))
        df_first_tool["TP"] = df_first_tool[f"TP_{scrutiny}"]
        df_first_tool = classify_tests(df_first_tool)
        df_first_tool = aggregate_metrics_per_category(df_first_tool)
        cols = df_first_tool.columns
    
        df_coll = pd.DataFrame(columns=cols)
        df_rma = pd.DataFrame(columns=cols)
        df_p2p = pd.DataFrame(columns=cols)
        df_total = pd.DataFrame(columns=cols)
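        # These hold one row per tool: outcome counts split by communication category (P2P, collective, RMA) plus the overall total.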
    
        for toolname, df in data.items():
            df["TP"] = df[f"TP_{scrutiny}"]
            df = classify_tests(df)
            df = aggregate_metrics_per_category(df)
            df_coll.loc[toolname] = df.loc["COLL"]
            df_rma.loc[toolname] = df.loc["RMA"]
            df_p2p.loc[toolname] = df.loc["P2P"]
            df_total.loc[toolname] = df.loc["ALL"]
    
    
        SMALL_SIZE = 20
        MEDIUM_SIZE = 22
        BIGGER_SIZE = 24
    
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=SMALL_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
        plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
        fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 9))
    
        # colors = ['#228833', '#66ccee', '#ee6677', '#aa3377', '#ccbb44', '#bbbbbb']
        colors = ['#6699CC', '#EECC66', '#004488', '#997700', '#BBBBBB', '#000000']
    
        ((ax1, ax2), (ax3, ax4)) = axs
        df_p2p[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax1, legend=False, color=colors)
        ax1.set_title('P2P')
        handles, labels = ax1.get_legend_handles_labels()
    
        df_coll[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax2, legend=False, color=colors)
        ax2.set_title('Collective')
        ax2.yaxis.tick_right()
        # Set the y-axis labels to uppercase
        ax2.set_yticklabels([label.get_text().upper() for label in ax2.get_yticklabels()])
    
        df_rma[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax3, legend=False, color=colors)
        ax3.set_title('RMA')
    
        df_total[["TP", "TN", "FP", "FN", "RE", "CE"]].plot.barh(stacked=True, ax=ax4, legend=False, color=colors)
        ax4.set_title('Total')
        ax4.yaxis.tick_right()
    
        for ax in [ax1, ax2, ax3, ax4]:
            ax.set_ylabel('')
            # Set the y-axis labels to uppercase
            ax.set_yticklabels([label.get_text().upper() for label in ax.get_yticklabels()])
    
        fig.legend(handles, labels, loc='upper center', ncols=6, bbox_to_anchor=(0.5, 1.05))
    
        plt.tight_layout()
    
        plt.savefig(os.path.join(outpath, "overview_per_cat.pdf"), bbox_inches="tight")
    
    
    ########################
    # cmd_plots(): what to do when '-c plots' is used (extract the statistics of this tool)
    ########################
    
    def cmd_plots(rootdir, toolnames, ext="pdf"):
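        """Produce the plot images from the cached tool reports: one helpfulness plot per tool plus the overview plot."""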
        here = os.getcwd()
        os.chdir(rootdir)
        os.makedirs('plots', exist_ok=True)
        outpath = f'{rootdir}/plots/'
    
        collected_data = {}
        for toolname in toolnames:
            df = read_tool_reports(rootdir, toolname)
            plot_helpfulness(df, outpath, toolname)
            collected_data[toolname] = df
    
        get_overview_plot(collected_data, outpath)
    
        os.chdir(here)
    
    
    ########################
    # Main script argument parsing
    ########################
    
    parser = argparse.ArgumentParser(
        description='This runner provides a bridge between an MPI compiler/executor and a test written with the MPI bugs collection header, and compares the actual result to the expected one.')
    
    parser.add_argument('-c', metavar='cmd', default='all',
                        help="The command you want to execute. By default, 'all' runs the 'run' and 'html' commands in sequence. Other choices:\n"
                             "  generate: redo all the test codes.\n"
                             "  build: build the selected tools.\n"
                             "  run: run the tests on all codes.\n"
                             "  csv: produce the CSV output, using the cached values from a previous 'run'.\n"
                             "  latex: produce the LaTeX tables we need for the article, using the cached values from a previous 'run'.\n"
                             "  html: produce the HTML statistics, using the cached values from a previous 'run'.\n"
                             "  plots: produce the plot images, using the cached values from a previous 'run'.\n")
    
    parser.add_argument('-x', metavar='tool', default='mpirun',
                        help='the tool(s) you want at execution time: a comma-separated list among [aislinn, civl, hermes, isp, itac, mpisv, must, simgrid, smpi, smpivg, parcoach, mpi-checker]')
    
    parser.add_argument('-t', '--timeout', metavar='int', default=300, type=int,
                        help='timeout value at execution time, given in seconds (default: %(default)s)')
    
    parser.add_argument('-n', '--nworkers', metavar='int', default=1, type=int,
                        help='size of the pool of workers that execute the tests in parallel (default: 1)')
    
    parser.add_argument('-l', '--logs-dir', metavar='path', default="logs", type=pathlib.Path,
                        help='path to output directory of logs (default: $PWD/logs)')
    
    parser.add_argument('-g', '--gencodes', metavar='path', default="gencodes", type=pathlib.Path,
                        help='path to directory of source files (default: gencodes)')
    
    parser.add_argument('-lev', '--level', metavar='int', default=2, type=int,
                        help='Generation level to generate codes (default: 2)')
    
    parser.add_argument('-b', metavar='batch', default='1/1',
                        help="Limits the test executions to the batch #N out of M batches (Syntax: 'N/M'). To get 3 runners, use 1/3 2/3 3/3")
    
    parser.add_argument('-f', metavar='format', default='pdf',
                        help="Format of output images [pdf, svg, png, ...] (only for 'plots' command)")
    
    parser.add_argument('-v', '--verbose', action="store_const", dest="loglevel", const=logging.DEBUG, default=logging.INFO)
    
    args = parser.parse_args()
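    # Illustrative invocations:
    #   python3 MBB.py -c generate                     # regenerate the test codes
    #   python3 MBB.py -c run -x must -t 300 -n 4      # run all tests with MUST (4 workers, 300s timeout)
    #   python3 MBB.py -c html -x must,itac,parcoach   # build the HTML report from cached logs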
    rootdir = os.path.dirname(os.path.abspath(__file__))
    
    # Parameter checking: Did we get a valid tool to use?
    arg_tools = []
    if args.c == 'all' or args.c == 'run':
        if args.x == 'mpirun':
            raise Exception(
                "No tool was provided, please retry with -x parameter. (see -h for further information on usage)")
        elif args.x in tools:
            arg_tools = [args.x]
        elif ',' in args.x:
            for x in args.x.split(','):
                if x not in tools:
                    raise Exception(f"The tool parameter you provided ({x}) is either incorect or not yet implemented.")
                arg_tools.append(x)
        else:
            raise Exception(f"The tool parameter you provided ({args.x}) is either incorect or not yet implemented.")
    elif ',' in args.x:
        for x in args.x.split(','):
            if x not in tools:
                raise Exception(f"The tool parameter you provided ({x}) is either incorect or not yet implemented.")
        arg_tools = args.x.split(',')
    else:
        arg_tools = [args.x]
    
    print(f'arg_tools: {arg_tools}')
    
    if args.c == 'all':
        extract_all_todo(args.b)
        cmd_run(rootdir=rootdir, toolname=args.x, batchinfo=args.b)
        cmd_html(rootdir, toolnames=arg_tools)
    elif args.c == 'generate':
        if args.level:
            cmd_gencodes(level=args.level)
        else:
            cmd_gencodes(level=2)
    elif args.c == 'build':
        for t in arg_tools:
            cmd_build(rootdir=rootdir, toolname=t)
    elif args.c == 'run':
        extract_all_todo(args.b)
        for t in arg_tools:
            cmd_run(rootdir=rootdir, toolname=t, batchinfo=args.b)
    elif args.c == 'latex':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        # 'smpi','smpivg' are not shown in the paper
        # cmd_latex(rootdir, toolnames=['aislinn', 'civl', 'isp','itac', 'simgrid', 'mpisv', 'must', 'hermes', 'parcoach', 'mpi-checker'])
        cmd_latex(rootdir, toolnames=['itac', 'must', 'parcoach'])
    elif args.c == 'csv':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if arg_tools:
            cmd_csv(rootdir, toolnames=arg_tools)
        else:
            cmd_csv(rootdir, toolnames=['itac', 'must', 'parcoach'])
    elif args.c == 'html':
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if args.x == 'mpirun':
            # toolnames=['itac', 'simgrid','must', 'smpi', 'smpivg', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker']
            toolnames = ['itac', 'must', 'parcoach']
        else:
            toolnames = arg_tools
        # Build SVG plots
        # if plots_loaded:
        #     cmd_plots(rootdir, toolnames=toolnames, ext="svg")
        # Build HTML page
        cmd_html(rootdir, toolnames=toolnames)
    elif args.c == 'plots':
        if not plots_loaded:
            print("[MBB] Error: Dependancies ('numpy' or 'matplotlib') are not available!")
            exit(-1)
        extract_all_todo_from_logdir(arg_tools[0], args.logs_dir)
        if args.x == 'mpirun':
            # toolnames=['itac', 'simgrid', 'must', 'aislinn', 'civl', 'isp', 'mpisv', 'parcoach', 'hermes', 'mpi-checker']
            toolnames = ['itac', 'must', 'parcoach']
        else:
            toolnames = arg_tools
    
        cmd_plots(rootdir, toolnames=toolnames, ext=args.f)
    else:
        print(
            # We should remove latex and plots and update generate
            f"Invalid command '{args.c}'. Please choose one of 'all', 'generate', 'build', 'run', 'csv', 'html', 'latex' or 'plots'")
        sys.exit(1)