diff --git a/rtrg_c.c b/rtrg_c.c
index cac57d6ebeb97e4cb9fc2c66bd59c453c6f331b5..b709458ea343ebcf5e2d2346a1e7692b84a8b751 100644
--- a/rtrg_c.c
+++ b/rtrg_c.c
@@ -27,6 +27,7 @@ SOFTWARE.
  * @section sec_config Configuration
  *
  * The following variables can be defined:
+ * * MKL:   if defined, use intel MKL for CBLAS and LAPACK C header
  * * CBLAS: if defined, use CBLAS instead of directly calling BLAS functions
  * * LAPACK_C: include LAPACK C header instead of just linking to LAPACK
  * * PARALLEL_EXTRA_DIMS: use OpenMP to parallelize repeated operations over
@@ -83,9 +84,20 @@ SOFTWARE.
 #define NPY_COMPLEX_TYPE NPY_COMPLEX128
 #define lapack_complex_double complex_type
 
+#ifdef MKL
+
+#include <mkl_lapack.h>
+#include <mkl_cblas.h>
+#define getrf zgetrf
+#define getri zgetri
+#define gemm cblas_zgemm
+#define trmm cblas_ztrmm
+#define CBLAS
+
+#else /* MKL */
+
 #ifdef LAPACK_C
 #include <lapack.h>
-//#include <mkl_lapack.h>
 #define getrf LAPACK_zgetrf
 #define getri LAPACK_zgetri
 #else /* LAPACK_C */
@@ -97,7 +109,6 @@ extern void zgetri_(const int*, double complex*, const int*, const int*, complex
 
 #ifdef CBLAS
 #include <cblas.h>
-//#include <mkl_cblas.h>
 #define gemm cblas_zgemm
 #define trmm cblas_ztrmm
 #else /* CBLAS */
@@ -108,6 +119,8 @@ extern void ztrmm_(const char*, const char*, const char*, const char*, const int
 static const char N='N', L='L', R='R', U='U';
 #endif /* CBLAS */
 
+#endif /* MKL */
+
 static const complex_type zero = 0.;
 static const complex_type one = 1.;
 
diff --git a/setup.py b/setup.py
index ed9aeb948a66af8aec0836bdbb554638d589df33..9890523586da59d7e6f4a3f700921e3ab2692091 100644
--- a/setup.py
+++ b/setup.py
@@ -9,19 +9,28 @@ def main():
     compiler_args = ['-O3','-Wall','-Wextra','-std=c11']
     linker_args = []
     include_dirs = [np.get_include()]
+    library_dirs = []
     name = 'rtrg_c'
     sources = ['rtrg_c.c']
-    libraries = ['lapack']
+    libraries = []
 
-    if 'CBLAS' in environ:
+    if 'MKL' in environ:
+        libraries += ['mkl_rt']
+        include_dirs += ["/opt/intel/mkl/include"]
+        library_dirs += ["/opt/intel/mkl/lib/intel64"]
         compiler_args += ['-DCBLAS']
-        libraries += ['cblas']
-        #libraries += ['mkl_rt']
+        compiler_args += ['-DMKL']
     else:
-        libraries += ['blas']
+        libraries += ['lapack']
+        if 'CBLAS' in environ:
+            compiler_args += ['-DCBLAS']
+            libraries += ['cblas']
+            #libraries += ['mkl_rt']
+        else:
+            libraries += ['blas']
 
-    if 'LAPACK_C' in environ:
-        compiler_args += ['-DLAPACK_C']
+        if 'LAPACK_C' in environ:
+            compiler_args += ['-DLAPACK_C']
 
     parallel_modifiers = ('PARALLEL', 'PARALLEL_EXTRAPOLATION', 'PARALLEL_EXTRA_DIMS')
 
@@ -45,6 +54,7 @@ def main():
             name,
             sources = sources,
             include_dirs = include_dirs,
+            library_dirs = library_dirs,
             libraries = libraries,
             extra_compile_args = compiler_args,
             extra_link_args = linker_args
diff --git a/test/benchmark.py b/test/benchmark.py
index 90b470131c81e315f380333c8249750699b3e740..982e6c017f968baef68be076997dad79d7ff32e9 100644
--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -21,24 +21,28 @@ def benchmark_kondo(parameters, solver_opts, save=False):
     kondo = Kondo(**parameters)
     pr = cProfile.Profile()
     pr.enable()
-    kondo.run(**solver_opts)
+    try:
+        kondo.run(**solver_opts)
+    except KeyboardInterrupt as exception:
+        settings.logger.exception("Benchmark interrupted!")
     pr.disable()
     ps = pstats.Stats(pr).sort_stats(SortKey.CUMULATIVE)
     return kondo, ps
 
 def main(list_length=100):
-    solver_opts = dict(rtol=1e-8, atol=1e-10)
+    solver_opts = dict(rtol=1e-6, atol=1e-8)
     parameter_dicts = [
-            dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0, include_Ga=True, solve_integral_exactly=True, integral_method=-1),
-            dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0),
-            dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 0, compact=0),
+            #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0, include_Ga=True, solve_integral_exactly=True, integral_method=-1),
+            #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0),
+            #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 0, compact=0),
             #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 8, compact=0),
             #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation=False, padding= 0, compact=0),
             #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation= True, padding= 0, compact=0),
             #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation= True, padding=16, compact=0),
-            dict(nmax=24, voltage_branches=0, omega=10., d=1e9, vac=12., vdc=0., unitary_transformation= True, padding=16, compact=0),
-            dict(nmax=32, voltage_branches=0, omega=10., d=1e9, vac=20., vdc=0., unitary_transformation= True, padding=24, compact=2),
-            #dict(nmax=128, voltage_branches=0, omega=10., d=1e9, vac=150., vdc=0., unitary_transformation=True, padding=80, compact=2),
+            #dict(nmax=24, voltage_branches=0, omega=10., d=1e9, vac=12., vdc=0., unitary_transformation= True, padding=16, compact=0),
+            #dict(nmax=32, voltage_branches=0, omega=10., d=1e9, vac=20., vdc=0., unitary_transformation= True, padding=24, compact=2),
+            #dict(nmax=128, voltage_branches=0, omega=10., d=1e6, vac=150., vdc=0., unitary_transformation=True, padding=80, compact=2),
+            dict(nmax=400, voltage_branches=0, omega=10., d=1e6, vac=250., vdc=0., unitary_transformation=True, padding=600, compact=2, include_Ga=True),
             ]
     for parameters in parameter_dicts:
         settings.logger.info("Starting:")
diff --git a/test/benchmark_rtrg_c.py b/test/benchmark_rtrg_c.py
index 5860ec465257af827734a373675e25723aff48eb..067f382a58ef58808c72a95372ea0e567f5d1751 100644
--- a/test/benchmark_rtrg_c.py
+++ b/test/benchmark_rtrg_c.py
@@ -3,7 +3,7 @@
 Benchmark for matrix multiplication from rtrg_c.
 """
 
-from test import *
+from test_rtrg_c import *
 from timeit import timeit
 from time import process_time
 
@@ -162,11 +162,13 @@ def benchmark_sym(
 
 
 if __name__ == '__main__':
+    # For large matrices intel MKL is faster than openBLAS
     benchmark(    shots=5, runs=10, n=2000, k=2000, m=2000, p=1600)
     benchmark_sym(shots=5, runs=10, n=2000, k=2000, m=2000, p=1600)
     benchmark(    shots=10, runs=10, n=1000, k=1000, m=1000, p=600)
     benchmark_sym(shots=10, runs=10, n=1000, k=1000, m=1000, p=600)
     benchmark(    shots=10, runs=10, n=200, k=200, m=200, p=150, extra_dims=(15,))
     benchmark_sym(shots=10, runs=10, n=200, k=200, m=200, p=150, extra_dims=(15,))
+    # For these parameters openBLAS can be faster than intel MKL
     benchmark(    shots=1000, runs=30, n=21, k=21, m=21, p=12, extra_dims=(7,))
     benchmark_sym(shots=1000, runs=30, n=21, k=21, m=21, p=12, extra_dims=(7,))