diff --git a/rtrg_c.c b/rtrg_c.c index cac57d6ebeb97e4cb9fc2c66bd59c453c6f331b5..b709458ea343ebcf5e2d2346a1e7692b84a8b751 100644 --- a/rtrg_c.c +++ b/rtrg_c.c @@ -27,6 +27,7 @@ SOFTWARE. * @section sec_config Configuration * * The following variables can be defined: + * * MKL: if defined, use intel MKL for CBLAS and LAPACK C header * * CBLAS: if defined, use CBLAS instead of directly calling BLAS functions * * LAPACK_C: include LAPACK C header instead of just linking to LAPACK * * PARALLEL_EXTRA_DIMS: use OpenMP to parallelize repeated operations over @@ -83,9 +84,20 @@ SOFTWARE. #define NPY_COMPLEX_TYPE NPY_COMPLEX128 #define lapack_complex_double complex_type +#ifdef MKL + +#include <mkl_lapack.h> +#include <mkl_cblas.h> +#define getrf zgetrf +#define getri zgetri +#define gemm cblas_zgemm +#define trmm cblas_ztrmm +#define CBLAS + +#else /* MKL */ + #ifdef LAPACK_C #include <lapack.h> -//#include <mkl_lapack.h> #define getrf LAPACK_zgetrf #define getri LAPACK_zgetri #else /* LAPACK_C */ @@ -97,7 +109,6 @@ extern void zgetri_(const int*, double complex*, const int*, const int*, complex #ifdef CBLAS #include <cblas.h> -//#include <mkl_cblas.h> #define gemm cblas_zgemm #define trmm cblas_ztrmm #else /* CBLAS */ @@ -108,6 +119,8 @@ extern void ztrmm_(const char*, const char*, const char*, const char*, const int static const char N='N', L='L', R='R', U='U'; #endif /* CBLAS */ +#endif /* MKL */ + static const complex_type zero = 0.; static const complex_type one = 1.; diff --git a/setup.py b/setup.py index ed9aeb948a66af8aec0836bdbb554638d589df33..9890523586da59d7e6f4a3f700921e3ab2692091 100644 --- a/setup.py +++ b/setup.py @@ -9,19 +9,28 @@ def main(): compiler_args = ['-O3','-Wall','-Wextra','-std=c11'] linker_args = [] include_dirs = [np.get_include()] + library_dirs = [] name = 'rtrg_c' sources = ['rtrg_c.c'] - libraries = ['lapack'] + libraries = [] - if 'CBLAS' in environ: + if 'MKL' in environ: + libraries += ['mkl_rt'] + include_dirs += ["/opt/intel/mkl/include"] + library_dirs += ["/opt/intel/mkl/lib/intel64"] compiler_args += ['-DCBLAS'] - libraries += ['cblas'] - #libraries += ['mkl_rt'] + compiler_args += ['-DMKL'] else: - libraries += ['blas'] + libraries += ['lapack'] + if 'CBLAS' in environ: + compiler_args += ['-DCBLAS'] + libraries += ['cblas'] + #libraries += ['mkl_rt'] + else: + libraries += ['blas'] - if 'LAPACK_C' in environ: - compiler_args += ['-DLAPACK_C'] + if 'LAPACK_C' in environ: + compiler_args += ['-DLAPACK_C'] parallel_modifiers = ('PARALLEL', 'PARALLEL_EXTRAPOLATION', 'PARALLEL_EXTRA_DIMS') @@ -45,6 +54,7 @@ def main(): name, sources = sources, include_dirs = include_dirs, + library_dirs = library_dirs, libraries = libraries, extra_compile_args = compiler_args, extra_link_args = linker_args diff --git a/test/benchmark.py b/test/benchmark.py index 90b470131c81e315f380333c8249750699b3e740..982e6c017f968baef68be076997dad79d7ff32e9 100644 --- a/test/benchmark.py +++ b/test/benchmark.py @@ -21,24 +21,28 @@ def benchmark_kondo(parameters, solver_opts, save=False): kondo = Kondo(**parameters) pr = cProfile.Profile() pr.enable() - kondo.run(**solver_opts) + try: + kondo.run(**solver_opts) + except KeyboardInterrupt as exception: + settings.logger.exception("Benchmark interrupted!") pr.disable() ps = pstats.Stats(pr).sort_stats(SortKey.CUMULATIVE) return kondo, ps def main(list_length=100): - solver_opts = dict(rtol=1e-8, atol=1e-10) + solver_opts = dict(rtol=1e-6, atol=1e-8) parameter_dicts = [ - dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0, include_Ga=True, solve_integral_exactly=True, integral_method=-1), - dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0), - dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 0, compact=0), + #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0, include_Ga=True, solve_integral_exactly=True, integral_method=-1), + #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation=False, padding= 0, compact=0), + #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 0, compact=0), #dict(nmax=12, voltage_branches=4, omega=10., d=1e9, vac= 5., vdc=4., unitary_transformation= True, padding= 8, compact=0), #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation=False, padding= 0, compact=0), #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation= True, padding= 0, compact=0), #dict(nmax=24, voltage_branches=4, omega=10., d=1e9, vac=12., vdc=6., unitary_transformation= True, padding=16, compact=0), - dict(nmax=24, voltage_branches=0, omega=10., d=1e9, vac=12., vdc=0., unitary_transformation= True, padding=16, compact=0), - dict(nmax=32, voltage_branches=0, omega=10., d=1e9, vac=20., vdc=0., unitary_transformation= True, padding=24, compact=2), - #dict(nmax=128, voltage_branches=0, omega=10., d=1e9, vac=150., vdc=0., unitary_transformation=True, padding=80, compact=2), + #dict(nmax=24, voltage_branches=0, omega=10., d=1e9, vac=12., vdc=0., unitary_transformation= True, padding=16, compact=0), + #dict(nmax=32, voltage_branches=0, omega=10., d=1e9, vac=20., vdc=0., unitary_transformation= True, padding=24, compact=2), + #dict(nmax=128, voltage_branches=0, omega=10., d=1e6, vac=150., vdc=0., unitary_transformation=True, padding=80, compact=2), + dict(nmax=400, voltage_branches=0, omega=10., d=1e6, vac=250., vdc=0., unitary_transformation=True, padding=600, compact=2, include_Ga=True), ] for parameters in parameter_dicts: settings.logger.info("Starting:") diff --git a/test/benchmark_rtrg_c.py b/test/benchmark_rtrg_c.py index 5860ec465257af827734a373675e25723aff48eb..067f382a58ef58808c72a95372ea0e567f5d1751 100644 --- a/test/benchmark_rtrg_c.py +++ b/test/benchmark_rtrg_c.py @@ -3,7 +3,7 @@ Benchmark for matrix multiplication from rtrg_c. """ -from test import * +from test_rtrg_c import * from timeit import timeit from time import process_time @@ -162,11 +162,13 @@ def benchmark_sym( if __name__ == '__main__': + # For large matrices intel MKL is faster than openBLAS benchmark( shots=5, runs=10, n=2000, k=2000, m=2000, p=1600) benchmark_sym(shots=5, runs=10, n=2000, k=2000, m=2000, p=1600) benchmark( shots=10, runs=10, n=1000, k=1000, m=1000, p=600) benchmark_sym(shots=10, runs=10, n=1000, k=1000, m=1000, p=600) benchmark( shots=10, runs=10, n=200, k=200, m=200, p=150, extra_dims=(15,)) benchmark_sym(shots=10, runs=10, n=200, k=200, m=200, p=150, extra_dims=(15,)) + # For these parameters openBLAS can be faster than intel MKL benchmark( shots=1000, runs=30, n=21, k=21, m=21, p=12, extra_dims=(7,)) benchmark_sym(shots=1000, runs=30, n=21, k=21, m=21, p=12, extra_dims=(7,))