new: CUBLAS version of rtrg_c

091b6194 · Valentin Bruch · 582a2179 · 091b6194 · 091b6194 · 091b6194
Commit 091b6194 authored May 6, 2021 by Valentin Bruch
--- a/cuda_helpers.cu
+++ b/cuda_helpers.cu
+#include <stdio.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <complex.h>
+#include "rtrg_cublas.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+char cuda_gemm(
+        const int nrowl,
+        const int ncolr,
+        const int ncoll,
+        const void *prefactor,
+        const void *left,
+        const int left_dim0,
+        const void *right,
+        const int right_dim0,
+        const void *weight_c,
+        void *output,
+        const int out_dim0
+        )
+{
+    complex_type_cuda *dev_left = NULL, *dev_right = NULL, *dev_out = NULL;
+    cudaError_t cuda_err;
+    cublasHandle_t handle;
+    cublasStatus_t status;
+    cuda_err = cudaMalloc(&dev_left, nrowl*ncoll*sizeof(complex_type_cuda));
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (left matrix)\n");
+        return 1;
+    }
+    cuda_err = cudaMalloc(&dev_right, ncoll*ncolr*sizeof(complex_type_cuda));
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (right matrix)\n");
+        goto cuda_error;
+    }
+    cuda_err = cudaMalloc(&dev_out, nrowl*ncolr*sizeof(complex_type_cuda));
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (output matrix)\n");
+        goto cuda_error;
+    }
+    status = cublasCreate(&handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: cublas initialization\n");
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(nrowl, ncoll, sizeof(complex_type_cuda), left, left_dim0, dev_left, nrowl);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix A) %d %d %d %d\n", nrowl, ncoll, left_dim0, nrowl);
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(ncoll, ncolr, sizeof(complex_type_cuda), right, right_dim0, dev_right, ncoll);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix B) %d %d %d %d\n", ncoll, ncolr, right_dim0, ncoll);
+        goto cuda_error;
+    }
+    /* Ugly implementation: cast to complex_type instead of complex_type_cuda because it works */
+    if (*((const complex_type*) weight_c) != 0.)
+    {
+        status = cublasSetMatrix(nrowl, ncolr, sizeof(complex_type_cuda), output, out_dim0, dev_out, nrowl);
+        if (status != CUBLAS_STATUS_SUCCESS)
+        {
+            fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix C) %d %d %d %d\n", nrowl, ncolr, out_dim0, nrowl);
+            goto cuda_error;
+        }
+    }
+
+    status = cu_gemm(
+            handle,
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_OP_N, // B is not modified (no adjoint or transpose)
+            nrowl, // rows of A (int)
+            ncolr, // columns of B (int)
+            ncoll, // columns of A = rows of B (int)
+            (const complex_type_cuda*) prefactor, // global prefactor
+            dev_left, // matrix A
+            nrowl, // first dimension of A (int)
+            dev_right, // matrix B
+            ncoll,  // first dimension of B (int)
+            (const complex_type_cuda*) weight_c, // weight of C
+            dev_out, // matrix C
+            nrowl // first dimension of C
+            );
+    cudaFree(dev_left);
+    cudaFree(dev_right);
+    if (status == CUBLAS_STATUS_SUCCESS)
+        status = cublasGetMatrix(nrowl, ncolr, sizeof(complex_type_cuda), dev_out, nrowl, output, out_dim0);
+    else
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: gemm\n");
+    cudaFree(dev_out);
+    cublasDestroy(handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: gemm or get matrix\n");
+        return 2;
+    }
+
+    return 0;
+
+cuda_error:
+    if (dev_left)
+        cudaFree(dev_left);
+    if (dev_right)
+        cudaFree(dev_right);
+    if (dev_out)
+        cudaFree(dev_out);
+    /* handles is only created if memory for all arrays was allocated successfully */
+    if (dev_left && dev_right && dev_out)
+        cublasDestroy(handle);
+    return 1;
+}
+
+
+char cuda_trmm(
+        const int flags,
+        const int nrowb,
+        const int ncolb,
+        const void *prefactor,
+        const void *a,
+        const int a_dim0,
+        void *b,
+        const int b_dim0
+        )
+{
+    complex_type_cuda *dev_a = NULL, *dev_b = NULL;
+    cudaError_t cuda_err;
+    cublasHandle_t handle;
+    cublasStatus_t status;
+    const int sizea = (flags & LeftMultiplication) ? nrowb : ncolb;
+    cuda_err = cudaMalloc(&dev_a, sizea*sizea*sizeof(complex_type_cuda));
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: GPU memory allocation (matrix A)\n");
+        return 1;
+    }
+    cuda_err = cudaMalloc(&dev_b, ncolb*nrowb*sizeof(complex_type_cuda));
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: GPU memory allocation (matrix B)\n");
+        goto cuda_error;
+    }
+    status = cublasCreate(&handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: cublas initialization\n");
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(sizea, sizea, sizeof(complex_type_cuda), a, a_dim0, dev_a, sizea);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: set matrix (matrix A)\n");
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(nrowb, ncolb, sizeof(complex_type_cuda), b, b_dim0, dev_b, nrowb);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: set matrix (matrix B)\n");
+        goto cuda_error;
+    }
+
+    status = cu_trmm(
+            handle,
+            (flags & LeftMultiplication) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, // order AB or BA
+            (flags & UpperTriangular) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER, // A is upper or lower triangular matrix
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            (flags & UnitTriangular) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT, // A is usually not unit triangular
+            nrowb, // rows of B (int)
+            ncolb, // columns of B (int)
+            (const complex_type_cuda*) prefactor, // global prefactor
+            dev_a, // matrix A
+            sizea, // first dimension of A (int)
+            dev_b, // matrix B (input)
+            nrowb,  // first dimension of B (int)
+            dev_b, // matrix B (output)
+            nrowb  // first dimension of B (int)
+            );
+
+    cudaFree(dev_a);
+    if (status == CUBLAS_STATUS_SUCCESS)
+        status = cublasGetMatrix(nrowb, ncolb, sizeof(complex_type_cuda), dev_b, nrowb, b, b_dim0);
+    else
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: trmm\n");
+    cudaFree(dev_b);
+    cublasDestroy(handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: trmm or get matrix\n");
+        return 2;
+    }
+
+    return 0;
+
+cuda_error:
+    if (dev_a)
+        cudaFree(dev_a);
+    if (dev_b)
+        cudaFree(dev_b);
+    /* handles is only created if memory for all arrays was allocated successfully */
+    if (dev_a && dev_b)
+        cublasDestroy(handle);
+    return 1;
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
--- a/rtrg_cublas.c
+++ b/rtrg_cublas.c
--- a/rtrg_cublas.h
+++ b/rtrg_cublas.h
+/* CONFIGURATION
+ * The following variables can be defined:
+ *   CBLAS: if defined, use CBLAS instead of directly calling BLAS functions
+ *   LAPACK_C: include LAPACK C header instead of just linking to LAPACK
+ *   PARALLEL_EXTRA_DIMS: use openmp to parallelize repeated operations over
+ *          extra dimensions of arrays. Note that internal parallelization of
+ *          BLAS functions might be faster.
+ *   DEBUG: print debugging information to stderr. This is neither complete
+ *          nor really useful.
+ * The following macros can be redefined to optimize performance, adapt to your
+ * BLAS and LAPACK installation, and adapt to the concrete mathematical problem.
+ *   TRIANGULAR_OPTIMIZE_THRESHOLD: Threshold for subdividing multiplication
+ *          of two triangular matrices (see below).
+ *   extrapolate: function for extrapolation of unknown matrix elements
+ *   complex_type, NPY_COMPLEX_TYPE: data type or basically everything
+ *   gemm, trmm: (C)BLAS function names (not CUDA!), need to be adapted to complex_type.
+ *   getrf, getri: LAPACK function names, need to be adapted to complex_type.
+ */
+
+
+/* Threshold for subdividing multiplication of two triangular matrices
+ * (multiply_LU_inplace and multiply_UL_inplace).
+ * The optimal value for this probably depends on the parallelization
+ * used in BLAS functions. When using a GPU for matrix multiplication,
+ * you should probably choose a large value here. Be aware that the
+ * functions implemented here may be slow on a GPU.
+ * If a matrix is smaller (less rows/columns) than this threshold, then
+ * trmm from BLAS is used directly, which discards the fact that the
+ * left matrix is also triangular. Otherwise the problem is recursively
+ * subdivided. */
+#define TRIANGULAR_OPTIMIZE_THRESHOLD 128
+
+#define THRESHOLD_GEMM_GPU 512
+#define THRESHOLD_TRMM_GPU 768
+
+/* Simple linear extrapolation based on the last 3 elements.
+ * Given the mapping {0:a, -1:b, -2:c} estimate the value at i. */
+#define extrapolate(i, a, b, c) ((1 + 0.75*i)*(a) - 0.5*i*((b) + 0.5*(c)))
+
+
+/* NOTE: complex_type and complex_type_cuda must be equivalent! */
+#define complex_type complex double
+#define complex_type_cuda cuDoubleComplex
+#define NPY_COMPLEX_TYPE NPY_COMPLEX128
+#define lapack_complex_double complex double
+
+#define cu_gemm cublasZgemm
+#define cu_trmm cublasZtrmm
+
+#ifdef LAPACK_C
+#define getrf LAPACK_zgetrf
+#define getri LAPACK_zgetri
+#else /* LAPACK_C */
+#define getrf zgetrf_
+#define getri zgetri_
+#endif /* LAPACK_C */
+
+#ifdef CBLAS
+#define gemm cblas_zgemm
+#define trmm cblas_ztrmm
+#else /* CBLAS */
+#define gemm zgemm_
+#define trmm ztrmm_
+#endif /* CBLAS */
+
+
+enum
+{
+    UpperTriangular = 1 << 0,
+    LeftMultiplication = 1 << 1,
+    UnitTriangular = 1 << 2,
+};
--- a/rtrg_cublas.makefile
+++ b/rtrg_cublas.makefile
+#!/bin/make -f
+all: libcuda_helpers.a build
+
+libcuda_helpers.a: cuda_helpers.cu rtrg_cublas.h
+	rm -f libcuda_helpers.a
+	CC=icc nvcc -rdc=true --compiler-options '-fPIC' -c -o temp.o cuda_helpers.cu
+	LDSHARED=icc nvcc -dlink --compiler-options '-fPIC' -o cuda_helpers.o temp.o -lcublas -lcudart
+	ar crs libcuda_helpers.a cuda_helpers.o temp.o
+
+build: libcuda_helpers.a rtrg_cublas.c rtrg_cublas.h
+	python setup_cublas.py build_ext --inplace
--- a/setup_cublas.py
+++ b/setup_cublas.py
+# build with:
+# LDSHARED=icc python3 setup.py build_ext --inplace
+
+from distutils.core import setup, Extension
+import numpy as np
+from os import environ
+
+compiler_args = ['-O3','-Wall','-Wextra','-std=c11']
+linker_args = []
+include_dirs = [np.get_include(),'.']
+name = 'rtrg_c'
+sources = ['rtrg_cublas.c']
+
+include_dirs += [environ.get('CUDA_PATH')+'/include']
+
+if environ.get('DEBUG', ''):
+    print('using -DDEBUG')
+    compiler_args += ['-DDEBUG']
+
+if environ.get('PARALLEL_EXTRA_DIMS', ''):
+    print('using -DPARALLEL_EXTRA_DIMS')
+    compiler_args += ['-fopenmp','-DPARALLEL_EXTRA_DIMS']
+    linker_args += ['-fopenmp']
+
+#linker_args += ['-lcblas']
+#linker_args += ['-L.','-lcuda_helpers','-lmkl_rt','-L'+environ.get('CUDA_PATH')+'/lib64','-lcublas']
+library_dirs = ['.', environ.get('CUDA_PATH')+'/lib64']
+libraries = ['cuda_helpers','cublas','cudart','mkl_rt']
+
+rtrg_c = Extension(name, sources=sources, include_dirs=include_dirs, libraries=libraries, library_dirs=library_dirs, extra_compile_args=compiler_args, extra_link_args=linker_args)
+setup(ext_modules=[rtrg_c])