diff --git a/cuda_helpers.cu b/cuda_helpers.cu index e3f31cef90012991e9da772114b64e0d5d265504..83745456c7d6e102106c334cc8aa5693322e1250 100644 --- a/cuda_helpers.cu +++ b/cuda_helpers.cu @@ -210,6 +210,503 @@ cuda_error: return 1; } +/** + * A is an upper triangular matrix. + * B is a lower triangular matrix. + * Both are in Fortran order (columns major) not unit triangular. + * The product AB overwrites A. + * Both matrices must have shape (size, size) and lie in device memory. + */ +cublasStatus_t multiply_UL_cuda_worker( + cublasHandle_t handle, + const int size, + complex_type_cuda *a, + const int a_dim0, + const complex_type_cuda *b, + const int b_dim0 + ) +{ +#ifdef DEBUG + fprintf(stderr, "Starting multiply_UL_cuda_worker %d\n", size); +#endif + static const complex_type_cuda one = make_cuDoubleComplex(1., 0.); + if (size < TRIANGULAR_OPTIMIZE_THRESHOLD_GPU) + { +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: final step %d\n", size); +#endif + return cu_trmm( + handle, + CUBLAS_SIDE_RIGHT, // order: this means B := B A + CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + size, // rows of B (int) + size, // columns of B (int) + &one, // global prefactor + b, // matrix A + b_dim0, // first dimension of A (int) + a, // matrix B + a_dim0, // first dimension of B (int) + a, // matrix B + a_dim0 // first dimension of B (int) + ); + } + /* + * Matrices are labeled as follows: + * + * A = [ Aa Ab ] B = [ Ba Bb ] + * [ Ac Ad ] [ Bc Bd ] + * Initially Ac = Bb = 0. + */ + const int + part1 = size / 2, + part2 = size - part1; + + /* Step 1: overwrite Ac with Ad Bc. + * This requires that first Bc is copied to Ac. 
*/ + a += part1; + b += part1; + int i = -1; +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: memcpy %d\n", size); +#endif + while (++i<part1) + cudaMemcpy( a + i*a_dim0, b + i*b_dim0, part2 * sizeof(complex_type_cuda), cudaMemcpyDeviceToDevice ); + a -= part1; + b -= part1; + +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: step 1 %d\n", size); +#endif + cublasStatus_t status = cu_trmm( + handle, + CUBLAS_SIDE_LEFT, // order: this means B := A B + CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + part2, // rows of B (int) + part1, // columns of B (int) + &one, // global prefactor + a + part1*(1 + a_dim0), // matrix A + a_dim0, // first dimension of A (int) + a + part1, // matrix B + a_dim0, // first dimension of B (int) + a + part1, // matrix B + a_dim0 // first dimension of B (int) + ); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 2: overwrite Ad with Ad Bd */ +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: step 2 %d\n", size); +#endif + status = multiply_UL_cuda_worker(handle, part2, a + (a_dim0+1)*part1, a_dim0, b + (b_dim0+1)*part1, b_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 3: overwrite Aa with Aa Ba */ +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: step 3 %d\n", size); +#endif + status = multiply_UL_cuda_worker(handle, part1, a, a_dim0, b, b_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 4: add Ab Bc to Aa */ +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: step 4 %d\n", size); +#endif + status = cu_gemm( + handle, + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_OP_N, // B is not modified (no adjoint or transpose) + part1, // rows of A (int) + part1, // columns of B (int) + part2, // columns of A = rows of B (int) + &one, // global prefactor + a + part1*a_dim0, // matrix A + a_dim0, 
// first dimension of A (int) + b + part1, // matrix B + b_dim0, // first dimension of B (int) + &one, // weight of C + a, // matrix C + a_dim0 // first dimension of C + ); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 5: overwrite Ab with Ab Bd */ +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: step 5 %d\n", size); +#endif + status = cu_trmm( + handle, + CUBLAS_SIDE_RIGHT, // order: this means B := B A + CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + part1, // rows of B (int) + part2, // columns of B (int) + &one, // global prefactor + b + (1 + b_dim0)*part1, // matrix A + b_dim0, // first dimension of A (int) + a + part1*a_dim0, // matrix B + a_dim0, // first dimension of B (int) + a + part1*a_dim0, // matrix B + a_dim0 // first dimension of B (int) + ); +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda_worker: done %d\n", size); +#endif + return status; +} + +int multiply_UL_cuda( + const int size, + void *a, + const int a_dim0, + const void *b, + const int b_dim0 + ) +{ +#ifdef DEBUG + fprintf(stderr, "Entering multiply_UL_cuda %d %d %d\n", size, a_dim0, b_dim0); +#endif + if (a_dim0 < size || b_dim0 < size) + return 1; + complex_type_cuda *dev_a = NULL, *dev_b = NULL; + cudaError_t cuda_err; + cublasHandle_t handle; + cublasStatus_t status; +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: allocating device memory\n"); +#endif + cuda_err = cudaMalloc(&dev_a, size*size*sizeof(complex_type_cuda)); + if (cuda_err != cudaSuccess) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: GPU memory allocation (matrix A)\n"); + return 1; + } + cuda_err = cudaMalloc(&dev_b, size*size*sizeof(complex_type_cuda)); + if (cuda_err != cudaSuccess) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: GPU memory allocation (matrix B)\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, 
"multiply_UL_cuda: creating handle\n"); +#endif + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: cublas initialization\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: copying data to device\n"); +#endif + status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), a, a_dim0, dev_a, size); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: set matrix (matrix A)\n"); + goto cuda_error; + } + status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), b, b_dim0, dev_b, size); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: set matrix (matrix B)\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: calling worker\n"); +#endif + if (multiply_UL_cuda_worker(handle, size, dev_a, size, dev_b, size)) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: worker\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: copying data to host\n"); +#endif + status = cublasGetMatrix(size, size, sizeof(complex_type_cuda), dev_a, size, a, a_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: get matrix\n"); + goto cuda_error; + } + +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: cleaning up\n"); +#endif + cudaFree(dev_a); + cudaFree(dev_b); + cublasDestroy(handle); +#ifdef DEBUG + fprintf(stderr, "multiply_UL_cuda: done\n"); +#endif + return 0; + +cuda_error: + if (dev_a) + cudaFree(dev_a); + if (dev_b) + cudaFree(dev_b); + if (dev_a && dev_b) + cublasDestroy(handle); + return 1; +} + +/** + * A is a lower triangular matrix. + * B is an upper triangular matrix. + * Both are in Fortran order (columns major) not unit triangular. + * The product AB overwrites A. + * Both matrices must have shape (size, size) and lie in device memory. 
+ */ +cublasStatus_t multiply_LU_cuda_worker( + cublasHandle_t handle, + const int size, + complex_type_cuda *a, + const int a_dim0, + const complex_type_cuda *b, + const int b_dim0 + ) +{ +#ifdef DEBUG + fprintf(stderr, "Starting multiply_LU_cuda_worker %d\n", size); +#endif + static const complex_type_cuda one = make_cuDoubleComplex(1., 0.); + if (size < TRIANGULAR_OPTIMIZE_THRESHOLD_GPU) + { +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: final step %d\n", size); +#endif + return cu_trmm( + handle, + CUBLAS_SIDE_RIGHT, // order: this means B := B A + CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + size, // rows of B (int) + size, // columns of B (int) + &one, // global prefactor + b, // matrix A + b_dim0, // first dimension of A (int) + a, // matrix B + a_dim0, // first dimension of B (int) + a, // matrix B + a_dim0 // first dimension of B (int) + ); + } + /* + * Matrices are labeled as follows: + * + * A = [ Aa Ab ] B = [ Ba Bb ] + * [ Ac Ad ] [ Bc Bd ] + * Initially Ac = Bb = 0. + */ + const int + part1 = size / 2, + part2 = size - part1; + + /* Step 1: overwrite Ab with Aa Bb. + * This requires that first Bb is copied to Ab. 
*/ + a += a_dim0*part1; + b += b_dim0*part1; + int i = -1; +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: memcpy %d\n", size); +#endif + while (++i<part2) + cudaMemcpy( a + i*a_dim0, b + i*b_dim0, part1 * sizeof(complex_type_cuda), cudaMemcpyDeviceToDevice ); + a -= a_dim0*part1; + b -= b_dim0*part1; + +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: step 1 %d\n", size); +#endif + cublasStatus_t status = cu_trmm( + handle, + CUBLAS_SIDE_LEFT, // order: this means B := A B + CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + part1, // rows of B (int) + part2, // columns of B (int) + &one, // global prefactor + a, // matrix A + a_dim0, // first dimension of A (int) + a + a_dim0*part1, // matrix B + a_dim0, // first dimension of B (int) + a + a_dim0*part1, // matrix B + a_dim0 // first dimension of B (int) + ); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 2: overwrite Aa with Aa Ba */ +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: step 2 %d\n", size); +#endif + status = multiply_LU_cuda_worker(handle, part1, a, a_dim0, b, b_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 3: overwrite Ad with Ad Bd */ +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: step 3 %d\n", size); +#endif + status = multiply_LU_cuda_worker(handle, part2, a + (a_dim0+1)*part1, a_dim0, b + (b_dim0+1)*part1, b_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 4: add Ac Bb to Ad */ +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: step 4 %d\n", size); +#endif + status = cu_gemm( + handle, + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_OP_N, // B is not modified (no adjoint or transpose) + part2, // rows of A (int) + part2, // columns of B (int) + part1, // columns of A = rows of B (int) + &one, // global prefactor + a + part1, // 
matrix A + a_dim0, // first dimension of A (int) + b + part1*b_dim0, // matrix B + b_dim0, // first dimension of B (int) + &one, // weight of C + a + (a_dim0+1)*part1, // matrix C + a_dim0 // first dimension of C + ); + if (status != CUBLAS_STATUS_SUCCESS) + return status; + + /* Step 5: overwrite Ac with Ac Ba */ +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: step 5 %d\n", size); +#endif + status = cu_trmm( + handle, + CUBLAS_SIDE_RIGHT, // order: this means B := B A + CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix + CUBLAS_OP_N, // A is not modified (no adjoint or transpose) + CUBLAS_DIAG_NON_UNIT, // A is not unit triangular + part2, // rows of B (int) + part1, // columns of B (int) + &one, // global prefactor + b, // matrix A + b_dim0, // first dimension of A (int) + a + part1, // matrix B + a_dim0, // first dimension of B (int) + a + part1, // matrix B + a_dim0 // first dimension of B (int) + ); +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda_worker: done %d\n", size); +#endif + return status; +} + +int multiply_LU_cuda( + const int size, + void *a, + const int a_dim0, + const void *b, + const int b_dim0 + ) +{ +#ifdef DEBUG + fprintf(stderr, "Entering multiply_LU_cuda %d %d %d\n", size, a_dim0, b_dim0); +#endif + if (a_dim0 < size || b_dim0 < size) + return 1; + complex_type_cuda *dev_a = NULL, *dev_b = NULL; + cudaError_t cuda_err; + cublasHandle_t handle; + cublasStatus_t status; +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: allocating device memory\n"); +#endif + cuda_err = cudaMalloc(&dev_a, size*size*sizeof(complex_type_cuda)); + if (cuda_err != cudaSuccess) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: GPU memory allocation (matrix A)\n"); + return 1; + } + cuda_err = cudaMalloc(&dev_b, size*size*sizeof(complex_type_cuda)); + if (cuda_err != cudaSuccess) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: GPU memory allocation (matrix B)\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, 
"multiply_LU_cuda: creating handle\n"); +#endif + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: cublas initialization\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: copying data to device\n"); +#endif + status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), a, a_dim0, dev_a, size); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: set matrix (matrix A)\n"); + goto cuda_error; + } + status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), b, b_dim0, dev_b, size); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: set matrix (matrix B)\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: calling worker\n"); +#endif + if (multiply_LU_cuda_worker(handle, size, dev_a, size, dev_b, size)) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: worker\n"); + goto cuda_error; + } +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: copying data to host\n"); +#endif + status = cublasGetMatrix(size, size, sizeof(complex_type_cuda), dev_a, size, a, a_dim0); + if (status != CUBLAS_STATUS_SUCCESS) + { + fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: get matrix\n"); + goto cuda_error; + } + +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: cleaning up\n"); +#endif + cudaFree(dev_a); + cudaFree(dev_b); + cublasDestroy(handle); +#ifdef DEBUG + fprintf(stderr, "multiply_LU_cuda: done\n"); +#endif + return 0; + +cuda_error: + if (dev_a) + cudaFree(dev_a); + if (dev_b) + cudaFree(dev_b); + if (dev_a && dev_b) + cublasDestroy(handle); + return 1; +} + + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/rtrg_cublas.c b/rtrg_cublas.c index 1247ab051aa4312152461d568bd97040fc6c8dc4..1094434dc01906227a9ee7a8483fb27f3bfa9c01 100644 --- a/rtrg_cublas.c +++ b/rtrg_cublas.c @@ -71,6 +71,22 @@ extern char cuda_trmm( const int 
b_dim0 ); +extern int multiply_UL_cuda( + const int size, + void *a, + const int a_dim0, + const void *b, + const int b_dim0 + ); + +extern int multiply_LU_cuda( + const int size, + void *a, + const int a_dim0, + const void *b, + const int b_dim0 + ); + static const complex_type zero = 0.; static const complex_type one = 1.; @@ -780,7 +796,7 @@ static int multiply_LU_inplace( trmm( CblasColMajor, // layout CblasRight, // order: this means B := B A - CblasUpper, // A is a lower triangular matrix + CblasUpper, // A is an upper triangular matrix CblasNoTrans, // A is not modified (no adjoint or transpose) CblasNonUnit, // A is not unit triangular size, // rows of B (int) @@ -823,7 +839,7 @@ static int multiply_LU_inplace( * This requires that first Bb is copied to Ab. */ a += a_dim0*part1; b += b_dim0*part1; - int i=-1; + int i = -1; while (++i<part2) memcpy( a + i*a_dim0, b + i*b_dim0, part1 * sizeof(complex_type) ); a -= a_dim0*part1; @@ -1564,13 +1580,22 @@ static int multiply_extended_worker( #ifdef DEBUG fprintf(stderr, "Calling multiply_UL_inplace for extrapolated matrices\n"); #endif - if (multiply_UL_inplace( - cutoff, // size - auxarr_left, // matrix A - aux_dim0, // first dimension of A (int) - auxarr_right, // matrix B - aux_dim0 // first dimension of B (int) - )) + if (cutoff < LU_ON_GPU_THRESHOLD + ? multiply_UL_inplace( + cutoff, // size + auxarr_left, // matrix A + aux_dim0, // first dimension of A (int) + auxarr_right, // matrix B + aux_dim0 // first dimension of B (int) + ) + : multiply_UL_cuda( + cutoff, // size + auxarr_left, // matrix A + aux_dim0, // first dimension of A (int) + auxarr_right, // matrix B + aux_dim0 // first dimension of B (int) + ) + ) return 1; /* Add result to top left of output array. 
*/ @@ -1599,13 +1624,22 @@ static int multiply_extended_worker( #ifdef DEBUG fprintf(stderr, "Calling multiply_LU_inplace for extrapolated matrices\n"); #endif - if (multiply_LU_inplace( - cutoff, - auxarr_left, - aux_dim0, - auxarr_right, - aux_dim0 - )) + if (cutoff < LU_ON_GPU_THRESHOLD + ? multiply_LU_inplace( + cutoff, + auxarr_left, + aux_dim0, + auxarr_right, + aux_dim0 + ) + : multiply_LU_cuda( + cutoff, + auxarr_left, + aux_dim0, + auxarr_right, + aux_dim0 + ) + ) return 1; /* Add result to bottom right of output array. */ @@ -2108,13 +2142,22 @@ static int multiply_symmetric_worker( if (i) return i; - if (multiply_UL_inplace( - cutoff, - auxmatrixl, - aux_dim0, - auxmatrixr, - aux_dim0 - )) + if (cutoff < LU_ON_GPU_THRESHOLD + ? multiply_UL_inplace( + cutoff, + auxmatrixl, + aux_dim0, + auxmatrixr, + aux_dim0 + ) + : multiply_UL_cuda( + cutoff, + auxmatrixl, + aux_dim0, + auxmatrixr, + aux_dim0 + ) + ) return 1; /* Add result to top left of output array. */ diff --git a/rtrg_cublas.h b/rtrg_cublas.h index 7b26ec31832695584b71db612fda165c35b1a74b..1f7426cd8e6f49f71aed2f62da7046e2aebb9372 100644 --- a/rtrg_cublas.h +++ b/rtrg_cublas.h @@ -29,6 +29,8 @@ * left matrix is also triangular. Otherwise the problem is recursively * subdivided. */ #define TRIANGULAR_OPTIMIZE_THRESHOLD 128 +#define TRIANGULAR_OPTIMIZE_THRESHOLD_GPU 128 +#define LU_ON_GPU_THRESHOLD 768 #define THRESHOLD_GEMM_GPU 512 #define THRESHOLD_TRMM_GPU 768