Skip to content
Snippets Groups Projects
Commit 091b6194 authored by Valentin Bruch's avatar Valentin Bruch
Browse files

new: CUBLAS version of rtrg_c

parent 582a2179
No related branches found
No related tags found
No related merge requests found
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#include <complex.h>
#include "rtrg_cublas.h"
#ifdef __cplusplus
extern "C" {
#endif
char cuda_gemm(
const int nrowl,
const int ncolr,
const int ncoll,
const void *prefactor,
const void *left,
const int left_dim0,
const void *right,
const int right_dim0,
const void *weight_c,
void *output,
const int out_dim0
)
{
complex_type_cuda *dev_left = NULL, *dev_right = NULL, *dev_out = NULL;
cudaError_t cuda_err;
cublasHandle_t handle;
cublasStatus_t status;
cuda_err = cudaMalloc(&dev_left, nrowl*ncoll*sizeof(complex_type_cuda));
if (cuda_err != cudaSuccess)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (left matrix)\n");
return 1;
}
cuda_err = cudaMalloc(&dev_right, ncoll*ncolr*sizeof(complex_type_cuda));
if (cuda_err != cudaSuccess)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (right matrix)\n");
goto cuda_error;
}
cuda_err = cudaMalloc(&dev_out, nrowl*ncolr*sizeof(complex_type_cuda));
if (cuda_err != cudaSuccess)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: GPU memory allocation (output matrix)\n");
goto cuda_error;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: cublas initialization\n");
goto cuda_error;
}
status = cublasSetMatrix(nrowl, ncoll, sizeof(complex_type_cuda), left, left_dim0, dev_left, nrowl);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix A) %d %d %d %d\n", nrowl, ncoll, left_dim0, nrowl);
goto cuda_error;
}
status = cublasSetMatrix(ncoll, ncolr, sizeof(complex_type_cuda), right, right_dim0, dev_right, ncoll);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix B) %d %d %d %d\n", ncoll, ncolr, right_dim0, ncoll);
goto cuda_error;
}
/* Ugly implementation: cast to complex_type instead of complex_type_cuda because it works */
if (*((const complex_type*) weight_c) != 0.)
{
status = cublasSetMatrix(nrowl, ncolr, sizeof(complex_type_cuda), output, out_dim0, dev_out, nrowl);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: set matrix (matrix C) %d %d %d %d\n", nrowl, ncolr, out_dim0, nrowl);
goto cuda_error;
}
}
status = cu_gemm(
handle,
CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
CUBLAS_OP_N, // B is not modified (no adjoint or transpose)
nrowl, // rows of A (int)
ncolr, // columns of B (int)
ncoll, // columns of A = rows of B (int)
(const complex_type_cuda*) prefactor, // global prefactor
dev_left, // matrix A
nrowl, // first dimension of A (int)
dev_right, // matrix B
ncoll, // first dimension of B (int)
(const complex_type_cuda*) weight_c, // weight of C
dev_out, // matrix C
nrowl // first dimension of C
);
cudaFree(dev_left);
cudaFree(dev_right);
if (status == CUBLAS_STATUS_SUCCESS)
status = cublasGetMatrix(nrowl, ncolr, sizeof(complex_type_cuda), dev_out, nrowl, output, out_dim0);
else
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: gemm\n");
cudaFree(dev_out);
cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_gemm: gemm or get matrix\n");
return 2;
}
return 0;
cuda_error:
if (dev_left)
cudaFree(dev_left);
if (dev_right)
cudaFree(dev_right);
if (dev_out)
cudaFree(dev_out);
/* handles is only created if memory for all arrays was allocated successfully */
if (dev_left && dev_right && dev_out)
cublasDestroy(handle);
return 1;
}
char cuda_trmm(
const int flags,
const int nrowb,
const int ncolb,
const void *prefactor,
const void *a,
const int a_dim0,
void *b,
const int b_dim0
)
{
complex_type_cuda *dev_a = NULL, *dev_b = NULL;
cudaError_t cuda_err;
cublasHandle_t handle;
cublasStatus_t status;
const int sizea = (flags & LeftMultiplication) ? nrowb : ncolb;
cuda_err = cudaMalloc(&dev_a, sizea*sizea*sizeof(complex_type_cuda));
if (cuda_err != cudaSuccess)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: GPU memory allocation (matrix A)\n");
return 1;
}
cuda_err = cudaMalloc(&dev_b, ncolb*nrowb*sizeof(complex_type_cuda));
if (cuda_err != cudaSuccess)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: GPU memory allocation (matrix B)\n");
goto cuda_error;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: cublas initialization\n");
goto cuda_error;
}
status = cublasSetMatrix(sizea, sizea, sizeof(complex_type_cuda), a, a_dim0, dev_a, sizea);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: set matrix (matrix A)\n");
goto cuda_error;
}
status = cublasSetMatrix(nrowb, ncolb, sizeof(complex_type_cuda), b, b_dim0, dev_b, nrowb);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: set matrix (matrix B)\n");
goto cuda_error;
}
status = cu_trmm(
handle,
(flags & LeftMultiplication) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, // order AB or BA
(flags & UpperTriangular) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER, // A is upper or lower triangular matrix
CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
(flags & UnitTriangular) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT, // A is usually not unit triangular
nrowb, // rows of B (int)
ncolb, // columns of B (int)
(const complex_type_cuda*) prefactor, // global prefactor
dev_a, // matrix A
sizea, // first dimension of A (int)
dev_b, // matrix B (input)
nrowb, // first dimension of B (int)
dev_b, // matrix B (output)
nrowb // first dimension of B (int)
);
cudaFree(dev_a);
if (status == CUBLAS_STATUS_SUCCESS)
status = cublasGetMatrix(nrowb, ncolb, sizeof(complex_type_cuda), dev_b, nrowb, b, b_dim0);
else
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: trmm\n");
cudaFree(dev_b);
cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "UNHANDLED ERROR in cuda_trmm: trmm or get matrix\n");
return 2;
}
return 0;
cuda_error:
if (dev_a)
cudaFree(dev_a);
if (dev_b)
cudaFree(dev_b);
/* handles is only created if memory for all arrays was allocated successfully */
if (dev_a && dev_b)
cublasDestroy(handle);
return 1;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
This diff is collapsed.
/* CONFIGURATION
* The following variables can be defined:
* CBLAS: if defined, use CBLAS instead of directly calling BLAS functions
* LAPACK_C: include LAPACK C header instead of just linking to LAPACK
* PARALLEL_EXTRA_DIMS: use openmp to parallelize repeated operations over
* extra dimensions of arrays. Note that internal parallelization of
* BLAS functions might be faster.
* DEBUG: print debugging information to stderr. This is neither complete
* nor really useful.
* The following macros can be redefined to optimize performance, adapt to your
* BLAS and LAPACK installation, and adapt to the concrete mathematical problem.
* TRIANGULAR_OPTIMIZE_THRESHOLD: Threshold for subdividing multiplication
* of two triangular matrices (see below).
* extrapolate: function for extrapolation of unknown matrix elements
* complex_type, NPY_COMPLEX_TYPE: data type or basically everything
* gemm, trmm: (C)BLAS function names (not CUDA!), need to be adapted to complex_type.
* getrf, getri: LAPACK function names, need to be adapted to complex_type.
*/
/* Threshold for subdividing multiplication of two triangular matrices
* (multiply_LU_inplace and multiply_UL_inplace).
* The optimal value for this probably depends on the parallelization
* used in BLAS functions. When using a GPU for matrix multiplication,
* you should probably choose a large value here. Be aware that the
* functions implemented here may be slow on a GPU.
* If a matrix is smaller (less rows/columns) than this threshold, then
* trmm from BLAS is used directly, which discards the fact that the
* left matrix is also triangular. Otherwise the problem is recursively
* subdivided. */
#define TRIANGULAR_OPTIMIZE_THRESHOLD 128
#define THRESHOLD_GEMM_GPU 512
#define THRESHOLD_TRMM_GPU 768
/* Simple linear extrapolation based on the last 3 elements.
* Given the mapping {0:a, -1:b, -2:c} estimate the value at i. */
#define extrapolate(i, a, b, c) ((1 + 0.75*i)*(a) - 0.5*i*((b) + 0.5*(c)))
/* NOTE: complex_type and complex_type_cuda must be equivalent! */
#define complex_type complex double
#define complex_type_cuda cuDoubleComplex
#define NPY_COMPLEX_TYPE NPY_COMPLEX128
#define lapack_complex_double complex double
#define cu_gemm cublasZgemm
#define cu_trmm cublasZtrmm
#ifdef LAPACK_C
#define getrf LAPACK_zgetrf
#define getri LAPACK_zgetri
#else /* LAPACK_C */
#define getrf zgetrf_
#define getri zgetri_
#endif /* LAPACK_C */
#ifdef CBLAS
#define gemm cblas_zgemm
#define trmm cblas_ztrmm
#else /* CBLAS */
#define gemm zgemm_
#define trmm ztrmm_
#endif /* CBLAS */
enum
{
UpperTriangular = 1 << 0,
LeftMultiplication = 1 << 1,
UnitTriangular = 1 << 2,
};
#!/bin/make -f
all: libcuda_helpers.a build
libcuda_helpers.a: cuda_helpers.cu rtrg_cublas.h
rm -f libcuda_helpers.a
CC=icc nvcc -rdc=true --compiler-options '-fPIC' -c -o temp.o cuda_helpers.cu
LDSHARED=icc nvcc -dlink --compiler-options '-fPIC' -o cuda_helpers.o temp.o -lcublas -lcudart
ar crs libcuda_helpers.a cuda_helpers.o temp.o
build: libcuda_helpers.a rtrg_cublas.c rtrg_cublas.h
python setup_cublas.py build_ext --inplace
# build with:
# LDSHARED=icc python3 setup.py build_ext --inplace
from distutils.core import setup, Extension
import numpy as np
from os import environ
compiler_args = ['-O3','-Wall','-Wextra','-std=c11']
linker_args = []
include_dirs = [np.get_include(),'.']
name = 'rtrg_c'
sources = ['rtrg_cublas.c']
include_dirs += [environ.get('CUDA_PATH')+'/include']
if environ.get('DEBUG', ''):
print('using -DDEBUG')
compiler_args += ['-DDEBUG']
if environ.get('PARALLEL_EXTRA_DIMS', ''):
print('using -DPARALLEL_EXTRA_DIMS')
compiler_args += ['-fopenmp','-DPARALLEL_EXTRA_DIMS']
linker_args += ['-fopenmp']
#linker_args += ['-lcblas']
#linker_args += ['-L.','-lcuda_helpers','-lmkl_rt','-L'+environ.get('CUDA_PATH')+'/lib64','-lcublas']
library_dirs = ['.', environ.get('CUDA_PATH')+'/lib64']
libraries = ['cuda_helpers','cublas','cudart','mkl_rt']
rtrg_c = Extension(name, sources=sources, include_dirs=include_dirs, libraries=libraries, library_dirs=library_dirs, extra_compile_args=compiler_args, extra_link_args=linker_args)
setup(ext_modules=[rtrg_c])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment