diff --git a/cuda_helpers.cu b/cuda_helpers.cu
index e3f31cef90012991e9da772114b64e0d5d265504..83745456c7d6e102106c334cc8aa5693322e1250 100644
--- a/cuda_helpers.cu
+++ b/cuda_helpers.cu
@@ -210,6 +210,503 @@ cuda_error:
     return 1;
 }
 
+/**
+ * A is an upper triangular matrix.
+ * B is a lower triangular matrix.
+ * Both are in Fortran order (column major) and not unit triangular.
+ * The product AB overwrites A; computed in place by recursive blocking.
+ * Both matrices must have shape (size, size) and lie in device memory.
+ */
+cublasStatus_t multiply_UL_cuda_worker(
+        cublasHandle_t handle,
+        const int size,
+        complex_type_cuda *a,
+        const int a_dim0,
+        const complex_type_cuda *b,
+        const int b_dim0
+        )
+{
+#ifdef DEBUG
+    fprintf(stderr, "Starting multiply_UL_cuda_worker %d\n", size);
+#endif
+    static const complex_type_cuda one = make_cuDoubleComplex(1., 0.); // scalar 1+0i, prefactor for trmm/gemm
+    if (size < TRIANGULAR_OPTIMIZE_THRESHOLD_GPU)
+    {
+#ifdef DEBUG
+        fprintf(stderr, "multiply_UL_cuda_worker: final step %d\n", size);
+#endif
+        return cu_trmm(
+                handle,
+                CUBLAS_SIDE_RIGHT, // order: this means B := B A
+                CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix
+                CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+                CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+                size, // rows of B (int)
+                size, // columns of B (int)
+                &one, // global prefactor
+                b, // matrix A
+                b_dim0, // first dimension of A (int)
+                a, // matrix B
+                a_dim0,  // first dimension of B (int)
+                a, // matrix C (same as B: product overwrites A in place)
+                a_dim0  // first dimension of C (int)
+                );
+    }
+    /*
+     * Matrices are labeled as follows:
+     *
+     * A = [ Aa  Ab ]  B = [ Ba  Bb ]
+     *     [ Ac  Ad ]      [ Bc  Bd ]
+     * Initially Ac = Bb = 0.
+     */
+    const int
+        part1 = size / 2,
+        part2 = size - part1;
+
+    /* Step 1: overwrite Ac with Ad Bc.
+     * This requires that first Bc is copied to Ac. */
+    a += part1; // point at Ac (row offset part1, column 0)
+    b += part1; // point at Bc
+    int i = -1; // column counter for the block copy below
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: memcpy %d\n", size);
+#endif
+    while (++i<part1) // NOTE(review): cudaMemcpy return values are ignored — TODO confirm acceptable
+        cudaMemcpy( a + i*a_dim0, b + i*b_dim0, part2 * sizeof(complex_type_cuda), cudaMemcpyDeviceToDevice );
+    a -= part1;
+    b -= part1;
+
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: step 1 %d\n", size);
+#endif
+    cublasStatus_t status = cu_trmm(
+            handle,
+            CUBLAS_SIDE_LEFT, // order: this means B := A B
+            CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+            part2, // rows of B (int)
+            part1, // columns of B (int)
+            &one, // global prefactor
+            a + part1*(1 + a_dim0), // matrix A = Ad (upper triangular)
+            a_dim0, // first dimension of A (int)
+            a + part1, // matrix B = Ac (currently holds the copy of Bc)
+            a_dim0,  // first dimension of B (int)
+            a + part1, // matrix C (in place)
+            a_dim0  // first dimension of C (int)
+            );
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 2: overwrite Ad with Ad Bd */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: step 2 %d\n", size);
+#endif
+    status = multiply_UL_cuda_worker(handle, part2, a + (a_dim0+1)*part1, a_dim0, b + (b_dim0+1)*part1, b_dim0); // recurse on bottom-right blocks
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 3: overwrite Aa with Aa Ba */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: step 3 %d\n", size);
+#endif
+    status = multiply_UL_cuda_worker(handle, part1, a, a_dim0, b, b_dim0); // recurse on top-left blocks
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 4: add Ab Bc to Aa */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: step 4 %d\n", size);
+#endif
+    status = cu_gemm(
+            handle,
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_OP_N, // B is not modified (no adjoint or transpose)
+            part1, // rows of A (int)
+            part1, // columns of B (int)
+            part2, // columns of A = rows of B (int)
+            &one, // global prefactor
+            a + part1*a_dim0, // matrix A = Ab
+            a_dim0, // first dimension of A (int)
+            b + part1, // matrix B = Bc
+            b_dim0,  // first dimension of B (int)
+            &one, // weight of C
+            a, // matrix C = Aa
+            a_dim0 // first dimension of C
+            );
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 5: overwrite Ab with Ab Bd */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: step 5 %d\n", size);
+#endif
+    status = cu_trmm(
+            handle,
+            CUBLAS_SIDE_RIGHT, // order: this means B := B A
+            CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+            part1, // rows of B (int)
+            part2, // columns of B (int)
+            &one, // global prefactor
+            b + (1 + b_dim0)*part1, // matrix A = Bd (lower triangular)
+            b_dim0, // first dimension of A (int)
+            a + part1*a_dim0, // matrix B = Ab
+            a_dim0,  // first dimension of B (int)
+            a + part1*a_dim0, // matrix C (in place)
+            a_dim0  // first dimension of C (int)
+            );
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda_worker: done %d\n", size);
+#endif
+    return status;
+}
+
+int multiply_UL_cuda(
+        const int size, // A and B have shape (size, size)
+        void *a, // upper triangular matrix A (column major, host memory); overwritten with AB
+        const int a_dim0, // first dimension of A, must be >= size
+        const void *b, // lower triangular matrix B (column major, host memory)
+        const int b_dim0 // first dimension of B, must be >= size
+        )
+{
+#ifdef DEBUG
+    fprintf(stderr, "Entering multiply_UL_cuda %d %d %d\n", size, a_dim0, b_dim0);
+#endif
+    if (size < 0 || a_dim0 < size || b_dim0 < size)
+        return 1;
+    complex_type_cuda *dev_a = NULL, *dev_b = NULL;
+    cudaError_t cuda_err;
+    cublasHandle_t handle = NULL; // stays NULL until cublasCreate succeeds
+    cublasStatus_t status;
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: allocating device memory\n");
+#endif
+    cuda_err = cudaMalloc(&dev_a, (size_t)size*size*sizeof(complex_type_cuda)); // size_t cast avoids int overflow
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: GPU memory allocation (matrix A)\n");
+        return 1;
+    }
+    cuda_err = cudaMalloc(&dev_b, (size_t)size*size*sizeof(complex_type_cuda)); // size_t cast avoids int overflow
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: GPU memory allocation (matrix B)\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: creating handle\n");
+#endif
+    status = cublasCreate(&handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: cublas initialization\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: copying data to device\n");
+#endif
+    status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), a, a_dim0, dev_a, size);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: set matrix (matrix A)\n");
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), b, b_dim0, dev_b, size);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: set matrix (matrix B)\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: calling worker\n");
+#endif
+    if (multiply_UL_cuda_worker(handle, size, dev_a, size, dev_b, size)) // nonzero cublasStatus_t means failure
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: worker\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: copying data to host\n");
+#endif
+    status = cublasGetMatrix(size, size, sizeof(complex_type_cuda), dev_a, size, a, a_dim0);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_UL_cuda: get matrix\n");
+        goto cuda_error;
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: cleaning up\n");
+#endif
+    cudaFree(dev_a);
+    cudaFree(dev_b);
+    cublasDestroy(handle);
+#ifdef DEBUG
+    fprintf(stderr, "multiply_UL_cuda: done\n");
+#endif
+    return 0;
+
+cuda_error:
+    if (dev_a)
+        cudaFree(dev_a);
+    if (dev_b)
+        cudaFree(dev_b);
+    if (handle) // only destroy a handle that was actually created
+        cublasDestroy(handle);
+    return 1;
+}
+
+/**
+ * A is a lower triangular matrix.
+ * B is an upper triangular matrix.
+ * Both are in Fortran order (column major) and not unit triangular.
+ * The product AB overwrites A; computed in place by recursive blocking.
+ * Both matrices must have shape (size, size) and lie in device memory.
+ */
+cublasStatus_t multiply_LU_cuda_worker(
+        cublasHandle_t handle,
+        const int size,
+        complex_type_cuda *a,
+        const int a_dim0,
+        const complex_type_cuda *b,
+        const int b_dim0
+        )
+{
+#ifdef DEBUG
+    fprintf(stderr, "Starting multiply_LU_cuda_worker %d\n", size);
+#endif
+    static const complex_type_cuda one = make_cuDoubleComplex(1., 0.); // scalar 1+0i, prefactor for trmm/gemm
+    if (size < TRIANGULAR_OPTIMIZE_THRESHOLD_GPU)
+    {
+#ifdef DEBUG
+        fprintf(stderr, "multiply_LU_cuda_worker: final step %d\n", size);
+#endif
+        return cu_trmm(
+                handle,
+                CUBLAS_SIDE_RIGHT, // order: this means B := B A
+                CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix
+                CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+                CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+                size, // rows of B (int)
+                size, // columns of B (int)
+                &one, // global prefactor
+                b, // matrix A
+                b_dim0, // first dimension of A (int)
+                a, // matrix B
+                a_dim0,  // first dimension of B (int)
+                a, // matrix C (same as B: product overwrites A in place)
+                a_dim0  // first dimension of C (int)
+                );
+    }
+    /*
+     * Matrices are labeled as follows:
+     *
+     * A = [ Aa  Ab ]  B = [ Ba  Bb ]
+     *     [ Ac  Ad ]      [ Bc  Bd ]
+     * Initially Ab = Bc = 0.
+     */
+    const int
+        part1 = size / 2,
+        part2 = size - part1;
+
+    /* Step 1: overwrite Ab with Aa Bb.
+     * This requires that first Bb is copied to Ab. */
+    a += a_dim0*part1; // point at Ab (column offset part1)
+    b += b_dim0*part1; // point at Bb
+    int i = -1; // column counter for the block copy below
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: memcpy %d\n", size);
+#endif
+    while (++i<part2) // NOTE(review): cudaMemcpy return values are ignored — TODO confirm acceptable
+        cudaMemcpy( a + i*a_dim0, b + i*b_dim0, part1 * sizeof(complex_type_cuda), cudaMemcpyDeviceToDevice );
+    a -= a_dim0*part1;
+    b -= b_dim0*part1;
+
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: step 1 %d\n", size);
+#endif
+    cublasStatus_t status = cu_trmm(
+            handle,
+            CUBLAS_SIDE_LEFT, // order: this means B := A B
+            CUBLAS_FILL_MODE_LOWER, // A is a lower triangular matrix
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+            part1, // rows of B (int)
+            part2, // columns of B (int)
+            &one, // global prefactor
+            a, // matrix A = Aa (lower triangular)
+            a_dim0, // first dimension of A (int)
+            a + a_dim0*part1, // matrix B = Ab (currently holds the copy of Bb)
+            a_dim0,  // first dimension of B (int)
+            a + a_dim0*part1, // matrix C (in place)
+            a_dim0  // first dimension of C (int)
+            );
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 2: overwrite Aa with Aa Ba */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: step 2 %d\n", size);
+#endif
+    status = multiply_LU_cuda_worker(handle, part1, a, a_dim0, b, b_dim0); // recurse on top-left blocks
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 3: overwrite Ad with Ad Bd */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: step 3 %d\n", size);
+#endif
+    status = multiply_LU_cuda_worker(handle, part2, a + (a_dim0+1)*part1, a_dim0, b + (b_dim0+1)*part1, b_dim0); // recurse on bottom-right blocks
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 4: add Ac Bb to Ad */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: step 4 %d\n", size);
+#endif
+    status = cu_gemm(
+            handle,
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_OP_N, // B is not modified (no adjoint or transpose)
+            part2, // rows of A (int)
+            part2, // columns of B (int)
+            part1, // columns of A = rows of B (int)
+            &one, // global prefactor
+            a + part1, // matrix A = Ac
+            a_dim0, // first dimension of A (int)
+            b + part1*b_dim0, // matrix B = Bb
+            b_dim0,  // first dimension of B (int)
+            &one, // weight of C
+            a + (a_dim0+1)*part1, // matrix C = Ad
+            a_dim0 // first dimension of C
+            );
+    if (status != CUBLAS_STATUS_SUCCESS)
+        return status;
+
+    /* Step 5: overwrite Ac with Ac Ba */
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: step 5 %d\n", size);
+#endif
+    status = cu_trmm(
+            handle,
+            CUBLAS_SIDE_RIGHT, // order: this means B := B A
+            CUBLAS_FILL_MODE_UPPER, // A is an upper triangular matrix
+            CUBLAS_OP_N, // A is not modified (no adjoint or transpose)
+            CUBLAS_DIAG_NON_UNIT, // A is not unit triangular
+            part2, // rows of B (int)
+            part1, // columns of B (int)
+            &one, // global prefactor
+            b, // matrix A = Ba (upper triangular)
+            b_dim0, // first dimension of A (int)
+            a + part1, // matrix B = Ac
+            a_dim0,  // first dimension of B (int)
+            a + part1, // matrix C (in place)
+            a_dim0  // first dimension of C (int)
+            );
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda_worker: done %d\n", size);
+#endif
+    return status;
+}
+
+int multiply_LU_cuda(
+        const int size, // A and B have shape (size, size)
+        void *a, // lower triangular matrix A (column major, host memory); overwritten with AB
+        const int a_dim0, // first dimension of A, must be >= size
+        const void *b, // upper triangular matrix B (column major, host memory)
+        const int b_dim0 // first dimension of B, must be >= size
+        )
+{
+#ifdef DEBUG
+    fprintf(stderr, "Entering multiply_LU_cuda %d %d %d\n", size, a_dim0, b_dim0);
+#endif
+    if (size < 0 || a_dim0 < size || b_dim0 < size)
+        return 1;
+    complex_type_cuda *dev_a = NULL, *dev_b = NULL;
+    cudaError_t cuda_err;
+    cublasHandle_t handle = NULL; // stays NULL until cublasCreate succeeds
+    cublasStatus_t status;
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: allocating device memory\n");
+#endif
+    cuda_err = cudaMalloc(&dev_a, (size_t)size*size*sizeof(complex_type_cuda)); // size_t cast avoids int overflow
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: GPU memory allocation (matrix A)\n");
+        return 1;
+    }
+    cuda_err = cudaMalloc(&dev_b, (size_t)size*size*sizeof(complex_type_cuda)); // size_t cast avoids int overflow
+    if (cuda_err != cudaSuccess)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: GPU memory allocation (matrix B)\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: creating handle\n");
+#endif
+    status = cublasCreate(&handle);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: cublas initialization\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: copying data to device\n");
+#endif
+    status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), a, a_dim0, dev_a, size);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: set matrix (matrix A)\n");
+        goto cuda_error;
+    }
+    status = cublasSetMatrix(size, size, sizeof(complex_type_cuda), b, b_dim0, dev_b, size);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: set matrix (matrix B)\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: calling worker\n");
+#endif
+    if (multiply_LU_cuda_worker(handle, size, dev_a, size, dev_b, size)) // nonzero cublasStatus_t means failure
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: worker\n");
+        goto cuda_error;
+    }
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: copying data to host\n");
+#endif
+    status = cublasGetMatrix(size, size, sizeof(complex_type_cuda), dev_a, size, a, a_dim0);
+    if (status != CUBLAS_STATUS_SUCCESS)
+    {
+        fprintf(stderr, "UNHANDLED ERROR in multiply_LU_cuda: get matrix\n");
+        goto cuda_error;
+    }
+
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: cleaning up\n");
+#endif
+    cudaFree(dev_a);
+    cudaFree(dev_b);
+    cublasDestroy(handle);
+#ifdef DEBUG
+    fprintf(stderr, "multiply_LU_cuda: done\n");
+#endif
+    return 0;
+
+cuda_error:
+    if (dev_a)
+        cudaFree(dev_a);
+    if (dev_b)
+        cudaFree(dev_b);
+    if (handle) // only destroy a handle that was actually created
+        cublasDestroy(handle);
+    return 1;
+}
+
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/rtrg_cublas.c b/rtrg_cublas.c
index 1247ab051aa4312152461d568bd97040fc6c8dc4..1094434dc01906227a9ee7a8483fb27f3bfa9c01 100644
--- a/rtrg_cublas.c
+++ b/rtrg_cublas.c
@@ -71,6 +71,22 @@ extern char cuda_trmm(
         const int b_dim0
         );
 
+extern int multiply_UL_cuda(
+        const int size,
+        void *a,
+        const int a_dim0,
+        const void *b,
+        const int b_dim0
+        );
+
+extern int multiply_LU_cuda(
+        const int size,
+        void *a,
+        const int a_dim0,
+        const void *b,
+        const int b_dim0
+        );
+
 
 static const complex_type zero = 0.;
 static const complex_type one = 1.;
@@ -780,7 +796,7 @@ static int multiply_LU_inplace(
         trmm(
                 CblasColMajor, // layout
                 CblasRight, // order: this means B := B A
-                CblasUpper, // A is a lower triangular matrix
+                CblasUpper, // A is an upper triangular matrix
                 CblasNoTrans, // A is not modified (no adjoint or transpose)
                 CblasNonUnit, // A is not unit triangular
                 size, // rows of B (int)
@@ -823,7 +839,7 @@ static int multiply_LU_inplace(
      * This requires that first Bb is copied to Ab. */
     a += a_dim0*part1;
     b += b_dim0*part1;
-    int i=-1;
+    int i = -1;
     while (++i<part2)
         memcpy( a + i*a_dim0, b + i*b_dim0, part1 * sizeof(complex_type) );
     a -= a_dim0*part1;
@@ -1564,13 +1580,22 @@ static int multiply_extended_worker(
 #ifdef DEBUG
     fprintf(stderr, "Calling multiply_UL_inplace for extrapolated matrices\n");
 #endif
-    if (multiply_UL_inplace(
-            cutoff, // size
-            auxarr_left, // matrix A
-            aux_dim0,  // first dimension of A (int)
-            auxarr_right, // matrix B
-            aux_dim0 // first dimension of B (int)
-            ))
+    if (cutoff < LU_ON_GPU_THRESHOLD
+            ? multiply_UL_inplace(
+                cutoff, // size
+                auxarr_left, // matrix A
+                aux_dim0,  // first dimension of A (int)
+                auxarr_right, // matrix B
+                aux_dim0 // first dimension of B (int)
+                )
+            : multiply_UL_cuda(
+                cutoff, // size
+                auxarr_left, // matrix A
+                aux_dim0,  // first dimension of A (int)
+                auxarr_right, // matrix B
+                aux_dim0 // first dimension of B (int)
+                )
+            )
         return 1;
 
     /* Add result to top left of output array. */
@@ -1599,13 +1624,22 @@ static int multiply_extended_worker(
 #ifdef DEBUG
     fprintf(stderr, "Calling multiply_LU_inplace for extrapolated matrices\n");
 #endif
-    if (multiply_LU_inplace(
-            cutoff,
-            auxarr_left,
-            aux_dim0,
-            auxarr_right,
-            aux_dim0
-            ))
+    if (cutoff < LU_ON_GPU_THRESHOLD
+            ? multiply_LU_inplace(
+                cutoff,
+                auxarr_left,
+                aux_dim0,
+                auxarr_right,
+                aux_dim0
+                )
+            : multiply_LU_cuda(
+                cutoff,
+                auxarr_left,
+                aux_dim0,
+                auxarr_right,
+                aux_dim0
+                )
+            )
         return 1;
 
     /* Add result to bottom right of output array. */
@@ -2108,13 +2142,22 @@ static int multiply_symmetric_worker(
     if (i)
         return i;
 
-    if (multiply_UL_inplace(
-            cutoff,
-            auxmatrixl,
-            aux_dim0,
-            auxmatrixr,
-            aux_dim0
-            ))
+    if (cutoff < LU_ON_GPU_THRESHOLD
+            ? multiply_UL_inplace(
+                cutoff,
+                auxmatrixl,
+                aux_dim0,
+                auxmatrixr,
+                aux_dim0
+                )
+            : multiply_UL_cuda(
+                cutoff,
+                auxmatrixl,
+                aux_dim0,
+                auxmatrixr,
+                aux_dim0
+                )
+            )
         return 1;
 
     /* Add result to top left of output array. */
diff --git a/rtrg_cublas.h b/rtrg_cublas.h
index 7b26ec31832695584b71db612fda165c35b1a74b..1f7426cd8e6f49f71aed2f62da7046e2aebb9372 100644
--- a/rtrg_cublas.h
+++ b/rtrg_cublas.h
@@ -29,6 +29,8 @@
  * left matrix is also triangular. Otherwise the problem is recursively
  * subdivided. */
 #define TRIANGULAR_OPTIMIZE_THRESHOLD 128
+#define TRIANGULAR_OPTIMIZE_THRESHOLD_GPU 128
+#define LU_ON_GPU_THRESHOLD 768
 
 #define THRESHOLD_GEMM_GPU 512
 #define THRESHOLD_TRMM_GPU 768